<a href="https://colab.research.google.com/github/evaisherexd/dissertation-RS/blob/main/modeling_and_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ml_metrics
!pip install implicit

In [None]:
import implicit
import ml_metrics as mm
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import train_test_split

# Load the interaction table and discard the two columns that are not used
# by the rest of the notebook.
inter_df = pd.read_csv('drive/MyDrive/df.csv')
inter_df.drop(columns=['Unnamed: 0', 'mlogViewTime'], inplace=True)

In [None]:
# Evaluation metrics

# recall
def recall(actual, predicted):
    """
    Fraction of the relevant items that appear among the predictions.

    Parameters
    ----------
    actual : sequence of hashable
        relevant (ground-truth) items; duplicates are counted once

    predicted : sequence of hashable
        recommended items

    Returns
    -------
    recall: float, 0.0 ~ 1.0
        0.0 when `actual` is empty
    """
    # Work on the distinct relevant items so that the numerator (a set
    # intersection) and the denominator count the same things.
    relevant = set(actual)
    if not relevant:
        # nothing to retrieve: avoid dividing by zero
        return 0.0
    return len(relevant & set(predicted)) / len(relevant)


# precision
def precision(actual, predicted):
    """
    Fraction of the predictions that are relevant items.

    Parameters
    ----------
    actual : sequence of hashable
        relevant (ground-truth) items

    predicted : sequence of hashable
        recommended items

    Returns
    -------
    precision: float, 0.0 ~ 1.0
        0.0 when `predicted` is empty
    """
    if len(predicted) == 0:
        # no recommendation was made: avoid dividing by zero
        return 0.0
    tp = len(set(actual) & set(predicted))
    return tp / len(predicted)


# nDCG, the closer to 1, the better
def dcg_at_k(score, k=None):
    """
    Compute the discounted cumulative gain (DCG) of a ranked score list.

    The item at 0-based position i is discounted by log2(i + 2), so the
    first item is not discounted at all.

    Parameters
    ----------
    score : 1d nd.array
        relevance scores, in ranked order

    k : int, default None
        only the first k scores are taken into account,
        None takes all of them

    Returns
    -------
    dcg: float
    """
    top_scores = score if k is None else score[:k]
    ranks = np.arange(len(top_scores)) + 2
    return np.sum(top_scores / np.log2(ranks))
  
def ndcg_at_k(score, k=None):
    """
    Compute the normalized discounted cumulative gain (nDCG).

    The DCG of `score` in its given order is divided by the DCG of the
    same scores sorted in descending order (the best possible ranking).

    Parameters
    ----------
    score : 1d nd.array
        relevance scores, in ranked order

    k : int, default None
        only the first k scores of each ranking are taken into account,
        None takes all of them

    Returns
    -------
    ndcg: float, 0.0 ~ 1.0
        0 when the best possible DCG is 0
    """
    achieved = dcg_at_k(score, k)
    # the ideal ranking is built from all scores, then cut at k
    ideal = dcg_at_k(np.sort(score)[::-1], k)
    return achieved / ideal if ideal != 0 else 0

In [None]:
# set a weighting scheme for all user behaviours
w_isClick = 10
w_isViewComment = 0
w_isLike = 0
w_isComment = 0
w_share = 0
w_homepage = 10

# calculate a pseudo score for each impression: the weighted sum of the
# behaviour columns of that impression
inter_df['score'] = (
    inter_df['isClick'] * w_isClick
    + inter_df['isViewComment'] * w_isViewComment
    + inter_df['isLike'] * w_isLike
    + inter_df['isComment'] * w_isComment
    + inter_df['isShare'] * w_share
    + inter_df['isIntoPersonalHomepage'] * w_homepage
)

# select only necessary columns
score_df = inter_df[["userId", "mlogId", 'score']]

# drop duplicated (user, item) rows; the explicit copy makes score_df an
# independent frame, so the column assignments below do not act on a slice
# of inter_df (which triggers pandas' SettingWithCopyWarning)
score_df = score_df.drop_duplicates(subset=['userId', 'mlogId']).copy()

# keep the raw user ids as the stratification labels
y = score_df['userId']

# encode user/item ids as integer category codes, since they are used later
# as row/column indices of a sparse matrix
score_df['userId'] = score_df['userId'].astype('category')
score_df['mlogId'] = score_df['mlogId'].astype('category')
score_df['user_Id'] = score_df['userId'].cat.codes
score_df['mlog_Id'] = score_df['mlogId'].cat.codes

# split data into training/test set
# using stratified sampling on the user id so that the impressions of each
# user are spread over both data sets
X_train, X_test, y_train, y_test = train_test_split(
    score_df, y, test_size=0.3, stratify=y, random_state=10
)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# only impressions with a non-zero score are kept in the test set
X_test = X_test[X_test['score'] != 0]

# build the user-item matrix from the training rows, indexed by the integer
# category codes (a csr matrix only accepts numerical indices)
# the shape is pinned to the full set of user/item codes: if it were inferred
# from X_train alone, a code that only occurs in the test split could fall
# outside the matrix
n_users = len(score_df['userId'].cat.categories)
n_items = len(score_df['mlogId'].cat.categories)
# NOTE(review): X_train still contains rows whose score is 0, so they are
# passed to the matrix as explicit entries - confirm that this is intended
ui_matrix = sparse.csr_matrix(
    (X_train['score'], (X_train['user_Id'], X_train['mlog_Id'])),
    shape=(n_users, n_items),
)

# assign alpha and the number of latent factors to the model
alpha = 10
factor = 100

# train ALS model
model = implicit.als.AlternatingLeastSquares(
    use_gpu=True,
    factors=factor,
    alpha=alpha,
    iterations=10,
    random_state=23,
)
data_conf = ui_matrix.astype('double')
model.fit(data_conf)

# make recommendations and calculate evaluation metrics for the model
reca = 0   # running sum of the per-user recall
k = 0      # number of evaluated users (not the ranking cut-off)
n_dcg = 0  # running sum of the per-user nDCG
N = 100    # number of items recommended to each user

# collect the held-out items of every user in one pass, instead of
# re-filtering the whole test frame for each user inside the loop
actual_by_user = X_test.groupby('user_Id')['mlog_Id'].agg(list).to_dict()

for i in X_test['user_Id'].unique():
    recommendations = model.recommend(i, data_conf[i], N=N)
    predicted_item = recommendations[0].tolist()
    actual_item = actual_by_user[i]
    reca += recall(actual_item, predicted_item)

    # binary relevance of each recommended item, in ranked order
    relevant = set(actual_item)
    score = [1 if j in relevant else 0 for j in predicted_item]

    # evaluate nDCG at the same cut-off as the recommendation list
    n_dcg += ndcg_at_k(score, N)
    k += 1

# export results to a log
# NOTE: csv is not used by the code visible here; the import is kept as-is
import csv

# average the accumulated metrics over the evaluated users; report nan
# instead of raising ZeroDivisionError when no user was evaluated
mean_recall = reca / k if k else float('nan')
mean_ndcg = n_dcg / k if k else float('nan')

# append, so that the runs of different parameter settings accumulate
with open('drive/MyDrive/out.log', 'a', encoding='utf-8') as f:
    print(f'a={alpha}, w_isClick={w_isClick}, w_isViewComment={w_isViewComment}, w_isLike={w_isLike}, w_isComment={w_isComment}, w_share={w_share}, w_homepage={w_homepage}, f={factor}, r={N}', file=f)
    print('recall:', mean_recall, ',ndcg:', mean_ndcg, file=f)


  0%|          | 0/10 [00:00<?, ?it/s]