In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [42]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import math

# Data processing

In [3]:
# Reading users file
u_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users = pd.read_csv('/content/drive/MyDrive/Colab/ML/users.dat', sep='::', names=u_cols, encoding='latin-1', engine='python')
n_users = users.shape[0]
n_users

6040

In [4]:
# Reading ratings file
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('/content/drive/MyDrive/Colab/ML/ratings.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')
ratings = ratings_base.to_numpy()
rate_train, rate_test = train_test_split(ratings, test_size=0.1, stratify=ratings[:,0])
ratings

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [29]:
mean = np.array([np.mean(ratings[:,2][ratings[:,0]==1]) for i in range(n_users)])

In [5]:
# Reading movies file
i_cols = ['movie id', 'movie title', 'genres']
movies = pd.read_csv('/content/drive/MyDrive/Colab/ML/movies.dat', sep='::', names=i_cols, encoding='latin-1', engine='python')
genres_columns = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres_columns], axis=1)
movies = movies.drop('genres', axis=1)
movies

Unnamed: 0,movie id,movie title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


Handle missing data

In [6]:
#Find and insert missing data to the dataframe with: movie title = 'XXX' and all values in genre column = '0'
existing_ids = movies['movie id'].tolist()
missing_ids = [id for id in range(1, max(existing_ids) + 1) if id not in existing_ids]
missing_data = pd.DataFrame({'movie id': missing_ids, 'movie title': 'XXX'})
missing_data = pd.concat([missing_data, pd.DataFrame(0, index=np.arange(len(missing_data)), columns=genres_columns.columns)], axis=1)
movies = pd.concat([movies, missing_data], ignore_index=True)


# Prediction


In [8]:
#Since we rely on the movie genre to build our profile, we'll only be interested in the 18 binary values at the end of each row:
genre = 18
X0 = movies.to_numpy()
X_train_counts = X0[:, -genre:]
X_train_counts

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [9]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()


def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:,0]
    ids = np.where(y == user_id +1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]

    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_


In [10]:
#Predicted scores
Yhat = tfidf.dot(W) + b

# Evaluate model

In [14]:
def RMSE(Yhat, rates):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred
        se += (e*e).sum(axis = 0)
        cnt += e.size
    return math.sqrt(se/cnt)

print('RMSE for training:', RMSE(Yhat, rate_train))
print('RMSE for test    :', RMSE(Yhat, rate_test))


RMSE for training: 0.9696753246095215
RMSE for test    : 1.1040514088605575


In [19]:
def MAE(Yhat, rates):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = abs(scores_truth - scores_pred )
        se += e.sum(axis = 0)
        cnt += e.size
    return math.sqrt(se/cnt)

print('MAE for training:', MAE(Yhat, rate_train))
print('MAE for test    :', MAE(Yhat, rate_test))

MAE for training: 0.874263504067169
MAE for test    : 0.9347834659234732


In [44]:
tp=0
tp_fp=0
tp_fn=0
para = 0.05
DCG = []
IDCG = sorted(rate_test[:,2])[::-1]
for r in rate_test:
    trueRate = r[2]-mean[r[0]-1]
    predictRate = Yhat[r[1]-1,r[0]-1]-mean[r[0]-1]
    DCG.append(predictRate)
    if predictRate>para:
        tp_fp+=1
    if trueRate>para:
        tp_fn += 1
        if predictRate>para:
            tp+=1
precision = tp/tp_fp
recall = tp/tp_fn
DCG = sorted(DCG)[::-1]
NDCG=(ndcg_score([IDCG[:10]], [DCG[:10]]))
print('Precision    :', precision)
print('Reacll       :', recall)
print('NDCG         :', NDCG)

Precision    : 0.40826538176426985
Reacll       : 0.19438807023735993
NDCG         : 0.9999999999999997
