In [1]:
import os
import re
import pandas as pd
import numpy as np
import nltk
import pickle
import parent_modules
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

from definitions import *

ratings_dataset_path = os.path.join(DATA_DIR, "ratings.csv")
answers_dataset_path = os.path.join(DATA_DIR, "answers.csv")
movies_dataset_path = os.path.join(DATA_DIR, "movies.csv")


ratings_df = pd.read_csv(ratings_dataset_path)
answers_df = pd.read_csv(answers_dataset_path)
movies_df = pd.read_csv(movies_dataset_path)

ratings_movies_df = pd.merge(ratings_df, movies_df, on='movieId', how="inner")
answers_movies_df = pd.merge(answers_df, movies_df, on='movieId', how="outer")


In [2]:
# Preprocessing over datasets
# Finding the users with multiple reviews
ratings_movies_df.drop(columns="timestamp")
grouped_user_ratings = ratings_movies_df.groupby("userId")
print(f"Starting data filtering shape{ratings_movies_df.shape}")

mean_user_ratings = grouped_user_ratings.count()["rating"].mean()
filtered_by_user = grouped_user_ratings.filter(lambda x: len(x["rating"]) >= 50)
print(f"1st data filtering shape{filtered_by_user.shape}")
grouped_movie_ratings = filtered_by_user.groupby("movieId")
mean_movies_ratings = grouped_movie_ratings.count()["rating"].mean()
final_ratings_movies = grouped_movie_ratings.filter(lambda x: len(x["rating"]) >= mean_movies_ratings)
print(f"2nd data filtering shape{final_ratings_movies.shape}")


Starting data filtering shape(100836, 6)
1st data filtering shape(93812, 6)
2nd data filtering shape(74253, 6)


In [3]:
print(c.groupby("movieId").count().shape, ratings_movies_df.groupby("movieId").count().shape)

(2221, 5) (9724, 5)


In [3]:
from scipy.sparse import csr_matrix
# pivot ratings into movie features
df_movie_features = final_ratings_movies.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)
mat_movie_features = csr_matrix(df_movie_features.values)

In [4]:
# Jaccard Similarity

# convert dataframe to zero - one format
df_movie_features_zero_one = df_movie_features.copy(deep=True)
df_movie_features_zero_one[df_movie_features_zero_one > 0.5] = 1
df_movie_features_zero_one = df_movie_features_zero_one.astype(int)

jac_sim_matrix = 1 - pairwise_distances(df_movie_features_zero_one.values.T, metric = "jaccard")

# optionally convert it to a DataFrame
jac_sim = pd.DataFrame(
    jac_sim_matrix,
    index=df_movie_features_zero_one.columns,
    columns=df_movie_features_zero_one.columns
)
jac_sim.head()



movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.301887,0.142857,0.12973,0.248826,0.132979,0.039548,0.296804,0.203046,0.067797,...,0.027778,0.076923,0.069892,0.038889,0.065934,0.045198,0.05,0.027933,0.033898,0.027933
2,0.301887,1.0,0.2,0.161017,0.220126,0.128,0.027523,0.325,0.178571,0.073394,...,0.027273,0.097345,0.104348,0.036036,0.119266,0.046296,0.083333,0.018182,0.037383,0.046729
3,0.142857,0.2,1.0,0.222222,0.165217,0.246154,0.130435,0.169231,0.129032,0.1875,...,0.0,0.04918,0.046875,0.018519,0.050847,0.039216,0.055556,0.0,0.02,0.0
5,0.12973,0.161017,0.222222,1.0,0.154545,0.298246,0.097561,0.150794,0.166667,0.086957,...,0.022222,0.055556,0.052632,0.021277,0.037736,0.022222,0.041667,0.0,0.023256,0.0
6,0.248826,0.220126,0.165217,0.154545,1.0,0.149123,0.0625,0.311688,0.192308,0.080808,...,0.03,0.096154,0.083333,0.029412,0.076923,0.040404,0.07,0.0,0.020202,0.030303


In [5]:
# Pearson Similarity
pearson_sim = df_movie_features.corr(method="pearson")
pearson_sim.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.178983,0.136128,0.153725,0.122948,0.099621,0.112117,0.185387,0.144952,0.149678,...,-0.016475,0.079961,0.028192,0.018223,0.057158,0.061225,0.057341,-0.010076,0.051466,0.014426
2,0.178983,1.0,0.173598,0.17939,0.101651,0.104321,-0.023199,0.258169,0.124763,0.0826,...,0.009875,0.129452,0.145067,0.022873,0.219752,0.048388,0.154305,-0.033434,0.110026,0.117877
3,0.136128,0.173598,1.0,0.359519,0.1224,0.373282,0.254815,0.129034,0.164854,0.202242,...,-0.055394,-0.008482,-0.016528,-0.043928,-0.004138,0.04574,0.033977,-0.050395,0.024357,-0.053386
5,0.153725,0.17939,0.359519,1.0,0.135922,0.458714,0.216356,0.144774,0.20356,0.117635,...,0.020997,0.006304,0.002984,-0.003354,-0.022626,-0.003864,0.002731,-0.045835,-0.001546,-0.048556
6,0.122948,0.101651,0.1224,0.135922,1.0,0.100664,0.138454,0.254995,0.15915,0.131322,...,0.017029,0.127574,0.083804,0.029891,0.10694,0.073934,0.124326,-0.082322,0.000549,0.028282


In [6]:
# Cosine Similarity
cos_sim_matrix = 1 - pairwise_distances(df_movie_features.values.T, metric = "cosine")
# optionally convert it to a DataFrame
cos_sim = pd.DataFrame(
    cos_sim_matrix,
    index=df_movie_features_zero_one.columns,
    columns=df_movie_features_zero_one.columns
)
cos_sim.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.446971,0.305854,0.302137,0.396841,0.271328,0.183813,0.46352,0.358906,0.225869,...,0.095783,0.213547,0.184635,0.130367,0.1903,0.152449,0.169952,0.090992,0.137871,0.114891
2,0.446971,1.0,0.301151,0.294109,0.317119,0.237974,0.056544,0.452402,0.295874,0.157833,...,0.090493,0.226755,0.246286,0.108297,0.298076,0.122652,0.228011,0.046223,0.169823,0.179888
3,0.305854,0.301151,1.0,0.418275,0.253466,0.433397,0.287134,0.269804,0.268005,0.244136,...,0.0,0.066637,0.063582,0.015093,0.067422,0.094188,0.092896,0.0,0.070658,0.0
5,0.302137,0.294109,0.418275,1.0,0.253221,0.50634,0.248879,0.269911,0.293094,0.161519,...,0.067381,0.074178,0.075035,0.048315,0.044274,0.043489,0.058785,0.0,0.042411,0.0
6,0.396841,0.317119,0.253466,0.253221,1.0,0.229346,0.193095,0.441042,0.316268,0.197054,...,0.093126,0.221449,0.190364,0.110655,0.19921,0.141733,0.199744,0.0,0.072779,0.100128


In [7]:
# Cosine Similarity
ham_sim_matrix = 1 - pairwise_distances(df_movie_features.values.T, metric = "hamming")
# optionally convert it to a DataFrame
ham_sim = pd.DataFrame(
    ham_sim_matrix,
    index=df_movie_features_zero_one.columns,
    columns=df_movie_features_zero_one.columns
)
ham_sim.head()

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.490909,0.524675,0.532468,0.488312,0.519481,0.537662,0.462338,0.503896,0.537662,...,0.537662,0.532468,0.52987,0.537662,0.532468,0.542857,0.535065,0.532468,0.537662,0.532468
2,0.490909,1.0,0.703896,0.701299,0.615584,0.690909,0.714286,0.631169,0.651948,0.719481,...,0.714286,0.709091,0.703896,0.714286,0.716883,0.716883,0.719481,0.714286,0.719481,0.724675
3,0.524675,0.703896,1.0,0.854545,0.711688,0.838961,0.880519,0.667532,0.758442,0.880519,...,0.85974,0.841558,0.833766,0.857143,0.846753,0.864935,0.857143,0.862338,0.862338,0.862338
5,0.532468,0.701299,0.854545,1.0,0.724675,0.867532,0.893506,0.680519,0.781818,0.883117,...,0.880519,0.857143,0.849351,0.875325,0.85974,0.880519,0.872727,0.880519,0.880519,0.880519
6,0.488312,0.615584,0.711688,0.724675,1.0,0.719481,0.753247,0.649351,0.685714,0.745455,...,0.74026,0.732468,0.72987,0.737662,0.74026,0.742857,0.74026,0.735065,0.74026,0.742857


In [12]:
final_ratings_movies[['userId', 'movieId', 'rating']]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5
5,18,1,3.5
6,19,1,4.0
7,21,1,3.5
8,27,1,3.0
9,31,1,5.0
10,32,1,3.0


In [13]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset  UserID::MovieID::Rating::Timestamp
#data = Dataset.load_builtin('ml-100k')
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(final_ratings_movies[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.15)

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

# run the trained model against the testset
test_pred = algo.test(testset)

# get RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("Item-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Item-based Model : Test Set
RMSE: 0.8236
Item-based Model : Training Set
RMSE: 0.3306


0.33058054608331217

In [16]:
test_pred


[Prediction(uid=294, iid=3263, r_ui=1.0, est=2.5853429852198397, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=91, iid=2054, r_ui=2.5, est=2.684193083174844, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=610, iid=87222, r_ui=3.5, est=3.700300633052635, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=527, iid=1017, r_ui=4.0, est=4.02186135547585, details={'actual_k': 47, 'was_impossible': False}),
 Prediction(uid=68, iid=1391, r_ui=3.0, est=2.378602977984614, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=68, iid=33004, r_ui=3.0, est=2.992182792273708, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=249, iid=8641, r_ui=4.5, est=4.182844494121231, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=282, iid=5810, r_ui=4.0, est=3.5066470453239464, details={'actual_k': 50, 'was_impossible': False}),
 Prediction(uid=517, iid=587, r_ui=4.0, est=2.942977630072362, details={