In [1]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import random
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('./filtered_data.csv', header=0, sep=',')

In [3]:
data.shape

(26077009, 9)

In [4]:
data.head()

Unnamed: 0,movieId,userId,rating,description,genre,director,actors,country,original_title
0,30,900816,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
1,30,1990901,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
2,30,1402412,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
3,30,1601783,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
4,30,306466,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give


In [5]:
genre_df = data[['movieId','genre']].drop_duplicates().reset_index(drop=True)
genre_df.head()

Unnamed: 0,movieId,genre
0,30,"Comedy, Drama, Romance"
1,77,"Action, Adventure, Mystery"
2,83,"Biography, Drama, History"
3,108,"Action, Crime, Drama"
4,118,"Action, Adventure, Thriller"


In [6]:
genres = set()
for i in range(genre_df.shape[0]):
    genres = genres.union(set(genre_df['genre'][i].split(', ')))

In [7]:
index_genres = dict(zip(range(len(genres)), list(genres)))
genres_index = dict(zip(list(genres), range(len(genres))))
index_genres

{0: 'Biography',
 1: 'Music',
 2: 'Thriller',
 3: 'Mystery',
 4: 'Western',
 5: 'Sport',
 6: 'War',
 7: 'Musical',
 8: 'Crime',
 9: 'Action',
 10: 'Sci-Fi',
 11: 'Comedy',
 12: 'Drama',
 13: 'Film-Noir',
 14: 'Horror',
 15: 'Family',
 16: 'Animation',
 17: 'Romance',
 18: 'History',
 19: 'Fantasy',
 20: 'Adventure'}

In [8]:
len(genres)

21

In [9]:
data.head()

Unnamed: 0,movieId,userId,rating,description,genre,director,actors,country,original_title
0,30,900816,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
1,30,1990901,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
2,30,1402412,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
3,30,1601783,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
4,30,306466,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give


In [10]:
sample = data[['movieId','userId','rating','genre']].sample(frac=0.1, ignore_index=True)
sample.shape

(2607701, 4)

In [11]:
sample.isna().any()

movieId    False
userId     False
rating     False
genre      False
dtype: bool

In [12]:
# a dictionary that stores its feature vectors for each movie
movie_vector = {}
for i in range(genre_df.shape[0]):
    y = np.zeros((len(genres),))

    movie_genre = genre_df['genre'][i].split(', ')
    genre_len = len(movie_genre)
    coef = round(1/genre_len,2)
    for g in movie_genre:
        y[genres_index[g]] = coef

    movie_vector[genre_df['movieId'][i]]  = y

In [13]:
moviesPerUser = defaultdict(set)
usersPerMovie = defaultdict(set)
movieRatings = {}

for i in range(data.shape[0]):
    movie = data['movieId'][i]
    user = data['userId'][i]
    moviesPerUser[user].add(movie)
    usersPerMovie[movie].add(user)
    movieRatings[user,movie] = data['rating'][i]

In [14]:
movie_index = dict(zip(list(usersPerMovie),range(len(usersPerMovie))))
index_movie = dict(zip(range(len(usersPerMovie)), list(usersPerMovie)))
user_index = dict(zip(list(moviesPerUser),range(len(moviesPerUser))))
index_user = dict(zip(range(len(moviesPerUser)), list(moviesPerUser)))

In [15]:
random_user = random.choice(list(moviesPerUser.keys()))

In [16]:
watched_movies = moviesPerUser[random_user]
X_train = list(watched_movies)[:round(len(watched_movies)*0.8)]
X_test = list(watched_movies)[round(len(watched_movies)*0.8):]

In [17]:
X_train_all, y_train_all = [], []
for m in X_train:   
    X_train_all.append(movie_vector[m])
    y_train_all.append(movieRatings[random_user,m])

X_train_all = np.array(X_train_all)
y_train_all = np.array(y_train_all)

assert len(X_train_all) == len(y_train_all)

X_test_all, y_test_all = [], []

for m in X_test:   
    X_test_all.append(movie_vector[m])
    y_test_all.append(movieRatings[random_user,m])

assert len(X_test_all) == len(y_test_all)


In [18]:
len(X_test_all)

127

In [19]:
reg = Ridge(alpha=1.0)
reg.fit(X_train_all, y_train_all)

Ridge()

In [20]:
reg.score(X_train_all, y_train_all)

0.06678660415199333

In [21]:
y_pred = reg.predict(X_test_all)
y_pred

array([4.77083073, 4.77083073, 4.74526374, 4.74612143, 4.6656517 ,
       4.68998299, 4.78457566, 4.72810859, 4.78378824, 4.85360301,
       4.74612143, 4.64270396, 4.64270396, 4.86082014, 4.75670612,
       5.03057511, 4.74612143, 4.55798993, 4.88948935, 4.79133693,
       5.0160759 , 4.92331539, 4.52679119, 4.906685  , 4.83985808,
       4.6656517 , 4.93874836, 4.84842102, 4.4676635 , 4.52679119,
       4.91618474, 4.69651612, 4.74526374, 4.74612143, 4.73303039,
       4.97152475, 4.87840556, 4.6656517 , 4.84868121, 4.68905886,
       4.95012047, 4.95603011, 4.85190973, 4.81362496, 4.9177874 ,
       4.79133693, 4.65644889, 4.81362496, 4.9177874 , 4.77083073,
       4.85587829, 4.9432267 , 4.84715739, 5.03057511, 4.71903062,
       4.85190973, 4.85428804, 4.84842102, 4.53236571, 4.4676635 ,
       4.95974111, 4.85587829, 4.9432267 , 4.98096534, 4.8985752 ,
       4.6656517 , 4.74526374, 4.9473416 , 4.6385845 , 4.95473222,
       5.01455957, 4.53236571, 4.41047766, 4.95473222, 4.79133

In [22]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test_all, y_pred)

0.453947510799744

In [23]:
cnt = 0
MSE = []
test_size = 0
predictors = {}
for u in moviesPerUser:
    X,y = [], []
    watched_movies = moviesPerUser[u]
    for m in watched_movies:
        X.append(movie_vector[m])
        y.append(movieRatings[u,m])

    assert len(X) == len(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    reg = Ridge(alpha=1.0)
    reg.fit(X_train, y_train)
    predictors[u] = reg
    y_pred = reg.predict(X_test)
    MSE.append(mean_squared_error(y_test, y_pred)*len(y_pred))
    test_size+= len(y_pred)

In [24]:
sum(MSE)/test_size

0.8732795726055811

In [25]:
sum(MSE)/test_size

0.8732795726055811

In [26]:
n_movies = len(usersPerMovie)
n_users = len(moviesPerUser)

# initialize the matrix size with all zeros
matrix = np.zeros((n_users,n_movies))
matrix.shape

(48033, 1478)

In [27]:
test_size = round(len(movieRatings.keys())*0.2)
test_sample = set(random.sample(list(movieRatings.keys()), test_size))

In [28]:
for u in moviesPerUser:
    movies = moviesPerUser[u]
    for m in movies:
        if (u,m) not in test_sample:
            i = user_index[u]
            j = movie_index[m]
            matrix[i][j] = movieRatings[u,m]

In [29]:
for i in range(matrix.shape[0]):
    if i % 500==0:
        print(i)
    u = index_user[i]
    for j in range(len(matrix[i])):
        if matrix[i][j]== 0:
            movie = index_movie[j]
            user_predictor = predictors[u]    
            y_pred = user_predictor.predict(np.array([movie_vector[movie]]))
            matrix[i][j] = y_pred

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000


In [30]:
len(matrix[0])

1478

In [59]:
import random
test_size = round(len(movieRatings.keys())*0.2)
test_sample = random.sample(list(movieRatings.keys()),test_size)

In [69]:
test_users = set()
for (u,m) in test_sample:
    test_users.add(u)

In [75]:
len(test_users)*0.0002

9.6066

In [78]:
test_size = round(len(test_users)*0.0002)
test_size
test_users = random.sample(list(test_users),test_size)

In [79]:
test_size

10

In [80]:
len(test_users)

10

In [81]:
test_size

10

In [82]:
from sklearn.metrics.pairwise import cosine_similarity

def cal_cosine_similarity(i_x,i_y,matrix):
    # find the corresponding rows in the matrix
    x = np.array([matrix[i_x]])
    y = np.array([matrix[i_y]])
    return float(cosine_similarity(x,y))

In [83]:
cnt = 0
similarity_dict = {}
for u in test_users:
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)
    similarity_list = []
    i = user_index[u]
    for j in range(matrix.shape[0]):
        if j%5000==0:
            print(cnt, j)
        if i != j:
            sim = cal_cosine_similarity(i,j,matrix)
            if sim > 0.5:
                similarity_list.append((sim,index_user[j]))
        
    similarity_dict[u] = sorted(similarity_list,reverse=True)[:5]


1 0
1 5000
1 10000
1 15000
1 20000
1 25000
1 30000
1 35000
1 40000
1 45000
2 0
2 5000
2 10000
2 15000
2 20000
2 25000
2 30000
2 35000
2 40000
2 45000
3 0
3 5000
3 10000
3 15000
3 20000
3 25000
3 30000
3 35000
3 40000
3 45000
4 0
4 5000
4 10000
4 15000
4 20000
4 25000
4 30000
4 35000
4 40000
4 45000
5 0
5 5000
5 10000
5 15000
5 20000
5 25000
5 30000
5 35000
5 40000
5 45000
6 0
6 5000
6 10000
6 15000
6 20000
6 25000
6 30000
6 35000
6 40000
6 45000
7 0
7 5000
7 10000
7 15000
7 20000
7 25000
7 30000
7 35000
7 40000
7 45000
8 0
8 5000
8 10000
8 15000
8 20000
8 25000
8 30000
8 35000
8 40000
8 45000
9 0
9 5000
9 10000
9 15000
9 20000
9 25000
9 30000
9 35000
9 40000
9 45000
10 0
10 5000
10 10000
10 15000
10 20000
10 25000
10 30000
10 35000
10 40000
10 45000


In [85]:
residual_total = 0
cnt = 0
for (u,m) in test_sample:
    if u in test_users:
        print(u)
        y_true = movieRatings[u,m]
        similarity_list = []
        watched_users = usersPerMovie[m]
        for su in watched_users:
            cs = cal_cosine_similarity(user_index[u], user_index[su],matrix)
            if cs > 0.5:
                similarity_list.append((cs, su))
        similarity_list = sorted(similarity_list, reverse=True)
        temp_sum = 0
        for sim_u in similarity_list:
            temp_sum += sim_u[0]* movieRatings[sim_u[1],m]
        y_pred = temp_sum/len(similarity_list)
        residual = (y_true-y_pred)**2
        residual_total += residual
        cnt += 1

2210387
666639
2064227
1365840
1365840
1365840
1275651
2064227
2064227
2210387
1275651
2064227
1365840
1545130
1365840
1275651
1275651
1545130
1365840
1365840
505260
2210387
1365840
1545130
1365840
1365840
180325
666639
1275651
479932
666639
2210387
1365840
479932
666639
479932
505260
1275651
666639
1275651
479932
2210387
1365840
505260
1365840
1545130
505260
1545130
1365840
1365840
1545130
1275651
180325
1365840
505260
505260
1365840
1275651
1275651
2210387
479932
479932
2210387
1365840
479932
180325
1365840
666639
1365840
666639
666639
1275651
666639
1365840
1365840
505260
505260
1545130
1545130
1545130
1275651
479932
479932
1365840
1275651
666639
505260
1365840
505260
1073838
2210387
1365840
2210387
1365840
479932
666639
1365840
1545130
1275651
479932
2210387
180325
1545130
2210387
1073838
666639
1365840
1365840
2210387
479932
1365840
666639
505260
1545130
1275651
505260
2210387
1545130
1545130
1073838
479932
1365840
1365840
479932
1365840
2064227
505260
1275651
180325
1365840
12756

In [87]:
residual_total/cnt

0.8591669422674685

In [None]:
len(test_users)

10

In [None]:
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

a = np.array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

