In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [7]:
select_movies = ratings.movieId.value_counts().head(1000).index.to_list()
movies = movies.loc[movies.movieId.isin(select_movies)]
ratings = ratings.loc[ratings.movieId.isin(select_movies)]

### USE Movie Metadata

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller


In [13]:
m=movies.copy()
m['genres']=m['genres'].str.split('|')
m=m.explode('genres')
m = m.pivot(index='movieId', columns='genres', values='title')
m = ~m.isna()
m = m.astype(int)
m

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109374,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
109487,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
111759,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
112852,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [14]:
def hem_dist(x,y):
    return sum(abs(x-y))

In [15]:
ranks = []

for query in m.index:
    for candidate in m.index:
        if candidate == query:
            continue
        ranks.append([query, candidate, hem_dist(m.loc[query], m.loc[candidate])])
ranks = pd.DataFrame(ranks, columns=['query', 'candidate', 'distance'])

In [17]:
ranks = ranks.merge(movies[['movieId', 'title']], left_on='query', right_on='movieId').rename(columns={'title': 'query_tittle'}).drop(columns=['movieId'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='candidate', right_on='movieId').rename(columns={'title': 'candidate_tittle'}).drop(columns=['movieId'])
ranks = ranks.sort_values(by=['query', 'distance'])
ranks.head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
539460,1,2294,0,Toy Story (1995),Antz (1998)
665334,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
792207,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
187812,1,673,1,Toy Story (1995),Space Jam (1996)
549450,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


In [18]:
ranks.loc[ranks['query']==2].head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
523477,2,2161,0,Jumanji (1995),"NeverEnding Story, The (1984)"
793207,2,4896,0,Jumanji (1995),Harry Potter and the Sorcerer's Stone (a.k.a. ...
912088,2,41566,0,Jumanji (1995),"Chronicles of Narnia: The Lion, the Witch and ..."
43957,2,158,1,Jumanji (1995),Casper (1995)
230770,2,919,1,Jumanji (1995),"Wizard of Oz, The (1939)"


### User to user match

In [23]:
r = ratings.copy()
r['hour'] = r['timestamp'].apply(lambda x: datetime.fromtimestamp(x).hour)
r.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6


In [24]:
users = pd.read_csv('users.csv')
users.head(5)

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478
3,4,23,2.095284
4,5,35,1.75986


In [25]:
users = users.merge(r.groupby('userId').rating.mean().reset_index(), on='userId')
users = users.merge(r.groupby('userId').hour.mean().reset_index(), on='userId')

In [26]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,5.616822
1,2,24,1.891303,3.923077,21.0
2,3,20,4.521478,3.806452,14.370968
3,4,23,2.095284,4.15942,8.0
4,5,35,1.75986,2.864865,0.513514


In [27]:
u = users.copy()
u = u.set_index('userId')
u.columns = ['age', 'time_spent_per_day', 'u_avg_rating', 'hour']

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)
u.head()

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.073572,-0.882006
2,-0.135616,-1.079947,0.426461,1.477906
3,-0.802954,0.712624,0.174541,0.460955
4,-0.30245,-0.940926,0.936982,-0.516406
5,1.699565,-1.169532,-1.859363,-1.664898


In [29]:
def euclidean_dist(x, y):
    return np.linalg.norm(x-y)

In [33]:
dist = []
## only checking for 1 user
userid=1
for user in u.index:
    dist.append(euclidean_dist(u.loc[userid], u.loc[user]))
    
u_rank = pd.DataFrame()
u_rank['id'] = u.index
u_rank['dist'] = dist
u_rank = u_rank.loc[u_rank.id != userid]
u_rank = u_rank.sort_values(by='dist')
u_rank.head()

Unnamed: 0,id,dist
648,649,0.323322
91,92,0.418899
408,409,0.682935
624,625,0.77912
176,177,0.816294


In [35]:
ratings.loc[ratings.userId==649].sort_values(by='rating', ascending=False).head(100)

Unnamed: 0,userId,movieId,rating,timestamp
56484,408,161,5.0,831667447
23259,176,318,5.0,965402064
23312,176,2019,5.0,965402139
23247,176,44,5.0,965403137
23248,176,50,5.0,965402114
...,...,...,...,...
23250,176,112,4.0,965402961
23245,176,1,4.0,965402628
10937,91,3255,4.0,949282115
23267,176,552,4.0,965403078
