In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Single place to configure where the CSV files live; edit this one constant to
# run the notebook against a different copy of the data.
DATA_DIR = Path('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data')

links = pd.read_csv(DATA_DIR / 'links.csv')
movies = pd.read_csv(DATA_DIR / 'movies.csv')
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')
tags = pd.read_csv(DATA_DIR / 'tags.csv')

In [3]:
# Preview the first rows of the movies table (movieId, title, pipe-separated genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# (rows, columns) of the movies table.
movies.shape

(9742, 3)

In [5]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# (rows, columns) of the ratings table.
ratings.shape

(100836, 4)

In [7]:
# Preview the first rows of the tags table (userId, movieId, free-text tag, timestamp).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
# (rows, columns) of the tags table.
tags.shape

(3683, 4)

In [9]:
# Number of distinct movies that received at least one tag.
tags['movieId'].nunique(dropna=False)

1572

In [10]:
# Number of movies that appear in both the tags table and the ratings table.
tagged_ids = set(tags['movieId'].unique())
rated_ids = set(ratings['movieId'].unique())
len(tagged_ids.intersection(rated_ids))

1554

In [11]:
# Number of distinct users who wrote at least one tag.
tags['userId'].nunique(dropna=False)

58

In [12]:
# Attach each user's tags to their ratings of the same movie. This is a left join, so
# every rating is kept and a rating with several tags is repeated once per tag.
ratings_tags = ratings.merge(tags, how='left', on=['userId', 'movieId'])

In [13]:
# Tag rows that did not land on any rating row in the left join above.
len(tags) - int(ratings_tags['tag'].notna().sum())

207

In [15]:
# Inspect the merged ratings + tags frame; the overlapping timestamp columns get the
# pandas `_x` (ratings) and `_y` (tags) suffixes.
ratings_tags

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
0,1,1,4.0,964982703,,
1,1,3,4.0,964981247,,
2,1,6,4.0,964982224,,
3,1,47,5.0,964983815,,
4,1,50,5.0,964982931,,
...,...,...,...,...,...,...
102672,610,166534,4.0,1493848402,,
102673,610,168248,5.0,1493850091,Heroic Bloodshed,1.493844e+09
102674,610,168250,5.0,1494273047,,
102675,610,168252,5.0,1493846352,,


In [16]:
# Bring in title and genres for every rating row (left join keeps all ratings).
ratings_tags_genres = ratings_tags.merge(movies, how='left', on='movieId')

In [17]:
# How many rating rows belong to movies carrying the '(no genres listed)' placeholder.
int((ratings_tags_genres['genres'] == '(no genres listed)').sum())

49

In [18]:
# Drop the rating rows whose genres field is the '(no genres listed)' placeholder.
# NOTE(review): reset_index() without drop=True keeps the previous row labels as a new
# `index` column, and re-running this cell inserts the labels again under another
# name — confirm the extra column is wanted, otherwise pass drop=True.
ratings_tags_genres = ratings_tags_genres.loc[ratings_tags_genres.genres != '(no genres listed)'].reset_index()

In [19]:
# Inspect the frame after removing the rows without genres.
ratings_tags_genres

Unnamed: 0,index,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,0,1,1,4.0,964982703,,,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy|Romance
2,2,1,6,4.0,964982224,,,Heat (1995),Action|Crime|Thriller
3,3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...,...,...,...
102623,102672,610,166534,4.0,1493848402,,,Split (2017),Drama|Horror|Thriller
102624,102673,610,168248,5.0,1493850091,Heroic Bloodshed,1.493844e+09,John Wick: Chapter Two (2017),Action|Crime|Thriller
102625,102674,610,168250,5.0,1494273047,,,Get Out (2017),Horror
102626,102675,610,168252,5.0,1493846352,,,Logan (2017),Action|Sci-Fi


In [20]:
def change_genre(s):
    """Turn a pipe-separated genre string into space-separated single-word tokens.

    Spaces and hyphens inside a genre name are removed so each genre stays one
    token for a whitespace/word tokenizer, e.g. ``'Action|Sci-Fi'`` becomes
    ``'Action SciFi'``.

    Parameters
    ----------
    s : str
        Genres joined by ``'|'``.

    Returns
    -------
    str
        The genres joined by single spaces. A non-string input (for example the
        NaN a left merge produces for a movie with no metadata row) yields an
        empty string instead of raising ``AttributeError``.
    """
    # Missing values arrive as float NaN / None and have no string methods.
    if not isinstance(s, str):
        return ''
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [21]:
# Normalise the genres column so each genre becomes one whitespace-separated token.
ratings_tags_genres['genres'] = ratings_tags_genres['genres'].map(change_genre)

In [22]:
# Build one "document" per movie: every tag given to that movie (by any user) is
# title-cased, stripped of spaces and hyphens so it stays a single token, and the
# tokens are joined with spaces.
movie_tags = {
    movie_id: ' '.join(
        str(raw_tag).title().replace(' ', '').replace('-', '')
        for raw_tag in movie_rows.tag.values
    )
    for movie_id, movie_rows in tqdm(tags.groupby('movieId'))
}

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [23]:
# Add the per-movie tag document as `all_tags`; movies nobody tagged get NaN.
all_tags_by_movie = pd.Series(movie_tags, name='all_tags')
ratings_tags_genres = ratings_tags_genres.join(all_tags_by_movie, on='movieId', how='left')

In [24]:
# Inspect the frame with the new `all_tags` column.
ratings_tags_genres

Unnamed: 0,index,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags
0,0,1,1,4.0,964982703,,,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar Pixar Fun
1,1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy Romance,Moldy Old
2,2,1,6,4.0,964982224,,,Heat (1995),Action Crime Thriller,
3,3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Mystery TwistEnding SerialKiller
4,4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...
...,...,...,...,...,...,...,...,...,...,...
102623,102672,610,166534,4.0,1493848402,,,Split (2017),Drama Horror Thriller,
102624,102673,610,168248,5.0,1493850091,Heroic Bloodshed,1.493844e+09,John Wick: Chapter Two (2017),Action Crime Thriller,Action DarkHero GunTactics Hitman KeanuReeves ...
102625,102674,610,168250,5.0,1494273047,,,Get Out (2017),Horror,
102626,102675,610,168252,5.0,1493846352,,,Logan (2017),Action SciFi,Dark Emotional Gritty Heartbreaking Predictibl...


In [25]:
# Smallest and largest number of rating rows any single user contributes.
ratings_tags_genres.groupby('userId')['rating'].count().agg(['min', 'max'])

min      20
max    2792
Name: rating, dtype: int64

In [26]:
# Per-user rating statistics. Selecting `rating` before aggregating avoids computing
# min/max/mean/median for every other column only to throw those results away.
user_ratings = ratings.groupby('userId')['rating'].agg(['min', 'max', 'mean', 'median'])

In [27]:
# Give the mean a user-specific name so it cannot clash with the movie-level mean later.
user_ratings = user_ratings.rename(columns={'mean': 'r_mean_user'})

In [28]:
# r_value_user = (max - min) * median, or just the median when the user gave the same
# rating every time (max == min). Computed column-wise instead of with a Python-level
# row-by-row apply; the result is identical.
user_ratings['r_value_user'] = (
    (user_ratings['max'] - user_ratings['min']) * user_ratings['median']
).where(user_ratings['max'] != user_ratings['min'], user_ratings['median'])

In [29]:
# Preview the per-user rating statistics.
user_ratings.head()

Unnamed: 0_level_0,min,max,r_mean_user,median,r_value_user
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,5.0,4.366379,5.0,20.0
2,2.0,5.0,3.948276,4.0,12.0
3,0.5,5.0,2.435897,0.5,2.25
4,1.0,5.0,3.555556,4.0,16.0
5,1.0,5.0,3.636364,4.0,16.0


In [30]:
# Per-movie rating statistics. Selecting `rating` before aggregating avoids computing
# min/max/mean/median for every other column only to throw those results away.
movie_ratings = ratings.groupby('movieId')['rating'].agg(['min', 'max', 'mean', 'median'])

In [31]:
# Give the mean a movie-specific name so it cannot clash with the user-level mean later.
movie_ratings = movie_ratings.rename(columns={'mean': 'r_mean_movie'})

In [32]:
# r_value_movie = (max - min) * median, or just the median when every rating of the
# movie is the same (max == min). Computed column-wise instead of with a Python-level
# row-by-row apply; the result is identical.
movie_ratings['r_value_movie'] = (
    (movie_ratings['max'] - movie_ratings['min']) * movie_ratings['median']
).where(movie_ratings['max'] != movie_ratings['min'], movie_ratings['median'])

In [33]:
# Preview the per-movie rating statistics.
movie_ratings.head()

Unnamed: 0_level_0,min,max,r_mean_movie,median,r_value_movie
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.5,5.0,3.92093,4.0,18.0
2,0.5,5.0,3.431818,3.5,15.75
3,0.5,5.0,3.259615,3.0,13.5
4,1.0,3.0,2.357143,3.0,6.0
5,0.5,5.0,3.071429,3.0,13.5


In [34]:
# Attach the user-level features to every rating row, looked up by userId.
user_features = user_ratings[['r_mean_user', 'r_value_user']]
ratings_tags_genres = ratings_tags_genres.join(user_features, on='userId', how='left')

In [35]:
# Attach the movie-level features to every rating row, looked up by movieId.
movie_features = movie_ratings[['r_mean_movie', 'r_value_movie']]
ratings_tags_genres = ratings_tags_genres.join(movie_features, on='movieId', how='left')

In [36]:
# Preview the frame with the user- and movie-level rating features attached.
ratings_tags_genres.head()

Unnamed: 0,index,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags,r_mean_user,r_value_user,r_mean_movie,r_value_movie
0,0,1,1,4.0,964982703,,,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar Pixar Fun,4.366379,20.0,3.92093,18.0
1,1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy Romance,Moldy Old,4.366379,20.0,3.259615,13.5
2,2,1,6,4.0,964982224,,,Heat (1995),Action Crime Thriller,,4.366379,20.0,3.946078,16.0
3,3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Mystery TwistEnding SerialKiller,4.366379,20.0,3.975369,18.0
4,4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...,4.366379,20.0,4.237745,18.0


In [38]:
# Number of rating rows whose movie has no tags at all (all_tags is NaN).
ratings_tags_genres.all_tags.isna().sum()

52503

In [40]:
# Number of rating rows whose movie has at least one tag.
ratings_tags_genres['all_tags'].notna().sum()

50125

In [51]:
# Keep only the rating rows whose movie has a tag document; the original row labels
# are kept, so the index has gaps afterwards.
ratings_tags_genres = ratings_tags_genres.dropna(subset=['all_tags'])

In [52]:
# Bag-of-words counts over the space-separated genre strings: one row per rating row,
# one column per genre token.
count_vect = CountVectorizer()

gen_feat = count_vect.fit_transform(ratings_tags_genres.genres)

In [54]:
# Re-weight the raw genre counts with TF-IDF.
tfidf_transformer = TfidfTransformer()

genres_tfidf = tfidf_transformer.fit_transform(gen_feat)

In [53]:
# Show the genre count matrix with readable column names.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement when the
# installed version has it and fall back to the old method otherwise.
genre_columns = (count_vect.get_feature_names_out()
                 if hasattr(count_vect, 'get_feature_names_out')
                 else count_vect.get_feature_names())
pd.DataFrame(gen_feat.toarray(), columns=genre_columns)

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
4,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50120,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
50121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
50122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
50123,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [55]:
# Show the TF-IDF weighted genre matrix with readable column names.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement when the
# installed version has it and fall back to the old method otherwise.
genre_tfidf_columns = (count_vect.get_feature_names_out()
                       if hasattr(count_vect, 'get_feature_names_out')
                       else count_vect.get_feature_names())
pd.DataFrame(genres_tfidf.toarray(), columns=genre_tfidf_columns)

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,0.000000,0.361211,0.541923,0.50779,0.310225,0.000000,0.0,0.0,0.470909,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.00000,0.620114,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.784512,0.000000,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.819353,0.000000,0.000000,0.573289,0.0,0.0
3,0.000000,0.000000,0.000000,0.00000,0.000000,0.546055,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.686412,0.000000,0.000000,0.480273,0.0,0.0
4,0.000000,0.491568,0.000000,0.00000,0.422181,0.543005,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.534106,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50120,0.000000,0.000000,0.867860,0.00000,0.496808,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
50121,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.0,0.0
50122,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.0
50123,0.545042,0.000000,0.000000,0.00000,0.000000,0.629551,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.553710,0.0,0.0


In [56]:
# Bag-of-words counts over the per-movie tag documents.
# NOTE(review): this refits the same `count_vect` that produced `gen_feat`, so from here
# on its vocabulary describes the tags and the genre vocabulary can no longer be read
# from it; any cell that labels genre columns via `count_vect` must run before this one.
tag_feat = count_vect.fit_transform(ratings_tags_genres.all_tags)

In [57]:
# Re-weight the raw tag counts with TF-IDF.
# NOTE(review): this refits the same `tfidf_transformer` that produced `genres_tfidf`,
# so the IDF weights learned from the genres are overwritten by the tag ones.
tag_tfidf = tfidf_transformer.fit_transform(tag_feat)

In [62]:
# Show the TF-IDF weighted tag matrix with readable column names.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement when the
# installed version has it and fall back to the old method otherwise.
tag_columns = (count_vect.get_feature_names_out()
               if hasattr(count_vect, 'get_feature_names_out')
               else count_vect.get_feature_names())
pd.DataFrame(tag_tfidf.toarray(), columns=tag_columns)

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# Inspect the tag documents that were fed to the vectorizer; the index keeps the
# original row labels, so it has gaps where untagged rows were dropped.
ratings_tags_genres.all_tags

0                                           Pixar Pixar Fun
1                                                 Moldy Old
3                          Mystery TwistEnding SerialKiller
4         Mindfuck Suspense Thriller Tricky TwistEnding ...
6                                Crime OffBeatComedy Quirky
                                ...                        
102608            CrudeHumor Mindfuck Sarcasm Satire Vulgar
102617                                              Intense
102621    BeautifulVisuals Cerebral Cinematography GoodC...
102624    Action DarkHero GunTactics Hitman KeanuReeves ...
102626    Dark Emotional Gritty Heartbreaking Predictibl...
Name: all_tags, Length: 50125, dtype: object