In [1]:
# inspired by https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system#Collaborative-Filtering

In [2]:
import pandas as pd 
import numpy as np 

In [3]:
credit=pd.read_csv('tmdb_5000_credits.csv')
movie=pd.read_csv('tmdb_5000_movies.csv')

In [4]:
df = credit.merge(movie, left_on=['movie_id','title'], right_on=['id','title'])
df.drop('id', axis=1, inplace=True)
df.head(1)

Unnamed: 0,movie_id,title,cast,crew,budget,genres,homepage,keywords,original_language,original_title,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,7.2,11800


In [5]:
df.shape

(4803, 22)

# Demographic Filtering

They offer generalized recommendations to every user, based on movie popularity and/or genre. The System recommends the same movies to users with similar demographic features. Since each user is different, this approach is considered to be too simple. The basic idea behind this system is that movies that are more popular and critically acclaimed will have a higher probability of being liked by the average audience.

## Find the most popular items

In [6]:
# Caculated weighted rating as it is not fair to campare 5 average rating with 10 votes and 4.5 average rating with 10000 votes

In [7]:
# C is the mean vote across the whole report
C = df['vote_average'].mean()

In [8]:
# m is the minimum votes required to be listed in the chart
m = df['vote_count'].quantile(0.9)

In [9]:
q_movies = df[df['vote_count'] >= m]
q_movies.shape

(481, 22)

In [10]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
q_movies['adj_score'] = q_movies.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q_movies['adj_score'] = q_movies.apply(weighted_rating, axis=1)


In [12]:
q_movies = q_movies.sort_values('adj_score', ascending=False)
q_movies[['title','vote_count','vote_average','adj_score']]

Unnamed: 0,title,vote_count,vote_average,adj_score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.920020
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
...,...,...,...,...
41,Green Lantern,2487,5.1,5.521697
337,A Good Day to Die Hard,3493,5.2,5.507643
193,After Earth,2532,5.0,5.459420
91,Independence Day: Resurgence,2491,4.9,5.406234


## Non-personalized Suggestions- Association Rules

In [13]:
# to be continued

# Content Based Filtering

They suggest similar items based on a particular item. This system uses item metadata, such as genre, director, description, actors, etc. for movies, to make these recommendations. The general idea behind these recommender systems is that if a person liked a particular item, he or she will also like an item that is similar to it.

## Movie Description Based Recommendation

In [14]:
import nltk
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords

In [15]:
df['overview'] = df['overview'].fillna('')

In [16]:
def text_process(mess):
    #mess = mess.fillna('')

    # Check characters to see if they are in punctuation
    nopunc = [char.lower() for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Remove stopwords
    nostop = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in nostop]

In [17]:
#df['overview'].head(5).apply(text_process)[0]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['overview'])

count = CountVectorizer(analyzer=text_process)
count_matrix = count.fit_transform(df['overview'])
# Print total number of vocab words
#print(len(count_matrix))

In [19]:
#messages_bow = bow_transformer.transform(df['overview'])

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(count_matrix)

In [21]:
tfidf_matrix = tfidf_transformer.transform(count_matrix)
print(tfidf_matrix.shape)

(4803, 21144)


In [22]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(4803, 20978)

In [23]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [25]:
idx = indices['The Dark Knight Rises']
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [26]:
sim_scores[1:11]

[(65, 0.30151176591665485),
 (299, 0.29857045255396825),
 (428, 0.2878505467001694),
 (1359, 0.264460923827995),
 (3854, 0.18545003006561456),
 (119, 0.16799626199850706),
 (2507, 0.16682891043358278),
 (9, 0.1337400906655523),
 (1181, 0.13219702138476813),
 (210, 0.13045537014449818)]

In [27]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    movie_sim = [round(i[1],2) for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices], movie_sim

In [28]:
get_recommendations('The Dark Knight Rises')

(65                              The Dark Knight
 299                              Batman Forever
 428                              Batman Returns
 1359                                     Batman
 3854    Batman: The Dark Knight Returns, Part 2
 119                               Batman Begins
 2507                                  Slow Burn
 9            Batman v Superman: Dawn of Justice
 1181                                        JFK
 210                              Batman & Robin
 Name: title, dtype: object,
 [0.3, 0.3, 0.29, 0.26, 0.19, 0.17, 0.17, 0.13, 0.13, 0.13])

## Credits, Genres and Keywords Based Recommender

In [29]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [30]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_producer(x):
    for i in x:
        if i['job'] == 'Producer':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names
    return []

In [31]:
df['director'] = df['crew'].apply(get_director)
df['producer'] = df['crew'].apply(get_producer)

In [32]:
features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [33]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [34]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [35]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df['soup'] = df.apply(create_soup, axis=1)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [38]:
# Reset index of our main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [39]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

(4638    Amidst the Devil's Wings
 65               The Dark Knight
 96                     Inception
 119                Batman Begins
 4099                 Harsh Times
 2060          Out of the Furnace
 210               Batman & Robin
 4408              Jimmy and Judy
 1431                Premium Rush
 95                  Interstellar
 Name: title, dtype: object,
 [0.13, 0.12, 0.12, 0.11, 0.09, 0.09, 0.08, 0.08, 0.08, 0.08])

# Collaborative Filtering

This system matches persons with similar interests and provides recommendations based on this matching. Collaborative filters do not require item metadata like its content-based counterparts.

Essentially, we want to turn the recommendation problem into an optimization problem. We can view it as how good we are in predicting the rating for items given a user. One common metric is Root Mean Square Error (RMSE). The lower the RMSE, the better the performance.

In [40]:
ratings = pd.read_csv('ratings_small.csv')
print(len(ratings))
ratings.head()

100004


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


## Cosine Similarity (Item Based)

In [41]:
user_ratings_pivot0 = ratings.pivot(index='userId', columns='movieId', values='rating')
user_ratings_pivot0

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [42]:
avg_ratings = user_ratings_pivot0.mean(axis=1)
user_ratings_pivot = user_ratings_pivot0.sub(avg_ratings, axis=0)
user_ratings_pivot.fillna(0, inplace=True)

In [43]:
# Change from user_based to item_based
movie_ratings_pivot = user_ratings_pivot.T

In [44]:
movie_ratings_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.00,0.0,-0.465909,0.0,0.244444,0.0,...,0.000000,0.269231,-0.296724,0.000000,0.0,0.0,0.0,0.0,0.193548,1.082609
2,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,1.603448,0.000000,0.000000,-0.285714,0.0,0.0,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.09,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,-0.285714,0.0,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
5,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,-0.285714,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161944,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
162376,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
162542,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
162672,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000


In [45]:
similarities = cosine_similarity(movie_ratings_pivot)
cosine_similarity_df = pd.DataFrame(similarities, 
                                    index=movie_ratings_pivot.index,
                                    columns=movie_ratings_pivot.index)
cosine_similarity_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.042287,-0.064368,-0.080701,-0.041198,-0.014749,-0.028232,0.005169,-0.078349,-0.048015,...,-0.009536,0.04451,-0.116383,-0.150744,-0.150744,0.033538,0.116383,0.0,0.0,0.009536
2,-0.042287,1.0,-0.031495,-0.059223,-0.063168,-0.102614,0.036673,0.049508,-0.024567,0.200814,...,0.0,0.079861,0.112288,-0.013585,-0.013585,0.060176,-0.112288,0.0,0.0,0.0
3,-0.064368,-0.031495,1.0,0.049531,0.17535,-0.086597,-0.034197,0.179569,0.042033,0.016735,...,0.0,0.0,0.0,-0.012656,-0.012656,0.0,0.0,0.0,0.0,0.0
4,-0.080701,-0.059223,0.049531,1.0,0.052369,-0.005074,0.07509,-0.105059,0.04235,0.03938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.041198,-0.063168,0.17535,0.052369,1.0,0.066489,-0.009678,0.017522,-0.029532,-0.000736,...,0.0,-0.275465,0.0,-0.015511,-0.015511,0.0,0.0,0.0,0.0,0.0


In [46]:
# Find movies that are similar to movieId = 1
cosine_similarity_df[1].sort_values(ascending=False).head(10)

movieId
1        1.000000
3114     0.401537
78499    0.267200
2355     0.244279
471      0.206070
8961     0.205355
3034     0.200939
1198     0.198590
58559    0.198550
4886     0.198196
Name: 1, dtype: float64

## KNN (User Based)

### 1. Step by Step KNN

In [47]:
user_ratings_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.513158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,-0.348039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.09,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.000000,0.0,0.00,0.0,0.0,0.352941,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.193548,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
similarities = cosine_similarity(user_ratings_pivot)
cosine_similarity_df = pd.DataFrame(similarities, 
                                    index=user_ratings_pivot.index,
                                    columns=user_ratings_pivot.index)
cosine_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.00362,-0.002274,0.0,-0.070321,0.0,0.042632,0.0,...,0.0,0.0,0.018643,0.001031,0.0,0.0,0.0,0.044095,0.0,-0.013096
2,0.0,1.0,-0.001852,-0.004854,0.012639,0.0,0.042691,0.021066,0.011109,-0.007989,...,-0.018248,-0.021546,0.018902,-0.058952,0.028515,-0.106828,-0.007999,-0.041628,-0.090233,0.056258
3,0.0,-0.001852,1.0,0.018594,-0.025903,-0.0632,0.0549,0.026488,-0.036187,0.038021,...,0.044297,0.019581,0.070702,0.030669,0.143705,0.096713,0.027451,0.089297,-0.009815,0.062276
4,0.00362,-0.004854,0.018594,1.0,0.010801,0.019224,0.057519,0.05543,-0.010442,0.005126,...,0.011978,0.006569,0.027687,0.092092,0.021334,0.040833,0.018428,0.028642,0.019848,0.032749
5,-0.002274,0.012639,-0.025903,0.010801,1.0,-0.005843,-0.015075,-0.038886,0.013708,0.0305,...,0.046134,0.001903,0.00162,0.036819,-0.038269,-0.019537,-0.071721,0.00376,-0.029455,-0.036814


In [49]:
# find the users that are similar to user 1
nearest_neighbors = cosine_similarity_df[1].sort_values(ascending=False)[1:4].index
nearest_neighbors

Int64Index([35, 197, 539], dtype='int64', name='userId')

1. We find the 3 most similar user.
2. We then find the ratings there users gave to the movie from the orginal rating dataframe and get the mean.
This rating represents the rating the user would likely give to movieX based on the rating users similar to them gave it.

In [50]:
ratings[(ratings['movieId'] == 197)&(ratings['userId'] == 539)]

Unnamed: 0,userId,movieId,rating,timestamp


### 2. Sklearn KNN Model

In [51]:
# Use the knn model to predict the target user's rating for the movie
# For example, I want to know how much user 1 would rate movie 1

In [52]:
# movieId = 1
user_ratings_pivot.drop(1, axis=1, inplace=True)
# target userId = 1
target_user_x = user_ratings_pivot.loc[[1]]
target_user_x

movieId,2,3,4,5,6,7,8,9,10,11,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
other_users_y = user_ratings_pivot0[1]
other_users_y

userId
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
      ... 
667    NaN
668    NaN
669    NaN
670    4.0
671    5.0
Name: 1, Length: 671, dtype: float64

In [54]:
# find users who rated movieId = 1
other_users_x = user_ratings_pivot[other_users_y.notnull()]
other_users_x

movieId,2,3,4,5,6,7,8,9,10,11,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-0.465909,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,-0.621765,0.000000,0.000000,1.878235,1.378235,0.000000,0.0,0.000000,0.378235,-0.121765,...,0.0,-2.121765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,-0.534279,-0.534279,-0.534279,0.000000,-0.534279,-0.534279,0.0,-0.534279,-0.534279,-0.534279,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664,0.000000,0.000000,0.000000,0.000000,0.203276,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# find the ratings those users gave
other_users_y.dropna(inplace=True)
other_users_y

userId
7      3.0
9      4.0
13     5.0
15     2.0
19     3.0
      ... 
660    2.5
663    4.0
664    3.5
670    4.0
671    5.0
Name: 1, Length: 247, dtype: float64

In [56]:
from sklearn.neighbors import KNeighborsRegressor
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=3)
user_knn.fit(other_users_x, other_users_y)
user_user_pred = user_knn.predict(target_user_x)
print(user_user_pred)

[3.66666667]


## SVD (Singular value secomposition)

In [57]:
from surprise import Reader, Dataset, SVD
from surprise import accuracy
#from sklearn.model_selection import KFold
from surprise.model_selection import cross_validate
reader = Reader()

In [58]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
#kf = KFold(n_splits=5)
#kf.split(data)

In [59]:
# Measuring sparity per column
user_ratings_pivot0.notnull().sum()

movieId
1         247
2         107
3          59
4          13
5          56
         ... 
161944      1
162376      1
162542      1
162672      1
163949      1
Length: 9066, dtype: int64

### Method 1: Cross Validation

In [60]:
svd = SVD()

In [61]:
cross_validate(svd, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8929  0.8930  0.8912  0.8982  0.8970  0.8945  0.0027  
MAE (testset)     0.6858  0.6885  0.6842  0.6912  0.6935  0.6886  0.0034  
Fit time          4.24    4.17    4.18    4.17    4.19    4.19    0.02    
Test time         0.12    0.11    0.11    0.11    0.11    0.11    0.01    


{'test_rmse': array([0.89285237, 0.89302635, 0.89121126, 0.89822261, 0.89703548]),
 'test_mae': array([0.6857555 , 0.68854848, 0.6842011 , 0.69118483, 0.69345595]),
 'fit_time': (4.236770868301392,
  4.1745429039001465,
  4.177796125411987,
  4.168962001800537,
  4.193847894668579),
 'test_time': (0.12377071380615234,
  0.10924196243286133,
  0.10797691345214844,
  0.10815715789794922,
  0.10840129852294922)}

In [62]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fec98a7ec70>

In [63]:
# Movies previously rated by user 1
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [64]:
# predict how much user 1's rating for movie 31
svd.predict(1, 31)

Prediction(uid=1, iid=31, r_ui=None, est=2.3318010436681917, details={'was_impossible': False})

### Method 2: Train Test Split

In [65]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.3)

In [66]:
svd.fit(trainset)
predictions = svd.test(testset)

In [67]:
accuracy.mse(predictions)
accuracy.rmse(predictions)

MSE: 0.8036
RMSE: 0.8964


0.8964337132418219

### Method 3: Cross Validation + GridSearchCV

In [68]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.008],
              'reg_all': [0.2, 0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8974031489981188
{'n_epochs': 15, 'lr_all': 0.008, 'reg_all': 0.2}


In [69]:
print(gs.best_score['mae'])
print(gs.best_params['mae'])

0.6937538550744824
{'n_epochs': 15, 'lr_all': 0.008, 'reg_all': 0.2}


In [70]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fec93844d30>

In [71]:
# Can see the whole results
results_df = pd.DataFrame.from_dict(gs.cv_results)

In [72]:
# predict how much user 1's rating for movie 31
svd.predict(1, 31)

Prediction(uid=1, iid=31, r_ui=None, est=2.331806483040878, details={'was_impossible': False})