In [2]:
import pandas as pd
from itertools import permutations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Movie metadata; later cells read its 'title' and 'genres' columns
movie = pd.read_csv(filepath_or_buffer='movies.csv')
# Per-user ratings; later cells read 'userId', 'movieId', 'title' and 'rating'
rating = pd.read_csv(filepath_or_buffer='user_ratings.csv')

# Simple Pair Recommender

In [11]:
def find_pair(x):
    """Return every ordered pair of distinct positions in ``x`` as a DataFrame.

    Parameters
    ----------
    x : pandas.Series
        Values to pair up (read through ``x.values``).

    Returns
    -------
    pandas.DataFrame
        One row per 2-permutation, with columns ``movie_a`` and ``movie_b``.
        Both (a, b) and (b, a) are present; fewer than two values gives an
        empty frame.
    """
    ordered_pairs = [duo for duo in permutations(x.values, 2)]
    return pd.DataFrame(ordered_pairs, columns=['movie_a', 'movie_b'])

# Build the ordered title pairs separately for each user, then discard the
# index produced by the group-wise apply in favour of a fresh 0..n-1 index
movie_combo = (
    rating
    .groupby('userId')['title']
    .apply(find_pair)
    .reset_index(drop=True)
)
print(movie_combo)

                   movie_a                           movie_b
0         Toy Story (1995)           Grumpier Old Men (1995)
1         Toy Story (1995)                       Heat (1995)
2         Toy Story (1995)       Seven (a.k.a. Se7en) (1995)
3         Toy Story (1995)        Usual Suspects, The (1995)
4         Toy Story (1995)        From Dusk Till Dawn (1996)
...                    ...                               ...
60793295         31 (2016)                 Gen-X Cops (1999)
60793296         31 (2016)                  Bloodmoon (1997)
60793297         31 (2016)  Sympathy for the Underdog (1971)
60793298         31 (2016)                     Hazard (2005)
60793299         31 (2016)                Blair Witch (2016)

[60793300 rows x 2 columns]


In [13]:
# Count how often each ordered (movie_a, movie_b) pair occurs
combo_count = movie_combo.groupby(by=['movie_a', 'movie_b']).size()
# Flatten the pair-indexed counts into a frame with a 'count' column
combo_count_df = combo_count.reset_index(name='count')
print(combo_count_df)

                                            movie_a  \
0                                        '71 (2014)   
1                                        '71 (2014)   
2                                        '71 (2014)   
3                                        '71 (2014)   
4                                        '71 (2014)   
...                                             ...   
26309176  À nous la liberté (Freedom for Us) (1931)   
26309177  À nous la liberté (Freedom for Us) (1931)   
26309178  À nous la liberté (Freedom for Us) (1931)   
26309179  À nous la liberté (Freedom for Us) (1931)   
26309180  À nous la liberté (Freedom for Us) (1931)   

                                             movie_b  count  
0                        (500) Days of Summer (2009)      1  
1                         10 Cloverfield Lane (2016)      1  
2                                   127 Hours (2010)      1  
3         13 Assassins (Jûsan-nin no shikaku) (2010)      1  
4                            

In [18]:
# Order the pairs from most to least frequent
combo_count_df = combo_count_df.sort_values(by='count', ascending=False)
# The 5 titles that most often share a user with Toy Story (1995)
combo_count_df.loc[combo_count_df['movie_a'] == 'Toy Story (1995)'].head(5)

Unnamed: 0,movie_a,movie_b,count
24019673,Toy Story (1995),Forrest Gump (1994),154
24023000,Toy Story (1995),Pulp Fiction (1994),141
24023672,Toy Story (1995),"Shawshank Redemption, The (1994)",137
24024033,Toy Story (1995),Star Wars: Episode IV - A New Hope (1977),134
24021020,Toy Story (1995),Jurassic Park (1993),132


# Content Based Recommender

In [5]:
def str_list(x):
    """Split a ``|``-delimited string and keep at most its first three parts.

    Parameters
    ----------
    x : str
        Pipe-separated text, e.g. ``"Adventure|Animation|Children"``.

    Returns
    -------
    list of str
        The first three pieces, in their original order (fewer if ``x`` has
        fewer than three).
    """
    parts = x.split("|")
    return parts[:3]

# Replace each pipe-delimited genre string with its list of leading genres
movie['genres'] = movie['genres'].map(str_list)

In [6]:
def to_low(x):
    """Return a new list holding the lower-cased form of every string in ``x``.

    Parameters
    ----------
    x : iterable of str
        Strings to convert.

    Returns
    -------
    list of str
        Lower-cased strings, in the same order as ``x``.
    """
    return list(map(str.lower, x))

# Lower-case every genre name in each movie's genre list
movie['genres'] = movie['genres'].map(to_low)

In [7]:
# Metadata "soup": each movie's genre list joined into one space-separated string
movie['soup'] = movie['genres'].map(' '.join)

In [8]:
# Token-count matrix of the soup strings, with English stop words removed
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(raw_documents=movie['soup'])

In [9]:
# Pairwise cosine similarity between every pair of rows of the count matrix
# (omitting the second argument compares the matrix with itself)
cosine_sim = cosine_similarity(count_matrix)

In [10]:
# Reverse lookup: movie title -> index label of that movie in `movie`
indices = pd.Series(data=movie.index, index=movie['title'])

In [11]:
def recommend(title, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarities are read from the module-level ``cosine_sim`` matrix; the row
    to read is found through the module-level ``indices`` title lookup, and the
    result is taken from the module-level ``movie`` frame.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``movie['title']``, most similar first.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # The index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once yields a Series rather than a scalar;
    # fall back to its first occurrence so the row lookup below stays 1-D
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Pair every row position with its similarity to the queried movie
    scores = list(enumerate(cosine_sim[idx]))
    # Rank by similarity, highest first (sorted() is stable, so ties keep row order)
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    # Exclude the queried movie by position instead of assuming it ranks first:
    # other movies can tie with it at the top similarity and sort ahead of it
    movie_indices = [i for i, _ in scores if i != idx][:top_n]
    # Return the most similar movies' titles
    return movie['title'].iloc[movie_indices]

In [12]:
# Movies whose genre profile is closest to Toy Story (1995)
recommend(title='Toy Story (1995)')

12                                          Balto (1995)
322                                Lion King, The (1994)
506                                       Aladdin (1992)
534                       All Dogs Go to Heaven 2 (1996)
551                     James and the Giant Peach (1996)
559                                     Space Jam (1996)
578                              Oliver & Company (1988)
673    Land Before Time III: The Time of the Great Gi...
787                                 Pete's Dragon (1977)
789                           Alice in Wonderland (1951)
Name: title, dtype: object

In [13]:
# Movies whose genre profile is closest to Jumanji (1995)
recommend(title='Jumanji (1995)')

53                     Indian in the Cupboard, The (1995)
109                     NeverEnding Story III, The (1994)
701                              Wizard of Oz, The (1939)
767                       Escape to Witch Mountain (1975)
1514            Darby O'Gill and the Little People (1959)
1556                                  Return to Oz (1985)
1565                                     Tall Tale (1995)
1617                        NeverEnding Story, The (1984)
1618    NeverEnding Story II: The Next Chapter, The (1...
1799                        Santa Claus: The Movie (1985)
Name: title, dtype: object

# Collaborative Filtering

In [4]:
# Reader left at its default settings
# NOTE(review): confirm the default rating scale matches the range of rating['rating']
reader = Reader()
# Wrap the (user, item, rating) columns in a surprise Dataset
data = Dataset.load_from_df(df=rating[['userId', 'movieId', 'rating']], reader=reader)
# 5-fold splitter
kf = KFold(n_splits=5)
# NOTE(review): split() only hands back a generator, which is echoed as the
# cell output and never iterated here
kf.split(data)

<generator object _BaseKFold.split at 0x7f9d76f622d0>

In [5]:
# SVD model with default hyper-parameters
svd = SVD()
# 5-fold cross-validation, reporting both RMSE and MAE for every fold
cross_validate(algo=svd, data=data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8842  0.8713  0.8640  0.8707  0.8753  0.8731  0.0066  
MAE (testset)     0.6804  0.6692  0.6648  0.6669  0.6710  0.6704  0.0054  
Fit time          6.41    6.97    6.88    7.36    6.46    6.82    0.35    
Test time         0.37    0.23    0.56    0.24    0.28    0.34    0.12    


{'test_rmse': array([0.88415071, 0.87133706, 0.86399262, 0.87072034, 0.87534694]),
 'test_mae': array([0.68036011, 0.66915498, 0.66480937, 0.66685506, 0.67103677]),
 'fit_time': (6.410394906997681,
  6.969261884689331,
  6.882070064544678,
  7.362034797668457,
  6.455087184906006),
 'test_time': (0.3685901165008545,
  0.23425912857055664,
  0.5576927661895752,
  0.2388608455657959,
  0.2785799503326416)}

In [6]:
# Refit the model on every rating in the dataset (no hold-out split)
trainset = data.build_full_trainset()
svd.fit(trainset)
# Estimate how user 1 would rate movie 302, passing 3 as the known true rating
# (the recorded run estimated about 4.02)
svd.predict(uid=1, iid=302, r_ui=3)

Prediction(uid=1, iid=302, r_ui=3, est=4.0151160072345835, details={'was_impossible': False})

# Association Rule

In [3]:
# One row per user, indexed by userId, whose 'movie' cell is the comma-joined
# string of every movieId that user rated
view = (
    rating[['userId', 'movieId']]
    # Movie ids must be strings so they can be joined
    .astype({'movieId': str})
    # Broadcast each user's joined id string onto all of that user's rows
    .assign(movie=lambda df: df.groupby("userId")['movieId'].transform(lambda x: ','.join(x)))
    .drop(columns='movieId')
    # Every row of a user now carries the same string; keep one per user
    .drop_duplicates(subset='userId')
    .sort_values('userId')
    .set_index('userId')
)

In [4]:
# Turn each user's comma-joined id string back into a list of ids, giving a
# plain list of transactions (one list of movie-id strings per user)
view = [joined.split(',') for joined in view['movie']]

In [5]:
# Encoder that learns the set of distinct movie ids across all transactions
encoder = TransactionEncoder()
# One-hot encode the transactions and label each column with the item it
# represents (the encoder exposes those labels as `columns_` once fitted)
onehot = pd.DataFrame(encoder.fit_transform(view), columns=encoder.columns_)

In [8]:
# Frequent itemsets via Apriori: keep itemsets of at most 2 movies whose
# support is at least 0.1, reported by movie id rather than column position
freqent_itemsets = apriori(df=onehot, min_support=0.1, use_colnames=True, max_len=2)
freqent_itemsets

Unnamed: 0,support,itemsets
0,0.352459,(1)
1,0.216393,(10)
2,0.116393,(1028)
3,0.104918,(1035)
4,0.237705,(1036)
...,...,...
5384,0.111475,"(8961, 858)"
5385,0.101639,"(904, 858)"
5386,0.118033,"(912, 858)"
5387,0.114754,"(858, 924)"


In [9]:
# Derive association rules from the frequent itemsets, keeping only rules
# with confidence >= 0.8: among users who viewed the antecedent movie,
# at least 80% also viewed the consequent movie.
rules = association_rules(freqent_itemsets, metric="confidence", min_threshold=0.8)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(3114),(1),0.159016,0.352459,0.132787,0.835052,2.369216,0.076740,3.925717
1,(788),(1),0.134426,0.352459,0.109836,0.817073,2.318208,0.062456,3.539891
2,(10),(356),0.216393,0.539344,0.178689,0.825758,1.531040,0.061978,2.643763
3,(1028),(356),0.116393,0.539344,0.101639,0.873239,1.619076,0.038863,3.634062
4,(1370),(1036),0.111475,0.237705,0.100000,0.897059,3.773834,0.073502,7.405152
...,...,...,...,...,...,...,...,...,...
383,(736),(780),0.201639,0.331148,0.162295,0.804878,2.430572,0.095523,3.427869
384,(788),(780),0.134426,0.331148,0.111475,0.829268,2.504226,0.066960,3.917564
385,(91529),(79132),0.124590,0.234426,0.109836,0.881579,3.760582,0.080629,6.464845
386,(99114),(79132),0.116393,0.234426,0.100000,0.859155,3.664927,0.072714,5.435574
