# Netflix Recommendation
#### <span style="color: cornflowerblue">Team 01 | CSPB 4502 | 11/16/22</span>

The following is a base recommendation using only the netflix dataset.

In [25]:
## LIBRARIES USED
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import iqr
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate

<span style="color: cornflowerblue">Import Dataset and Combine</span>

In [2]:
## IMPORT NETFLIX MOVIE NAMES
netflix_movie_titles = "../data/netflix/movie_titles.csv"

def manual_sep(old_split):
    new_split = old_split[0:2] + [",".join(old_split[2:])]
    return new_split
    
ntfx = pd.read_csv(netflix_movie_titles,
                   encoding = "ISO-8859-1",
                   header = None,
                   names = ['Movie_Id', 'Year', 'Name'],
                   on_bad_lines=manual_sep,
                   engine='python')
ntfx.dropna(subset='Year', inplace=True)
ntfx['Year'] = ntfx['Year'].astype("Int64")
print("Netflix Movie Names:")
print(f'{ntfx.shape = }')
print(ntfx.head().to_string())
print("Netflix Movie Ratings:")

Netflix Movie Names:
ntfx.shape = (17763, 3)
   Movie_Id  Year                          Name
0         1  2003               Dinosaur Planet
1         2  2004    Isle of Man TT 2004 Review
2         3  1997                     Character
3         4  1994  Paula Abdul's Get Up & Dance
4         5  2004      The Rise and Fall of ECW
Netflix Movie Ratings:


In [3]:
## IMPORT NETFLIX MOVIE RATINGS AND COMBINE
netflix_movie_ratings = [f'../data/netflix/combined_data_{i}.txt' for i in range(1, 5)]
stream = StringIO()
movie_number = "1"
for path in netflix_movie_ratings:
    print(f'reading file {path}')
    with open(path, "r") as file:
        patrn = "[0-9]:"
        for line in file:
            if re.search(patrn, line):
                movie_num = line.replace(":\n", "")
            else:
                stream.write(movie_num+","+line)
    file.close()  
stream.seek(0)
print("reading done")
ratings = pd.read_csv(stream,
                      encoding = "ISO-8859-1",
                      names = ['Movie_Id', 'CustomerID', 'Rating', 'Date'],
                      engine='c')
stream.close()
del(stream)
print(f'{ratings.shape = }')
print(ratings.head().to_string())

reading file ../data/netflix/combined_data_1.txt
reading file ../data/netflix/combined_data_2.txt
reading file ../data/netflix/combined_data_3.txt
reading file ../data/netflix/combined_data_4.txt
reading done
ratings.shape = (100480507, 4)
   Movie_Id  CustomerID  Rating        Date
0         1     1488844       3  2005-09-06
1         1      822109       5  2005-05-13
2         1      885013       4  2005-10-19
3         1       30878       4  2005-12-26
4         1      823519       3  2004-05-03


In [4]:
print(f'Number of Movies: {len(ratings["Movie_Id"].unique())}')
print(f'Number of Customers: {len(ratings["CustomerID"].unique())}')
print(f'Total number of Reviews: {len(ratings)}')

Number of Movies: 17770
Number of Customers: 480189
Total number of Reviews: 100480507


<span style="color: cornflowerblue">Collaborative Filtering</span>

There are a few ways we can build a recommendation system. Unfortunately, since the dataset we are using is so large, it is extremely difficult to do anything substantial without a high computation and time cost. For example, a very common step in recommendation systems is to create a sparcity matrix. If we did not make any adjustments to out dataset, we would need a sparcity matrix which contained inforamtion for 17,770 unique movies and 480,189 unique customers with 100,480,507 ratings total. This would amount to a matrix with 8,532,958,530 cells in total - sometyhing that would take too much time and too much computational resources.  
One way to get past this is through "on-the-fly" collaborative filtering. Collaborative filtering is a technique which uses only information for rating profiles fropm different users or items.$^{[1]}$ In order to do this, they locate similar users or items that are in the nearest neighborhood and generate recommendations based on the inforamtion given.  
There are two major types of collaborative filtering - user based filtering and item based filtering. In user based filtering, we look for similar users and base the decision making on the choices made by the set of users. In item based collaboration, we look for similar items based on the items that have already been chosen.  
What we can do is instead of calculating everything at once, we can caluclate recommendations based on who the custyomer is and compare it with customers that are similar. This greatly cuts down computation by eliminating datapoints which aren't very similar. The issue with this is that since we are only computing recommendations for single customers in the moment, and because we are only looking at correlation values, it becomes difficult to produce any type of metric. 

For collaboration, we have created a script to run which will get the get the top similar users based on a specified threshold:

In [5]:
import scripts.filtering as uf

In [6]:
## Define variables and create a similarity filtering object
movie_thresh = 0.6    # threshold proportion of random_ID's movies that a user must have rated to be retained
rho_thresh = 0.4   # threshold a user's correlation coefficient must meet to be retained

user_filter = uf.SimilarityFiltering(ratings,
                                     movie_thresh=movie_thresh,
                                     rho_thresh=rho_thresh,
                                     random_state=2)

In [7]:
## Getting user similarity based on chosen movies for customer 10
user_filter.getUserSimilarity(531050, "Movie_Id")
user_filter.similar_users.sort_values('corr', ascending=False).head()

Unnamed: 0,Movie_Id,CustomerID,Rating,Date,corr
185471,11846,1180814,4,2004-08-20,0.589639
185698,17716,1180814,4,2005-10-21,0.589639
185696,17681,1180814,4,2005-01-10,0.589639
185695,17671,1180814,4,2005-04-12,0.589639
185694,17662,1180814,3,2005-02-14,0.589639


From this, we can then get a score for each movie id based on the correlation given.

In [8]:
## Create a score based on correlation and rating
df = user_filter.similar_users
df['score'] = df['corr']*df['Rating']
df.head()

Unnamed: 0,Movie_Id,CustomerID,Rating,Date,corr,score
0,3,728801,4,2004-04-19,0.405044,1.620175
1,30,728801,2,2004-04-28,0.405044,0.810088
2,83,728801,4,2003-03-10,0.405044,1.620175
3,97,728801,4,2003-03-15,0.405044,1.620175
4,108,728801,3,2004-06-20,0.405044,1.215131


In [9]:
# Get the cumulative score
cum_df = df.groupby('Movie_Id').sum()[['score', 'corr']]
cum_df.columns = ['cum_score', 'cum_corr']
cum_df.head()

Unnamed: 0_level_0,cum_score,cum_corr
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,14.309206,3.873034
5,1.702102,0.425525
8,38.394871,11.912866
9,0.8679,0.43395
13,2.139101,0.42782


In [10]:
## Create our recomendation, make an estimated rating
recommendation = pd.DataFrame(data=(cum_df['cum_score']/cum_df['cum_corr']).to_list(),
                              index = cum_df.index,
                              columns=['Recommendation_Score']).sort_values(by='Recommendation_Score',
                                                                                        ascending=False)
recommendation = recommendation.merge(right=ntfx, left_index=True, right_on='Movie_Id')
recommendation.head(10)

Unnamed: 0,Recommendation_Score,Movie_Id,Year,Name
13943,5.0,13944,1998,A Good Baby
9736,5.0,9737,2001,The Powerpuff Girls: Meet the Beat-Alls
6632,5.0,6633,2003,Foyle's War: Series 1
6175,5.0,6176,1983,Fraggle Rock: Season 1
15890,5.0,15891,1988,Inspector Morse 5: Last Seen Wearing
15632,5.0,15633,2000,The Thin Blue Lie
17234,5.0,17235,2003,Kal Ho Naa Ho: Tomorrow May Never Come
9481,5.0,9482,1992,Inspector Morse 23: The Death of the Self
3850,5.0,3851,1992,Inspector Morse 25: Cherubim & Seraphim
4578,5.0,4579,1977,I Never Promised You a Rose Garden


<span style="color: cornflowerblue">Reducing Data</span>

While the above implementation is very useful, it does not allow us to get any type of metric to evaluate the effectiveness of our recommendation. An alternative implementation would be to use matrix factorization using the SVD algorithm. This algorithm was popularized by Simon Funk during the Netflix Prize competition.$^{[2]}$ This can be accomplished through the surprise library.$^{[3]}$ Before doing this, however, we need to try and reduce the amount of objects in the data as much as possible.  
To do this, we will first remove movies that are probably not very popular in the data - these are any movies which have a low amount of reviews. Since we are looking to recommend movies, we probably dont really care to recommend a movie with low popularity. We will then remove any customers who are not very active in their reviews. If a customer doesnt review often, then they will most likely be less trustworthy with giving reviews.

In [11]:
## Free memory from previous work since we are dealing with large data
del([recommendation, cum_df, df, user_filter])

In [12]:
## Remove movies with too few reviews (unpopular movies)
freq_df = ratings.groupby('Movie_Id')['Rating'].agg(['count'])
freq_df.head(20)
movie_thresh = round(freq_df['count'].quantile(0.7))
drop_movies = freq_df[freq_df['count'] < movie_thresh].index
print(f'Movies to drop: {len(drop_movies)}')

Movies to drop: 12438


In [13]:
## Remove customer with too few reviews (inactive customers)
freq_df = ratings.groupby('CustomerID')['Rating'].agg(['count'])
freq_df.head(20)
customer_thresh = round(freq_df['count'].quantile(0.7))
drop_customers = freq_df[freq_df['count'] < customer_thresh].index
print(f'Customers to drop: {len(drop_customers)}')

Customers to drop: 335809


In [15]:
## Contextual outlier removal: remove outliers in movie reviews
def q1(x):
    return np.quantile(x, .25)

def q3(x):
    return np.quantile(x, .75)

freq_df = ratings.groupby('Movie_Id')['Rating'].agg([iqr, q1, q3])
outlier_df = ratings.merge(right=freq_df,
                               left_on='Movie_Id',
                               right_index=True)
del(freq_df)
outliers = outlier_df[(outlier_df['Rating']<(outlier_df['q1'] - 1.5*outlier_df['iqr'])) |
                      (outlier_df['Rating']>(outlier_df['q3'] + 1.5*outlier_df['iqr']))].index
del(outlier_df)
print(f'Contextual outliers to drop: {len(outliers)}')

Contextual outliers to drop: 2914484


In [16]:
new_ratings = ratings[~ratings['Movie_Id'].isin(drop_movies)]
new_ratings = new_ratings[~new_ratings['CustomerID'].isin(drop_customers)]
new_ratings = new_ratings[~new_ratings.index.isin(outliers)]
new_ratings.shape

(69868264, 4)

In [17]:
print(f'Total removed datapoints: {len(ratings) - len(new_ratings)} ({1-(len(new_ratings)/len(ratings)):.2f}%)')

Total removed datapoints: 30612243 (0.30%)


In [18]:
del(ratings, outliers, drop_movies, drop_customers)

In [19]:
## Create a pivot table
ratings_pivot = pd.pivot_table(new_ratings, index='CustomerID', columns='Movie_Id', values='Rating')
ratings_pivot.shape

(144380, 5332)

In [28]:
reader = Reader()

data = Dataset.load_from_df(new_ratings[['CustomerID', 'Movie_Id', 'Rating']], reader)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

{'test_rmse': array([0.74498367, 0.74513559, 0.74513853, 0.74515964, 0.74485369]),
 'test_mae': array([0.58629553, 0.5865052 , 0.58654417, 0.58652367, 0.58630125]),
 'fit_time': (1243.6732385158539,
  1166.1720881462097,
  971.7347178459167,
  867.3207578659058,
  937.7179448604584),
 'test_time': (497.456750869751,
  245.21426105499268,
  193.13601326942444,
  214.1835584640503,
  210.58476948738098)}

In [11]:
filename = '../data/rating_corr.csv'
ratings = pd.read_csv(filename)
ratings.head()

Unnamed: 0,index,Movie_Id,CustomerID,Rating,Date,corr
0,84178754,15039,1333,3,2005-07-09,1.0
1,38942962,6923,1333,2,2004-03-07,1.0
2,50690952,9160,1333,2,2004-02-08,1.0
3,20913277,3936,1333,1,2005-07-09,1.0
4,95259246,16901,1333,1,2004-02-18,1.0


In [None]:
# ratMat = ratings.pivot_table(values="Rating",index="CustomerID",columns="Movie_Id").fillna(-1)

[1]   
[2] https://sifter.org/~simon/journal/20061211.html  
[3] https://surprise.readthedocs.io/en/stable/index.html
