In [36]:
from datetime import datetime
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Netflix

In [53]:
def formatting_data(file_path):
    """Parse one Netflix Prize ``combined_data_*.txt`` file into rating rows.

    The file is read as a sequence of header lines containing ``:``
    (``<movie_id>:``) followed by rating lines containing ``,``
    (``<user_id>,<rating>,<date>``). Each rating line is attributed to the
    most recently seen header. Lines that contain neither character
    (e.g. blank lines) are ignored.

    Parameters
    ----------
    file_path : str
        Path to the raw text file.

    Returns
    -------
    list of list
        One ``[movie_id, user_id, rating, rating_date]`` entry per rating
        line, in file order. The first three values are ``int``; the date
        is kept as the raw string from the file.

    Raises
    ------
    ValueError
        If a rating line appears before any movie header, or a numeric
        field cannot be converted to ``int``.
    """
    final_list = []
    current_movie_id = None

    # The context manager guarantees the handle is closed, and iterating the
    # file object streams it line by line instead of loading it all at once.
    with open(file_path, 'rt') as f:
        for line in f:
            # Strip the line terminator explicitly rather than slicing off a
            # fixed number of characters: the last line of a file may have
            # no trailing newline, and slicing would eat real data there.
            line = line.strip()
            if ':' in line:
                current_movie_id = int(line[:line.index(':')])
            elif ',' in line:
                if current_movie_id is None:
                    raise ValueError(
                        "rating line found before any movie header in " + str(file_path)
                    )
                tmp = line.split(',')
                final_list.append([current_movie_id, int(tmp[0]), int(tmp[1]), tmp[2]])

    return final_list

Loading all four files with movie ratings:

In [36]:
startTime = datetime.now()

# Parse the four raw rating files in order and accumulate their rows
# into a single list.
data = []
for raw_file in (
    'raw-data/combined_data_1.txt',
    'raw-data/combined_data_2.txt',
    'raw-data/combined_data_3.txt',
    'raw-data/combined_data_4.txt',
):
    data += formatting_data(raw_file)

# Build the ratings frame and persist it as CSV.
netflix_columns = ['movie_id', 'user_id', 'rating', 'rating_date']
df_netflix = pd.DataFrame(data, columns=netflix_columns)
df_netflix.to_csv('netflix-prize-data/netflix_data.csv')

print(len(data))
# The raw row list is no longer needed once the DataFrame exists.
del data
print(datetime.now() - startTime)

100480507
0:30:29.084828


Explore data:

In [39]:
# Summary counts: total rows, then distinct movie and user identifiers.
print("Data info:")
print("Total number of ratings = " + str(len(df_netflix)))
for label, column in (("Unique movies = ", "movie_id"), ("Unique users = ", "user_id")):
    print(label + str(np.unique(df_netflix[column]).size))

Data info:
Total number of ratings = 100480507
Unique movies = 17770
Unique users = 480189


In [40]:
# Count rows that repeat an earlier (movie_id, user_id, rating) combination.
duplicate_count = df_netflix.duplicated(subset=["movie_id", "user_id", "rating"]).sum()
print("Duplicated rows = " + str(duplicate_count))

Duplicated rows = 0


In [41]:
# Per-column count of missing values; printed as a Series after the label.
nan_counts = df_netflix.isna().sum()
print("Number of NaN values = " + str(nan_counts))

Number of NaN values = movie_id       0
user_id        0
rating         0
rating_date    0
dtype: int64


Creating sparse matrix and calculating similarity between movies:

In [42]:
# Build a CSR matrix in coordinate form: rows are indexed by user_id,
# columns by movie_id, and each stored value is the rating.
sparse_data = sparse.csr_matrix(
    (
        df_netflix["rating"],
        (df_netflix["user_id"], df_netflix["movie_id"]),
    )
)

In [43]:
# Display the (rows, columns) dimensions of the user-by-movie rating matrix.
sparse_data.shape

(2649430, 17771)

In [44]:
# Transpose so that each row is a movie, then compute pairwise cosine
# similarity between movies; keep the result sparse.
movie_by_user = sparse_data.transpose()
similarity = cosine_similarity(movie_by_user, dense_output=False)

Creating a dictionary that maps each movie to its 99 most similar movies:

In [45]:
# Every index that appears as a row or column of a non-zero similarity entry.
nonzero_rows_and_cols = np.asarray(similarity.nonzero())
movie_ids = np.unique(nonzero_rows_and_cols)

In [47]:
# Map each movie index to the indices of its 99 most similar movies,
# ordered from most to least similar.
similar_movies_dict = dict()
for movie in movie_ids:
    # Dense 1-D vector of this movie's similarity to every column index.
    scores = similarity[movie].toarray().ravel()
    ranked = np.argsort(-scores)
    # Exclude the movie itself explicitly instead of assuming it is ranked
    # first: with tied scores or floating-point noise another movie can sort
    # ahead of it, in which case slicing from position 1 would drop a real
    # neighbour and keep the movie in its own recommendations.
    rec_movies = ranked[ranked != movie][:99]
    similar_movies_dict[movie] = rec_movies

Storing dictionary in data folder:

In [50]:
# Serialize the recommendation dictionary with the highest pickle protocol
# available to the running interpreter.
with open('data/dict_recommendations.pkl', 'wb') as pkl_file:
    pickle.dump(similar_movies_dict, pkl_file, pickle.HIGHEST_PROTOCOL)

# IMDB

Loading raw data from https://datasets.imdbws.com/:

In [55]:
# Both files are tab-separated; the literal string \N is read as a missing value.
df_imdb_ratings = pd.read_csv(
    'raw-data/title.ratings.tsv',
    sep='\t',
    na_values=['\\N'],
)
df_imdb_titles = pd.read_csv(
    'raw-data/title.basics.tsv',
    sep='\t',
    na_values=['\\N'],
)

Keeping only movies and series:

In [56]:
# Select rows whose titleType is 'movie' or 'tvSeries' and keep only the
# columns used later, in a single .loc indexing step.
df_imdb_titles = df_imdb_titles.loc[
    df_imdb_titles['titleType'].isin(['movie', 'tvSeries']),
    ['tconst', 'titleType', 'primaryTitle', 'startYear', 'genres'],
]

Merging both sets, removing rows with any missing value (e.g. no rating, genre or startYear), and adding a vote-weighted score (`averageRating` × `numVotes`, stored as `weightedAverage`):

In [57]:
# Left join: every selected title is kept, with rating columns attached
# where a matching tconst exists in the ratings table.
df_imdb = pd.merge(df_imdb_titles, df_imdb_ratings, how='left', on='tconst')

In [58]:
# Store titleType as Python strings.
title_type_as_str = df_imdb['titleType'].astype(str)
df_imdb['titleType'] = title_type_as_str

In [59]:
# Drop every row that has a missing value in any column
# (the defaults of dropna, spelled out).
df_imdb = df_imdb.dropna(axis=0, how='any')

In [60]:
# Element-wise product of the average rating and the number of votes.
df_imdb['weightedAverage'] = df_imdb['averageRating'].mul(df_imdb['numVotes'])

Storing the final IMDb dataframe in the `data` directory:

In [62]:
# Write the cleaned IMDb frame to CSV without the row index.
df_imdb.to_csv(path_or_buf='data/imdb_df.csv', index=False)

Storing the set of possible genres:

In [80]:
# Each entry of the 'genres' column is a comma-separated string; split every
# distinct entry and collect the individual genre names into a set.
genres_raw = df_imdb['genres'].unique().tolist()
genres = {
    single_genre
    for combined in genres_raw
    for single_genre in combined.split(',')
}
with open('data/set_genres.pkl', 'wb') as f:
    pickle.dump(genres, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# FILTER BY GENRE AND YEAR AND GET TOP 10
#Loading IMDb dataframe
#df_imdb = pd.read_csv('data/imdb_df.csv', sep = ',', header = 0)
#Loading possible genres, not used in this cell
#with open('data/set_genres.pkl', 'rb') as f:
#    genres = pickle.load(f)

#genre = 'Drama'
#selected_year = 2020
#selected_type = 'movie'
#df_filtered = df_imdb[(df_imdb['genres'].str.contains(genre)) & (df_imdb['startYear'] == selected_year) & (df_imdb['titleType'] == selected_type)]
#df_filtered.nlargest(10,'weightedAverage')

In [None]:
#HOW TO RECOMMEND TOP 10 MOVIES
#Loading movie titles
#movie_titles = pd.read_csv('data/movie_titles.csv', sep = ',', names = ['movie_id', 'year_released', 'title'], index_col = 'movie_id', encoding = 'iso8859_2')

#Loading recommendation dictionary
#dict_rec = {}
#with open('data/dict_recommendations.pkl', 'rb') as f:
#    dict_rec = pickle.load(f)

#base_movie_id = 28

#movie_titles.loc[dict_rec[base_movie_id][0][:10]] #I changed it so I access the first array for that key

Some movie IDs to try the recommendation:  
* 28: Lilo and Stitch  
* 121: Beyonce  
* 299: Bridget Jones's Diary  
* 316: Futurama  
* 409: Godzilla  
* 607: Speed  
* 621: Armageddon  
* 1542: Sleepless in Seattle  
* 2660: When Harry Met Sally  
* 6287: Pretty Woman  
* 6797: The Breakfast Club  
* 11283: Forrest Gump  
* 13763: Jerry Maguire  
* 14928: Dead Poets Society  
* 16879: Titanic  