In [9]:
import numpy as np
import pandas as pd
from itertools import product

from get_movie_features import movie_feature
from evaluate_model import evaluate

from sklearn.model_selection import train_test_split

import warnings; warnings.simplefilter('ignore')


# Load data

In [8]:
movies_metadata = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\movies_metadata.csv'
links_small = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\links_small.csv'
credits_ = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\credits.csv'
keywords = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\keywords.csv'
rating_path = r'E:\School\DE_AN\Movie-Recommendation-System\src\data\ratings_small.csv'

movie_df = movie_feature(movies_metadata, links_small, credits_, keywords)
rating_df = pd.read_csv(rating_path)

In [10]:
train_df, test_df = train_test_split(rating_df, test_size=0.25, \
                                     stratify=rating_df['userId'], random_state=42)

# Preprocess

In [54]:
# drop duplicate

print(movie_df.duplicated().sum())
movie_df.drop_duplicates(inplace=True)
print(movie_df.duplicated().sum())

128
0


In [55]:
# select relevant columns

cols = ['title', 'movieId', 'popularity', 'wr', 'year']
movie_df = movie_df[cols]
movie_df.head(2)

Unnamed: 0,title,movieId,popularity,wr,year
0,Toy Story,1,21.946943,6.86977,1995
1,Jumanji,2,17.015539,5.884891,1995


# Baseline model 1
**Popularity**: Movies which have higher popularity

**Weighted Rate**: Movies which have higher weighted rate

**Production Year**: Movies which were released recently

In [31]:
# sort movies by release_year, popularity and weighted rate

movie_df_sorted = movie_df.sort_values(by=['year', 'popularity', 'wr'], ascending=False)
movie_df_sorted.head(10)

Unnamed: 0,title,movieId,popularity,wr,year
8871,Deadpool,122904,187.860492,6.935872,2016
8872,Captain America: Civil War,122920,145.882135,6.903532,2016
9200,The Legend of Tarzan,160563,45.38298,5.036374,2016
9004,Suicide Squad,135536,42.965027,5.01304,2016
9190,Now You See Me 2,159093,39.540653,5.913668,2016
9154,Me Before You,152017,34.34759,6.754918,2016
9155,Me Before You,152017,34.34759,6.754918,2016
9024,Batman v Superman: Dawn of Justice,136864,31.435879,5.013943,2016
8873,X-Men: Apocalypse,122924,28.712522,5.937756,2016
9164,Zootopia,152081,26.024868,6.858811,2016


In [29]:
# get top 10 and recommend to all users in the test_df
top_10_movie_id_list = movie_df_sorted['movieId'].head(10).tolist()
user_list = test_df.userId.unique().tolist()


In [30]:
# combine user_list and predict_movie_list
combinations = list(product(user_list, top_10_movie_id_list))
pred_df = pd.DataFrame(data=combinations, columns=['userId', 'movieId'])
pred_df

Unnamed: 0,userId,movieId
0,302,122904
1,302,122920
2,302,160563
3,302,135536
4,302,159093
...,...,...
6705,498,152017
6706,498,152017
6707,498,136864
6708,498,122924


In [32]:
evaluate(pred_df=pred_df, val_df=test_df, top_k=10)
# output: 0.0017883755588673624

0.0017883755588673624

# Baseline model 2
**Popularity**: Get movies which have higher popularity

**Rating**: Get movie which have higher average rating among user in the train_df

In [69]:
# get average rating of each movies 

movie_avg_rating = train_df.groupby('movieId')['rating'].mean().reset_index()

In [70]:
# merge movie_avg_rating and movie_df
base_line_2_df = pd.merge(movie_df, movie_avg_rating, on='movieId', how='inner')
# get top 10 by popularity and average rating
top_10_movie_id_list = base_line_2_df.sort_values(by=['popularity', 'rating'], \
                                                  ascending=False)['movieId']\
                                                    .head(10).tolist()

In [71]:
# combine user_list and predict_movie_list
combinations = list(product(user_list, top_10_movie_id_list))
pred_df = pd.DataFrame(data=combinations, columns=['userId', 'movieId'])
pred_df

Unnamed: 0,userId,movieId
0,302,135887
1,302,115617
2,302,122904
3,302,72998
4,302,115149
...,...,...
6705,498,112556
6706,498,116823
6707,498,122920
6708,498,296


In [72]:
evaluate(pred_df=pred_df, val_df=test_df, top_k=10)
# output: 0.02280178837555887

0.02280178837555887

# Conclusion 1
- Baseline model (2) which recommend based on popularity and average rating of movie has higher proportion of movies that user actually watched (2.3%) than Baseline model (1) with proportion of 0.18%
- Now we try to improve baseline model (2) with release_year of movie. Let's see whether the evaluation metric improve or not

# Baseline 3
**Popularity**

**Rating**

**Year**

In [73]:
base_line_2_df.head()

Unnamed: 0,title,movieId,popularity,wr,year,rating
0,Toy Story,1,21.946943,6.86977,1995,3.845946
1,Jumanji,2,17.015539,5.884891,1995,3.423077
2,Grumpier Old Men,3,11.7129,5.376968,1995,3.097826
3,Waiting to Exhale,4,3.859495,5.299755,1995,2.555556
4,Father of the Bride Part II,5,8.387519,5.175099,1995,3.294872


In [74]:
# sort by popularity, rating, release_year
top_10_movie_id_list = base_line_2_df.sort_values(by=['popularity', 'rating', 'year'], \
                                                  ascending=False)['movieId']\
                                                    .head(10).tolist()
# combine user_list and predict_movie_list
combinations = list(product(user_list, top_10_movie_id_list))
pred_df = pd.DataFrame(data=combinations, columns=['userId', 'movieId'])
pred_df

Unnamed: 0,userId,movieId
0,302,135887
1,302,115617
2,302,122904
3,302,72998
4,302,115149
...,...,...
6705,498,112556
6706,498,116823
6707,498,122920
6708,498,296


In [75]:
evaluate(pred_df=pred_df, val_df=test_df, top_k=10)

0.02280178837555887

# Conclusion 2
The result after adding release_year doesn't improve. Hence we keep model (2) as our baseline model