# 1. Imports

In [40]:
import pandas           as pd
import pickle
from scipy.sparse       import csr_matrix 
from sklearn.neighbors  import NearestNeighbors

# 2. Load Data

In [3]:
# Read the two raw CSV files from the local `dataset/` folder.
# low_memory=False is passed for the metadata file only; presumably this avoids
# chunked dtype inference on mixed-type columns — TODO confirm.
movies_raw = pd.read_csv('dataset/movies_metadata.csv', low_memory=False)
ratings_raw = pd.read_csv('dataset/ratings.csv')

In [4]:
# Preview the first three rows of the raw movie metadata.
movies_raw.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [5]:
# Preview the first rows of the raw ratings table.
ratings_raw.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


# 3. Data transforming

## 3.1 Movies Dataset

### 3.1.1 Selecting columns and renaming variables

In [6]:
# Keep only the four metadata columns used in this notebook and rename them.
# The mapping's key order doubles as the column selection order.
movie_column_names = {
    'id': 'movie_id',
    'original_title': 'title',
    'original_language': 'language',
    'vote_count': 'num_reviews',
}

# Column selection and rename both return new frames, so movies_raw is untouched.
movies = movies_raw[list(movie_column_names)].rename(columns=movie_column_names)

movies.head()

Unnamed: 0,movie_id,title,language,num_reviews
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


### 3.1.2 Checking NAs

In [7]:
# (rows, columns) of the trimmed movies frame.
movies.shape

(45466, 4)

In [8]:
# Number of missing values in each column.
movies.isnull().sum()

movie_id        0
title           0
language       11
num_reviews     6
dtype: int64

In [9]:
# Drop every row that has a missing value; the affected rows are too few to
# be relevant for this dataset.
movies = movies.dropna()

### 3.1.3 Ratings per movie

In [10]:
# Business rule: keep only English-language movies that received at least
# 1000 reviews.

has_enough_reviews = movies['num_reviews'].gt(999)
is_english = movies['language'].eq('en')
movies = movies[has_enough_reviews & is_english]

In [11]:
# Inspect dtypes and non-null counts after applying the business rule.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_id     1100 non-null   object 
 1   title        1100 non-null   object 
 2   language     1100 non-null   object 
 3   num_reviews  1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB


In [12]:
# movie_id is still an object column at this point; cast it to integers.
movies = movies.astype({'movie_id': int})

In [13]:
# Confirm that movie_id is now an integer column.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_id     1100 non-null   int64  
 1   title        1100 non-null   object 
 2   language     1100 non-null   object 
 3   num_reviews  1100 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 43.0+ KB


## 3.2 Ratings Dataset

### 3.2.1 Selecting columns and renaming variables

In [14]:
# Keep the three rating columns used in this notebook and switch them to
# snake_case names. Both steps return new frames, so ratings_raw is untouched.
ratings = (
    ratings_raw
    .loc[:, ['userId', 'movieId', 'rating']]
    .rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


### 3.2.2 Checking NAs

In [15]:
# (rows, columns) of the trimmed ratings frame.
ratings.shape

(26024289, 3)

In [16]:
# Number of missing values in each ratings column.
ratings.isna().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

### 3.2.3 Ratings per user

In [17]:
# Number of ratings submitted by each user, most active users first.
ratings['user_id'].value_counts()

user_id
45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
141473        1
196897        1
111231        1
193655        1
193683        1
Name: count, Length: 270896, dtype: int64

In [18]:
# Business rule: keep only users who have submitted at least 500 reviews.

reviews_per_user = ratings['user_id'].value_counts()
rating = reviews_per_user >= 500   # boolean mask indexed by user_id
y = rating.loc[rating].index       # ids of the users that satisfy the rule
y.shape

(9516,)

In [19]:
# Restrict the ratings to the users selected by the business rule (`y`).
is_selected_user = ratings['user_id'].isin(y)
ratings = ratings.loc[is_selected_user]

In [20]:
# (rows, columns) left after keeping only the selected users.
ratings.shape

(8575119, 3)

In [21]:
# Preview the filtered ratings.
ratings.head()

Unnamed: 0,user_id,movie_id,rating
1710,24,1,4.0
1711,24,2,3.0
1712,24,6,4.0
1713,24,16,3.0
1714,24,17,3.0


In [22]:
# Inspect dtypes and memory usage of the filtered ratings.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8575119 entries, 1710 to 26023521
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   user_id   int64  
 1   movie_id  int64  
 2   rating    float64
dtypes: float64(1), int64(2)
memory usage: 261.7 MB


# 4. Merging Dataframes

In [23]:
# Join ratings with the filtered movies on movie_id (default inner join), so
# only ratings whose movie_id also appears in `movies` survive.
# NOTE(review): `movies.movie_id` was taken from the metadata column `id`,
# while `ratings.movie_id` was taken from `movieId`. Confirm that both files
# use the same identifier scheme; if they do not, titles are attached to the
# wrong ratings and an id-mapping table has to be joined in first.
movies_and_ratings = pd.merge(ratings, movies, on='movie_id')
movies_and_ratings.head()

Unnamed: 0,user_id,movie_id,rating,title,language,num_reviews
0,24,58,5.0,Pirates of the Caribbean: Dead Man's Chest,en,5380.0
1,24,62,2.0,2001: A Space Odyssey,en,3075.0
2,24,73,3.0,American History X,en,3120.0
3,24,111,5.0,Scarface,en,3017.0
4,24,162,4.0,Edward Scissorhands,en,3731.0


In [24]:
# (rows, columns) of the merged frame.
movies_and_ratings.shape

(484955, 6)

# 5. New Data transforming

In [25]:
# Keep a single row for each (user_id, movie_id) pair.
movies_and_ratings = movies_and_ratings.drop_duplicates(subset=['user_id', 'movie_id'])

In [26]:
# (rows, columns) after removing duplicate reviews.
movies_and_ratings.shape

(484955, 6)

In [27]:
# The movie_id column is not needed anymore.
movies_and_ratings = movies_and_ratings.drop(columns='movie_id')
movies_and_ratings.head()

Unnamed: 0,user_id,rating,title,language,num_reviews
0,24,5.0,Pirates of the Caribbean: Dead Man's Chest,en,5380.0
1,24,2.0,2001: A Space Odyssey,en,3075.0
2,24,3.0,American History X,en,3120.0
3,24,5.0,Scarface,en,3017.0
4,24,4.0,Edward Scissorhands,en,3731.0


In [28]:
# Reshape to a title × user table: one row per movie title, one column per
# user_id, each cell holding that user's rating for the movie. Pairs with no
# rating are left as NaN; pivot_table's default aggregation applies when a
# title/user pair occurs more than once.

movie_pivot = pd.pivot_table(
    movies_and_ratings,
    values='rating',
    index='title',
    columns='user_id',
)

movie_pivot.head()

user_id,24,46,120,132,150,229,231,251,332,340,...,270555,270564,270579,270631,270634,270654,270684,270734,270769,270887
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,,,,,,,,,,,...,,,,,,,,,,
12 Angry Men,,,,,,,,,,,...,,,,,,3.5,,,,
127 Hours,,,,,,,,,,,...,,,,,,,,,,
1408,,,,,,,,,,,...,,,,,,,,,,
2 Fast 2 Furious,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Replace the missing (user, title) ratings with zero.
movie_pivot = movie_pivot.fillna(0)
movie_pivot.head()

user_id,24,46,120,132,150,229,231,251,332,340,...,270555,270564,270579,270631,270634,270654,270684,270734,270769,270887
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 6. Machine Learning Model

In [30]:
# Convert the pivot table's values into a compressed sparse row matrix.
movie_sparse = csr_matrix(movie_pivot.to_numpy())

In [31]:
# Training: fit a brute-force nearest-neighbours model on the sparse matrix
# (one sample per movie title).
model = NearestNeighbors(algorithm='brute').fit(movie_sparse)
model

## 6.1 Movies Predictions

In [32]:
# Movie: 127 hours

# Pull the title's rating row out of the pivot as a (1, n_users) array and ask
# the fitted model for its nearest neighbours.
query_row = movie_pivot.filter(items=['127 Hours'], axis=0)
query_vector = query_row.to_numpy().reshape(1, -1)
distances, sugestions = model.kneighbors(query_vector)

# `sugestions` holds one array of pivot row positions per query row; map each
# back to movie titles.
for neighbor_positions in sugestions:
    print(movie_pivot.index[neighbor_positions])

Index(['127 Hours', 'American Hustle', 'The Expendables 2', 'Lord of War',
       'RED 2'],
      dtype='object', name='title')


found 0 physical cores < 1
  File "c:\Users\Felipe\Documents\Felipe\Cursos\CientistaDados\recommendation_project\recommendation_venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [33]:
# Movie: Toy Story

# Pull the title's rating row out of the pivot as a (1, n_users) array and ask
# the fitted model for its nearest neighbours.
query_row = movie_pivot.filter(items=['Toy Story'], axis=0)
query_vector = query_row.to_numpy().reshape(1, -1)
distances, sugestions = model.kneighbors(query_vector)

# `sugestions` holds one array of pivot row positions per query row; map each
# back to movie titles.
for neighbor_positions in sugestions:
    print(movie_pivot.index[neighbor_positions])

Index(['Toy Story', 'Austin Powers: International Man of Mystery',
       'Harry Potter and the Chamber of Secrets', 'Black Hawk Down', 'Bambi'],
      dtype='object', name='title')


In [34]:
# Movie: 2 Fast 2 Furious

# Pull the title's rating row out of the pivot as a (1, n_users) array and ask
# the fitted model for its nearest neighbours.
query_row = movie_pivot.filter(items=['2 Fast 2 Furious'], axis=0)
query_vector = query_row.to_numpy().reshape(1, -1)
distances, sugestions = model.kneighbors(query_vector)

# `sugestions` holds one array of pivot row positions per query row; map each
# back to movie titles.
for neighbor_positions in sugestions:
    print(movie_pivot.index[neighbor_positions])

Index(['2 Fast 2 Furious', 'Bambi', 'The Matrix Reloaded', 'Lord of War',
       'RED 2'],
      dtype='object', name='title')


In [37]:
# Persist the fitted model, the title × user pivot table and the filtered
# movies frame as pickle files.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises; the previous bare `open(...)`
# calls never closed their handles explicitly.
artifacts = {
    "parameter/model.pkl": model,
    "parameter/movie_pivot.pkl": movie_pivot,
    "parameter/movies.pkl": movies,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)