In [43]:
import pandas as pd
import numpy as np
import ast

## Preprocess CSV files

In [2]:
# Load the ratings table; the preview that follows shows UserId, MovieId and Rating columns.
df = pd.read_csv('ratings.csv')

In [3]:
df.head()

Unnamed: 0,UserId,MovieId,Rating
0,2,1,3.5
1,3,1,4.0
2,4,1,3.0
3,5,1,4.0
4,8,1,4.0


In [4]:
# Rating-count cut-off shared by both filters. Note the asymmetry: a movie is
# kept with at least `threshold` ratings (>=), a user only with strictly more (>).
threshold = 1500

movie_counts = df['MovieId'].value_counts()
filter_movies = movie_counts[movie_counts >= threshold].index.tolist()

user_counts = df['UserId'].value_counts()
filter_users = user_counts[user_counts > threshold].index.tolist()

# A rating row survives only when both its movie and its user passed the filter.
keep_movie = df['MovieId'].isin(filter_movies)
keep_user = df['UserId'].isin(filter_users)
df_new = df[keep_movie & keep_user]

# Before/after summary of the reduction.
summary = [
    ('The original number of movies:\t{}', len(df['MovieId'].unique())),
    ('The new number of movies:\t{}', len(df_new['MovieId'].unique())),
    ('The original number of users:\t{}', len(df['UserId'].unique())),
    ('The new number of users:\t{}', len(df_new['UserId'].unique())),
    ('The original data frame shape:\t{}', df.shape),
    ('The new data frame shape:\t{}', df_new.shape),
]
for template, value in summary:
    print(template.format(value))

The original number of movies:	14196
The new number of movies:	2825
The original number of users:	120620
The new number of users:	758
The original data frame shape:	(23019525, 3)
The new data frame shape:	(982568, 3)


In [5]:
# Movie metadata (title, genres, synopsis, mean rating, poster URL) ...
movies = pd.read_csv('movies_affiche_clean_final.csv')
# ... and a second table with the same movie columns minus RatingMean
# (presumably the one backing the SQL database - confirm).
sql = pd.read_csv('db_sql.csv')

In [8]:
# Remove the stray 'Unnamed: 0' column (presumably an index saved into the CSV).
# Reassigning with errors='ignore' instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError once the column is gone.
movies = movies.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
movies.head()

Unnamed: 0,MovieId,Title,Genres,Year,Synopsis,RatingMean,Affiche
0,1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy'],1995,A cowboy doll is profoundly threatened and jea...,3.887861,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,['Adventure|Children|Fantasy'],1995,Four teenagers are sucked into a magical video...,3.240917,https://m.media-amazon.com/images/M/MV5BMmM1ZT...
2,3,Grumpier Old Men,['Comedy|Romance'],1995,John and Max resolve to save their beloved bai...,3.107356,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,['Comedy|Drama|Romance'],1995,"Based on Terry McMillan's novel, this film fol...",2.831447,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,['Comedy'],1995,George Banks must deal not only with his daugh...,3.027098,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [14]:
# Capitalize the first character of every column name, leaving the rest untouched.
sql.rename(columns=lambda name: name[0].capitalize() + name[1:], inplace=True)

In [15]:
sql.head()

Unnamed: 0,MovieId,Title,Genres,Year,Synopsis,Affiche
0,1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy'],1995,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,['Adventure|Children|Fantasy'],1995,Four teenagers are sucked into a magical video...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
2,3,Grumpier Old Men,['Comedy|Romance'],1995,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
3,4,Waiting to Exhale,['Comedy|Drama|Romance'],1995,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BMDU2ZW...
4,5,Father of the Bride Part II,['Comedy'],1995,George Banks must deal not only with his daugh...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...


In [16]:
# Distinct movie ids left after filtering, in order of first appearance in df_new.
movie_idx = [*df_new['MovieId'].unique()]

In [19]:
# Restrict the movie metadata to the filtered ids, following movie_idx order.
movies_by_id = movies.set_index('MovieId')
movies_new = movies_by_id.loc[movie_idx].reset_index()

In [21]:
# Same restriction for the sql table: keep the filtered ids, in movie_idx order.
sql_by_id = sql.set_index('MovieId')
sql_new = sql_by_id.loc[movie_idx].reset_index()

In [22]:
sql_new

Unnamed: 0,MovieId,Title,Genres,Year,Synopsis,Affiche
0,1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy'],1995,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,['Adventure|Children|Fantasy'],1995,Four teenagers are sucked into a magical video...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
2,3,Grumpier Old Men,['Comedy|Romance'],1995,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
3,4,Waiting to Exhale,['Comedy|Drama|Romance'],1995,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BMDU2ZW...
4,5,Father of the Bride Part II,['Comedy'],1995,George Banks must deal not only with his daugh...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
...,...,...,...,...,...,...
2820,188301,Ant-Man and the Wasp,['Action|Adventure|Comedy|Fantasy|Sci-Fi'],2018,Scott Lang and Hope Van Dyne are dragged into ...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
2821,189333,Mission: Impossible - Fallout,['Action|Adventure|Thriller'],2018,"Ethan Hunt and his IMF team, along with some f...",https://m.media-amazon.com/images/M/MV5BMDU2ZW...
2822,192803,Bohemian Rhapsody,['Drama'],2018,The story of the legendary British rock band Q...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
2823,194448,Green Book,['Comedy|Drama'],2018,A working-class Italian-American bouncer becom...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...


In [28]:
# Mean rating per movie over the filtered ratings; one 'Rating' column indexed by MovieId.
rating_mean = df_new.groupby('MovieId')['Rating'].mean().to_frame()

In [33]:
# Attach the recomputed means by MovieId rather than by row position:
# rating_mean is ordered by the groupby key while movies_new follows movie_idx,
# so a positional list assignment is only right when both orders happen to match.
movies_new['RatingMean'] = movies_new['MovieId'].map(rating_mean['Rating'])

In [34]:
movies_new

Unnamed: 0,MovieId,Title,Genres,Year,Synopsis,RatingMean,Affiche
0,1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy'],1995,A cowboy doll is profoundly threatened and jea...,3.947592,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,['Adventure|Children|Fantasy'],1995,Four teenagers are sucked into a magical video...,3.025381,https://m.media-amazon.com/images/M/MV5BMmM1ZT...
2,3,Grumpier Old Men,['Comedy|Romance'],1995,John and Max resolve to save their beloved bai...,2.718663,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,['Comedy|Drama|Romance'],1995,"Based on Terry McMillan's novel, this film fol...",2.391129,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,['Comedy'],1995,George Banks must deal not only with his daugh...,2.599125,https://m.media-amazon.com/images/M/MV5BOTEyNz...
...,...,...,...,...,...,...,...
2820,188301,Ant-Man and the Wasp,['Action|Adventure|Comedy|Fantasy|Sci-Fi'],2018,Scott Lang and Hope Van Dyne are dragged into ...,3.327381,https://i.ibb.co/svCHzsj/image-non-disponible.png
2821,189333,Mission: Impossible - Fallout,['Action|Adventure|Thriller'],2018,"Ethan Hunt and his IMF team, along with some f...",3.534591,https://i.ibb.co/svCHzsj/image-non-disponible.png
2822,192803,Bohemian Rhapsody,['Drama'],2018,The story of the legendary British rock band Q...,3.509740,https://i.ibb.co/svCHzsj/image-non-disponible.png
2823,194448,Green Book,['Comedy|Drama'],2018,A working-class Italian-American bouncer becom...,3.556000,https://i.ibb.co/svCHzsj/image-non-disponible.png


In [35]:
# Dimensions of the three frames that are exported next.
for frame in (df_new, movies_new, sql_new):
    print(frame.shape)

(982568, 3)
(2825, 7)
(2825, 6)


In [36]:
# Export the filtered ratings; index=False keeps the row index out of the file.
df_new.to_csv('ratings_final.csv', index=False)

In [37]:
# Export the filtered movie metadata (RatingMean included) without the row index.
movies_new.to_csv('movies_affiche_final.csv', index=False)

In [38]:
# Export the filtered sql table without the row index.
sql_new.to_csv('db_sql_final.csv', index=False)

## Preprocess Synopsis to enhance NLP recommendation

In [61]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [62]:
# Corpora/models used by stopwords.words() and word_tokenize().
# 'punkt' serves older NLTK releases; recent ones (3.9+) load the tokenizer
# from 'punkt_tab' and word_tokenize() raises LookupError when it is missing.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [86]:
# Reload the movie table and discard the 'Unnamed: 0' column in a single expression.
movies = pd.read_csv('movies_affiche_clean_final.csv').drop(columns=['Unnamed: 0'])

In [88]:
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = set(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a piece of text for the NLP recommender.

    The text is tokenized with ``word_tokenize``; a token is kept, lower-cased,
    only when it is purely alphanumeric and not an English stop word.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. a missing value) yields ``''``
        instead of an error from the tokenizer.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # The stop-word list is the same for every call, so it is loaded once
    # instead of once per row.
    stop_words = _english_stop_words()

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # NOTE(review): str.isalnum() also rejects any token that contains a
        # hyphen or apostrophe, not just punctuation - confirm this is intended.
        if token.isalnum() and lowered not in stop_words:
            kept.append(lowered)

    return ' '.join(kept)

In [89]:
def convert_to_list(genre_string):
    """Turn a stored genre value into a flat list of genre names.

    Parameters
    ----------
    genre_string : str or list
        Usually the textual form of a one-element list whose element holds
        pipe-separated genres, e.g. ``"['Comedy|Drama']"``. A value that is
        already a list is accepted too, so applying the function twice (for
        example when the cell is re-run) is harmless.

    Returns
    -------
    list of str
        One entry per genre; an empty list when the stored list is empty.
        Every element of the stored list is split on ``'|'``, not only the first.
    """
    if isinstance(genre_string, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        parsed = ast.literal_eval(genre_string)
    else:
        parsed = genre_string

    # A bare quoted string is treated as a single entry.
    if isinstance(parsed, str):
        parsed = [parsed]

    return [genre for entry in parsed for genre in entry.split('|')]

In [90]:
def create_new_synopsis(row):
    """Build a synopsis that starts with a sentence naming the movie's genres.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Genres']`` (an iterable of genre names) and
        ``row['Synopsis']`` (the original synopsis, possibly missing).

    Returns
    -------
    str
        ``"This is a <genres> movie. <synopsis>"``, or only the genre sentence
        when the synopsis is missing.
    """
    genres_str = ', '.join(row['Genres'])
    genre_sentence = f"This is a {genres_str} movie."

    synopsis = row['Synopsis']
    # A missing synopsis would otherwise be formatted as the literal text
    # "nan"/"None" and end up as a token in the processed text.
    if pd.isna(synopsis):
        return genre_sentence

    return f"{genre_sentence} {synopsis}"

In [91]:
# Replace each stored genre string with a Python list holding one genre per element.
movies['Genres'] = movies['Genres'].apply(convert_to_list)

In [92]:
# Prefix every synopsis with a sentence listing the movie's genres.
# Caveat: Synopsis is overwritten, so re-running this cell prepends the sentence again.
movies['Synopsis'] = movies.apply(create_new_synopsis, axis=1)

In [93]:
# Normalise the enriched synopsis: lower-case, no stop words, alphanumeric tokens only.
movies['Synopsis'] = movies['Synopsis'].apply(preprocess_text)

In [94]:
movies

Unnamed: 0,MovieId,Title,Genres,Year,Synopsis,RatingMean,Affiche
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,adventure animation children comedy fantasy mo...,3.887861,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,adventure children fantasy movie four teenager...,3.240917,https://m.media-amazon.com/images/M/MV5BMmM1ZT...
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,comedy romance movie john max resolve save bel...,3.107356,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,comedy drama romance movie based terry mcmilla...,2.831447,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,[Comedy],1995,comedy movie george banks must deal daughter p...,3.027098,https://m.media-amazon.com/images/M/MV5BOTEyNz...
...,...,...,...,...,...,...,...
14191,206208,Bill Burr: Paper Tiger,[Comedy],2019,comedy movie bill burr unloads outrage culture...,3.648649,https://i.ibb.co/svCHzsj/image-non-disponible.png
14192,206499,Between Two Ferns: The Movie,[Comedy],2019,comedy movie zach galifianakis oddball crew ta...,3.056180,https://i.ibb.co/svCHzsj/image-non-disponible.png
14193,206845,The Laundromat,"[Comedy, Crime, Drama]",2019,comedy crime drama movie widow investigates in...,3.231707,https://i.ibb.co/svCHzsj/image-non-disponible.png
14194,207405,Doctor Sleep,[Horror],2019,horror movie years following events shining 19...,3.730769,https://i.ibb.co/svCHzsj/image-non-disponible.png


In [95]:
# Persist the NLP-ready table. Unlike the earlier exports, index=False is not
# passed, so the row index is written as an extra leading column. Genres holds
# lists at this point and is written as their string representation.
movies.to_csv('movies_nlp.csv')