<a href="https://colab.research.google.com/github/ernselito/Recommendation_System/blob/main/Recommendation_Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.dataset import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise import SVD


import warnings; warnings.simplefilter('ignore')

In [82]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [83]:
print('Data source import complete.')
data_path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")
md = pd.read_csv(data_path + '/movies_metadata.csv')

Data source import complete.


In [84]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [85]:
def safe_literal_eval_and_extract_names(x):
    try:
        # First, try to evaluate the string if it's a string
        if isinstance(x, str):
            evaluated_x = literal_eval(x)
        else:
            # If not a string, assume it's already in a list format or is NaN/None
            evaluated_x = x

        # Ensure the evaluated result is a list before extracting names
        if isinstance(evaluated_x, list):
            return [i['name'] for i in evaluated_x if isinstance(i, dict) and 'name' in i]
        else:
            return [] # Return empty list for non-list entries

    except (ValueError, SyntaxError):
        # Return empty list for malformed strings that literal_eval cannot handle
        return []

md['genres'] = md['genres'].fillna('[]').apply(safe_literal_eval_and_extract_names)

In [86]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [87]:
m = vote_counts.quantile(0.95)
m

434.0

In [90]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if pd.notna(x) else np.nan)
md['year'] = pd.to_numeric(md['year'], errors='coerce')

In [91]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [92]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [93]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [94]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [95]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010.0,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008.0,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014.0,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999.0,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001.0,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994.0,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994.0,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003.0,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994.0,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002.0,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [96]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [97]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)

    return qualified

In [98]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995.0,661,9,34.457024,8.565285
351,Forrest Gump,1994.0,8147,8,48.307194,7.971357
876,Vertigo,1958.0,1162,8,18.20822,7.811667
40251,Your Name.,2016.0,1030,8,34.461252,7.789489
883,Some Like It Hot,1959.0,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988.0,834,8,14.177005,7.744878
19901,Paperman,2012.0,734,8,7.198633,7.713951
37863,Sing Street,2016.0,669,8,10.672862,7.689483
882,The Apartment,1960.0,498,8,11.994281,7.599317
38718,The Handmaiden,2016.0,453,8,16.727405,7.566166


In [99]:
links_small = pd.read_csv(data_path + '/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [100]:
md = md.drop([19730, 29503, 35587])
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [101]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [102]:
smd.loc[:, 'tagline'] = smd['tagline'].fillna('')
smd.loc[:, 'description'] = smd['overview'] + smd['tagline']
smd.loc[:, 'description'] = smd['description'].fillna('')


In [103]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [104]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

(9099, 268124)

In [105]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [106]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [107]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [108]:
get_recommendations('The Godfather').head(10)

Unnamed: 0,title
973,The Godfather: Part II
8387,The Family
3509,Made
4196,Johnny Dangerously
29,Shanghai Triad
5667,Fury
2412,American Movie
1582,The Godfather: Part III
4221,8 Women
2159,Summer of Sam


In [109]:
get_recommendations('The Dark Knight').head(10)

Unnamed: 0,title
7931,The Dark Knight Rises
132,Batman Forever
1113,Batman Returns
8227,"Batman: The Dark Knight Returns, Part 2"
7565,Batman: Under the Red Hood
524,Batman
7901,Batman: Year One
2579,Batman: Mask of the Phantasm
2696,JFK
8165,"Batman: The Dark Knight Returns, Part 1"


In [110]:
credits, keywords = pd.read_csv(data_path + '/credits.csv'), pd.read_csv(data_path + '/keywords.csv')

In [111]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [112]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [113]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [114]:
smd = smd.copy() # Work on a copy to avoid SettingWithCopyWarning
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [115]:
def get_director(x):
    for i in x:
        if isinstance(i, dict) and i.get('job') == 'Director':
            return i.get('name')
    return np.nan

In [116]:
smd['director'] = smd['crew'].apply(lambda x: get_director(x) if isinstance(x, list) else np.nan)

In [117]:
smd['cast'] = smd['cast'].apply(lambda x: [i.get('name') for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [118]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i.get('name') for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(str(i).replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [119]:

s = s.value_counts()
s[:5]

s = s[s > 1]

In [120]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [121]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [122]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [123]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [124]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [125]:
get_recommendations('The Dark Knight').head(10)

Unnamed: 0,title
7991,The Dark Knight Rises
6186,Batman Begins
6587,The Prestige
2077,Following
4125,Insomnia
3373,Memento
8573,Interstellar
7608,Inception
5907,Thursday
8899,Kidnapping Mr. Heineken


In [126]:
get_recommendations('Mean Girls').head(10)

Unnamed: 0,title
4735,Freaky Friday
1321,The House of Yes
3311,Head Over Heels
7865,Mr. Popper's Penguins
6245,Just Like Heaven
7292,Ghosts of Girlfriends Past
6923,The Spiderwick Chronicles
3170,The Ladies Man
5124,Confessions of a Teenage Drama Queen
4406,The Hot Chick


In [127]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [128]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7608,Inception,14075,8,2010.0,7.917588
8573,Interstellar,11187,8,2014.0,7.897107
6587,The Prestige,4510,8,2006.0,7.758148
3373,Memento,4168,8,2000.0,7.740175
7991,The Dark Knight Rises,9263,7,2012.0,6.921448
6186,Batman Begins,7511,7,2005.0,6.904127
1524,The French Connection,435,7,1971.0,6.123458
2978,Gone in Sixty Seconds,1511,6,2000.0,5.831509
4125,Insomnia,1181,6,2002.0,5.797081
149,Hackers,406,6,1995.0,5.609863


In [129]:
improved_recommendations('Mean Girls')

Unnamed: 0,title,vote_count,vote_average,year,wr
8433,About Time,2140,7,2013.0,6.704073
7843,Midnight in Paris,2049,7,2011.0,6.693228
4735,Freaky Friday,919,6,2003.0,5.757786
1629,The Parent Trap,768,6,1998.0,5.727359
6245,Just Like Heaven,595,6,2005.0,5.681521
6923,The Spiderwick Chronicles,593,6,2008.0,5.680901
6189,Herbie Fully Loaded,559,5,2005.0,5.107034
8969,Aloha,704,5,2015.0,5.093396
7292,Ghosts of Girlfriends Past,716,5,2009.0,5.092422
7865,Mr. Popper's Penguins,775,5,2011.0,5.087912


## Collaborative

In [130]:
reader = Reader()

In [131]:
ratings = pd.read_csv(data_path + '/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [132]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
display(data)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

<surprise.dataset.DatasetAutoFolds at 0x7e75c17df6b0>

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9076  0.8979  0.8930  0.8952  0.8941  0.8976  0.0053  
MAE (testset)     0.6974  0.6911  0.6870  0.6903  0.6877  0.6907  0.0037  
Fit time          3.41    3.19    3.06    2.22    2.09    2.79    0.53    
Test time         0.76    0.21    0.12    0.19    0.12    0.28    0.24    


{'test_rmse': array([0.90757818, 0.8978785 , 0.89304465, 0.89522537, 0.89412798]),
 'test_mae': array([0.69737327, 0.69112724, 0.68697123, 0.69028609, 0.6877076 ]),
 'fit_time': (3.4074199199676514,
  3.1890013217926025,
  3.059587240219116,
  2.2195096015930176,
  2.090996742248535),
 'test_time': (0.761981725692749,
  0.21325254440307617,
  0.11894536018371582,
  0.19047999382019043,
  0.12076759338378906)}

In [133]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e75b8129f40>

In [134]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [135]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.8986054918682633, details={'was_impossible': False})

In [136]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [137]:
id_map = pd.read_csv(data_path + '/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [138]:
indices_map = id_map.set_index('id')

def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']

    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [139]:
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
962,Aliens,3282.0,7.7,1986.0,679,3.102731
522,Terminator 2: Judgment Day,4274.0,7.7,1991.0,280,3.076609
999,The Terminator,4208.0,7.4,1984.0,218,2.989088
910,The Abyss,822.0,7.1,1989.0,2756,2.816862
8622,X-Men: Days of Future Past,6155.0,7.5,2014.0,127585,2.809285
344,True Lies,1138.0,6.8,1994.0,36955,2.724446
1368,Titanic,7770.0,7.5,1997.0,597,2.690529
4323,Piranha Part Two: The Spawning,41.0,3.9,1981.0,31646,2.689503
2123,Superman,1042.0,6.9,1978.0,1924,2.639822
4258,Mothra vs. Godzilla,38.0,6.4,1964.0,1682,2.63771
