In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from surprise import Reader, Dataset, SVD
import joblib

In [8]:
# Load the pre-processed movie metadata from CSV; later cells read this frame as `md`.
md = pd.read_csv('./sources/best_metadata_big.csv')
# Show the first rows as the cell output to sanity-check the loaded columns.
md.head()

Unnamed: 0,tmdbId,title,description,year,vote_average,vote_count,soup
0,88224,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995,7.7,5415,jealousi toy boy friendship friend rivalri boy...
1,42164,Jumanji,When siblings Judy and Peter discover an encha...,1995,6.9,2413,disappear basedonchildren'sbook newhom reclus ...
2,220,Heat,"Obsessive master thief, Neil McCauley leads a ...",1995,7.7,1886,robberi detect bank obsess chase shoot thief h...
3,23449,GoldenEye,James Bond must unmask the mysterious head of ...,1995,6.6,1194,cuba falselyaccus secretident computervirus se...
4,9361,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",1995,5.7,137,exoticisland treasur map ship pirat geenadavis...


In [9]:
# Print the column dtypes and non-null counts of the metadata frame.
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7735 entries, 0 to 7734
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tmdbId        7735 non-null   int64  
 1   title         7735 non-null   object 
 2   description   7601 non-null   object 
 3   year          7735 non-null   object 
 4   vote_average  7735 non-null   float64
 5   vote_count    7735 non-null   int64  
 6   soup          7735 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 423.1+ KB


In [10]:
# Build bag-of-words features (unigrams and bigrams, English stop words removed)
# from each movie's `soup` text.
# min_df=1 keeps every term, which is what the previous min_df=0 intended; an
# integer min_df below 1 is rejected by scikit-learn releases that validate
# estimator parameters.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(md['soup'])

# Pairwise cosine similarity between all movies: row i holds the similarity of
# movie i (row position in `md`) to every other movie.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Lookup from tmdbId to the movie's row label in `md` (and row in `cosine_sim`).
indices = pd.Series(md.index, index=md['tmdbId'])

# disable copy warning when calling improved_recommendations
pd.set_option('mode.chained_assignment', None)

In [11]:
def weighted_rating(x, m=None, C=None):
    """Return the weighted rating of a single movie row.

    The score blends the movie's own average with a prior mean::

        v / (v + m) * R  +  m / (m + v) * C

    where ``v = x['vote_count']`` and ``R = x['vote_average']``.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row providing the ``'vote_count'`` and ``'vote_average'`` entries.
    m : number, optional
        Vote-count weight given to the prior mean ``C``.
    C : number, optional
        Prior mean rating.

    When ``m`` or ``C`` is omitted, the value is read from a module-level
    variable of the same name. This keeps the old one-argument call style
    (``df.apply(weighted_rating, axis=1)``) working, but passing both
    explicitly is preferred because it does not depend on kernel state.

    Raises
    ------
    NameError
        If ``m`` or ``C`` is omitted and no module-level variable of that
        name exists.
    """
    if m is None or C is None:
        # Backward-compatible fallback to the globals the function used to read.
        scope = globals()
        for name, value in (('m', m), ('C', C)):
            if value is None and name not in scope:
                raise NameError(
                    "name '%s' is not defined; pass it to weighted_rating explicitly" % name
                )
        if m is None:
            m = scope['m']
        if C is None:
            C = scope['C']

    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

In [12]:
def improved_recommendations(tmdbId, n_candidates=25, quantile=0.60, top_n=10):
    """Recommend movies similar to ``tmdbId``, re-ranked by weighted rating.

    The ``n_candidates`` movies most similar to the query (by the module-level
    ``cosine_sim`` matrix) are collected, those whose vote count falls below
    the ``quantile`` of the candidates' vote counts are dropped, and the rest
    are ordered by ``v/(v+m) * R + m/(m+v) * C``, where ``m`` is that quantile
    and ``C`` is the candidates' mean vote average.

    Relies on the module-level ``md``, ``indices`` and ``cosine_sim`` objects.

    Parameters
    ----------
    tmdbId : int
        Identifier looked up in ``indices``.
    n_candidates : int, default 25
        Number of most-similar movies considered before re-ranking.
    quantile : float, default 0.60
        Vote-count quantile (among the candidates) used as the cut-off ``m``.
    top_n : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame or str
        Frame with columns ``tmdbId``, ``title``, ``vote_count``,
        ``vote_average``, ``year`` and ``wr`` sorted by ``wr`` descending,
        or the string ``'tmdbId not found'`` when the id is unknown.
    """
    if tmdbId not in indices:
        return 'tmdbId not found'

    index = indices[tmdbId]
    # A duplicated tmdbId makes the lookup return a Series; use its first row.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    sim_row = np.asarray(cosine_sim[index]).ravel()
    # Stable descending sort: ties keep ascending row order.
    order = np.argsort(-sim_row, kind='stable')
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # a movie with an identical similarity score and a lower row index would
    # otherwise push the query itself into the recommendations.
    order = order[order != index][:n_candidates]

    # .copy() so later column assignments never write into a view of `md`.
    movies = md.iloc[order][['tmdbId', 'title', 'vote_count', 'vote_average', 'year']].copy()

    vote_counts = movies['vote_count'].dropna().astype('int')
    # Keep averages as floats: truncating them to int would bias the mean down.
    vote_averages = movies['vote_average'].dropna().astype('float')
    m = vote_counts.quantile(quantile)
    C = vote_averages.mean()

    keep = (
        movies['vote_count'].notnull()
        & movies['vote_average'].notnull()
        & (movies['vote_count'] >= m)
    )
    qualified = movies[keep].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')

    # Vectorised weighted rating using the local m and C.
    v = qualified['vote_count']
    R = qualified['vote_average']
    total = v + m
    wr = (v / total) * R + (m / total) * C
    # With no votes and a zero cut-off the formula is 0/0; fall back to the prior mean.
    qualified['wr'] = wr.where(total != 0, C)

    return qualified.sort_values('wr', ascending=False).head(top_n)

In [14]:
# tmdbId for 'The Dark Knight'
# improved_recommendations(503736)

In [15]:
# Persist the metadata frame and the similarity matrix as compressed joblib files.
# NOTE(review): `train_mode` is not defined anywhere in the visible notebook, so the
# dump below is left disabled; confirm whether it should be removed.
# joblib.dump(train_mode, "./train_mode.joblib", compress=True)
# NOTE(review): `indices` is not saved here; presumably consumers rebuild it from
# the saved metadata — confirm against the loading code.
joblib.dump(md, "./metadata.joblib", compress=True)
joblib.dump(cosine_sim, "./cosine_sim.joblib", compress=True)

['./cosine_sim.joblib']