### Importing Modules and Dataset

In [1]:
import string, random
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.model_selection import cross_validate

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv("tmdb.csv")

In [4]:
df = df.iloc[:10000,:]

### Content Based Filtering

In [5]:
# content based dataset
# id
# title
# imdb_id
# overview
# genres
# production_companies
# keywords 
content_based_df = df[["id", "imdb_id", "title", "overview", "genres", "keywords", "production_companies"]]

In [6]:
content_based_df.head()

Unnamed: 0,id,imdb_id,title,overview,genres,keywords,production_companies
0,27205,tt1375666,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...","Legendary Pictures, Syncopy, Warner Bros. Pict..."
1,157336,tt0816692,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,...","Legendary Pictures, Syncopy, Lynda Obst Produc..."
2,155,tt0468569,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller","joker, sadism, chaos, secret identity, crime f...","DC Comics, Legendary Pictures, Syncopy, Isobel..."
3,19995,tt0499549,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","future, society, culture clash, space travel, ...","Dune Entertainment, Lightstorm Entertainment, ..."
4,24428,tt0848228,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure","new york city, superhero, shield, based on com...",Marvel Studios


In [7]:
content_based_df = content_based_df.fillna(" ")

In [8]:
content_based_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    10000 non-null  int64 
 1   imdb_id               10000 non-null  object
 2   title                 10000 non-null  object
 3   overview              10000 non-null  object
 4   genres                10000 non-null  object
 5   keywords              10000 non-null  object
 6   production_companies  10000 non-null  object
dtypes: int64(1), object(6)
memory usage: 547.0+ KB


In [9]:
def filter_punc(text):
    translator = str.maketrans('','',string.punctuation)
    text = text.translate(translator)
    return text

In [10]:
def filter_tags(text):
    L = text.split(",")
    for i in range(len(L)):
        L[i] = L[i].replace(" ", "")
    text = " ".join(L)
    translator = str.maketrans('','',string.punctuation)
    text = text.translate(translator)
    return " "+text

In [11]:
content_based_df["overview"] = content_based_df["overview"].apply(filter_punc)
content_based_df["genres"] = content_based_df["genres"].apply(filter_tags)
content_based_df["keywords"] = content_based_df["keywords"].apply(filter_tags)
content_based_df["production_companies"] = content_based_df["production_companies"].apply(filter_tags)

In [12]:
content_based_df.head()

Unnamed: 0,id,imdb_id,title,overview,genres,keywords,production_companies
0,27205,tt1375666,Inception,Cobb a skilled thief who commits corporate esp...,Action ScienceFiction Adventure,rescue mission dream airplane paris france vi...,LegendaryPictures Syncopy WarnerBrosPictures
1,157336,tt0816692,Interstellar,The adventures of a group of explorers who mak...,Adventure Drama ScienceFiction,rescue future spacecraft raceagainsttime arti...,LegendaryPictures Syncopy LyndaObstProductions
2,155,tt0468569,The Dark Knight,Batman raises the stakes in his war on crime W...,Drama Action Crime Thriller,joker sadism chaos secretidentity crimefighte...,DCComics LegendaryPictures Syncopy IsobelGrif...
3,19995,tt0499549,Avatar,In the 22nd century a paraplegic Marine is dis...,Action Adventure Fantasy ScienceFiction,future society cultureclash spacetravel space...,DuneEntertainment LightstormEntertainment 20t...
4,24428,tt0848228,The Avengers,When an unexpected enemy emerges and threatens...,ScienceFiction Action Adventure,newyorkcity superhero shield basedoncomic ali...,MarvelStudios


In [13]:
content_based_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    10000 non-null  int64 
 1   imdb_id               10000 non-null  object
 2   title                 10000 non-null  object
 3   overview              10000 non-null  object
 4   genres                10000 non-null  object
 5   keywords              10000 non-null  object
 6   production_companies  10000 non-null  object
dtypes: int64(1), object(6)
memory usage: 547.0+ KB


In [14]:
tags = content_based_df["overview"] + content_based_df["genres"] + content_based_df["keywords"] + content_based_df["production_companies"]
content_based_df["tags"] = tags
content_based_df.drop(columns=["overview", "genres", "keywords", "production_companies"], inplace=True)

In [15]:
content_based_df.head()

Unnamed: 0,id,imdb_id,title,tags
0,27205,tt1375666,Inception,Cobb a skilled thief who commits corporate esp...
1,157336,tt0816692,Interstellar,The adventures of a group of explorers who mak...
2,155,tt0468569,The Dark Knight,Batman raises the stakes in his war on crime W...
3,19995,tt0499549,Avatar,In the 22nd century a paraplegic Marine is dis...
4,24428,tt0848228,The Avengers,When an unexpected enemy emerges and threatens...


In [16]:
from nltk.stem.porter import PorterStemmer

In [17]:
ps = PorterStemmer()

In [18]:
def stemmer(text):
    L = []
    for i in text.split():
        L.append(ps.stem(i))
    return " ".join(L)

In [19]:
content_based_df["tags"] = content_based_df["tags"].apply(stemmer)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv = CountVectorizer(max_features=5000, stop_words="english")

In [22]:
vectors = cv.fit_transform(content_based_df["tags"]).toarray()

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
content_based_similarity = cosine_similarity(vectors)

In [25]:
def content_based_recommend(movie):
    index = content_based_df[content_based_df["title"] == movie].index[0]
    distances = content_based_similarity[index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    L = []
    for i in movies_list:
        L.append(content_based_df.iloc[i[0]].title)
    return L

In [45]:
content_based_recommend("Avengers: Endgame")

['Avengers: Age of Ultron',
 'Thor: Ragnarok',
 'Marvel One-Shot: Agent Carter',
 'Guardians of the Galaxy Vol. 3',
 'Captain America: Civil War']

### Collaboartive Filtering

In [27]:
collab_df = df[["id", "imdb_id", "title", "vote_count", "vote_average"]]

In [28]:
collab_df.head()

Unnamed: 0,id,imdb_id,title,vote_count,vote_average
0,27205,tt1375666,Inception,34495,8.364
1,157336,tt0816692,Interstellar,32571,8.417
2,155,tt0468569,The Dark Knight,30619,8.512
3,19995,tt0499549,Avatar,29815,7.573
4,24428,tt0848228,The Avengers,29166,7.71


In [29]:
vote_mean=collab_df['vote_average'].mean()
vote_percentile=collab_df['vote_count'].quantile(0.9)
vote_mean,vote_percentile

(6.6243313, 4235.0)

In [30]:
def weighted_average(X,vote_mean=vote_mean,vote_percentile=vote_percentile):
    R=X['vote_average']
    v=X['vote_count']
    return round((v/(v+vote_percentile)*R)+(vote_percentile/(vote_percentile+v)*vote_mean))

In [31]:
collab_df['score']=collab_df.apply(weighted_average,axis=1)
collab_df['score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  collab_df['score']=collab_df.apply(weighted_average,axis=1)


0       8
1       8
2       8
3       7
4       8
       ..
9995    7
9996    7
9997    7
9998    7
9999    7
Name: score, Length: 10000, dtype: int64

In [32]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(collab_df[['id', 'title', 'score']], reader)
trainset, testset = train_test_split(data, test_size=0.2)
colab_filt = SVD()
cross_validate(colab_filt, data, measures=['RMSE', 'MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3811  0.3830  0.3735  0.3828  0.3719  0.3785  0.0048  
MAE (testset)     0.2540  0.2548  0.2495  0.2543  0.2484  0.2522  0.0027  
Fit time          0.35    0.43    0.42    0.45    0.32    0.39    0.05    
Test time         0.04    0.04    0.05    0.06    0.03    0.04    0.01    


{'test_rmse': array([0.38113888, 0.38298471, 0.37349965, 0.38280436, 0.37192763]),
 'test_mae': array([0.25401717, 0.25481465, 0.24946995, 0.25425299, 0.24839069]),
 'fit_time': (0.34683680534362793,
  0.43062758445739746,
  0.4161853790283203,
  0.44594883918762207,
  0.32286906242370605),
 'test_time': (0.03780364990234375,
  0.042413949966430664,
  0.04730510711669922,
  0.06265068054199219,
  0.031369686126708984)}

In [33]:
colab_filt.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x183833995d0>

In [34]:
def collaborative_recommend(id, movie, n=5):
    user_movie_titles = collab_df[collab_df['id'] == id]['title'].values
    user_movie_recs = [title for title in collab_df['title'].unique() if title not in user_movie_titles]
    collaborative_rec = [random.choice([title for title in user_movie_recs if colab_filt.predict(id, title).est > 3.5]) for i in range(n)]
    return collaborative_rec

In [35]:
collaborative_recommend(1999,"Avatar")

['Mermaids',
 'Mandy',
 'Malice',
 'Barbie Presents: Thumbelina',
 'My Name Is Nobody']

### Hybrid Filering

In [36]:
def hybrid_recommend(id, movie):
    content_based = content_based_recommend(movie)
    collab_based = collaborative_recommend(id, movie)
    return content_based+collab_based

In [37]:
hybrid_recommend(1999, "Jurassic Park")

['Jurassic World',
 'The Lost World: Jurassic Park',
 'Jurassic Park III',
 'Futureworld',
 'The Land Before Time',
 'Octopussy',
 'La Grande Bouffe',
 'Cry Macho',
 'Spanish Affair 2',
 '[REC]']

### Model export

In [38]:
import pickle as pk

In [39]:
movies = list(df["title"])

In [40]:
pk.dump(movies, open("movies.pkl", 'wb'))

In [41]:
df1 = content_based_df.to_dict()

In [42]:
pk.dump(df1, open("df1.pkl", 'wb'))

In [43]:
pk.dump(content_based_similarity, open("similarity.pkl", 'wb'))