# Content-based Recommender System 
[Model on Hugging Face](https://huggingface.co/spaces/MehrabK/RecommenderSystem)

### Contents
* Preprocessing
* Modeling

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
data = pd.read_csv('cleaned_data.csv').iloc[:, 1:]
data.head(1)

Unnamed: 0,genres,id,original_language,overview,poster_path,title,release_year,cast,crew,keywords
0,"['Animation', 'Comedy', 'Family']",862.0,en,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,1995.0,"['Tom Hanks', 'Tim Allen', 'Don Rickles']",['John Lasseter'],"['jealousy', 'toy', 'boy', 'friendship', 'frie..."


In [4]:
data.shape

(42102, 10)

---
# Preprocessing

In [5]:
data.head(1)

Unnamed: 0,genres,id,original_language,overview,poster_path,title,release_year,cast,crew,keywords
0,"['Animation', 'Comedy', 'Family']",862.0,en,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,1995.0,"['Tom Hanks', 'Tim Allen', 'Don Rickles']",['John Lasseter'],"['jealousy', 'toy', 'boy', 'friendship', 'frie..."


In [6]:
def _tokenize_overview(text):
    """Split an overview into whitespace-separated words.

    Already-tokenized values (a list, e.g. when this cell is re-run) are
    returned unchanged, and anything that is not a string (such as a missing
    overview stored as NaN) becomes an empty list, so that the later list
    concatenation into ``tags`` keeps working.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, str):
        return text.split()
    return []


data['overview'] = data['overview'].apply(_tokenize_overview)

In [7]:
def g(x):
    """Flatten the text of a stringified list into a list of words.

    Every single quote, square bracket and comma is deleted, and the
    remaining text is split on single spaces; for example
    "['Tom Hanks', 'Tim Allen']" becomes ['Tom', 'Hanks', 'Tim', 'Allen'].
    """
    # One translation table deletes all four punctuation characters at once.
    return x.translate(str.maketrans('', '', "'[],")).split(' ')

In [8]:
# Tokenize each stringified-list column with g, one column at a time.
for column in ('genres', 'cast', 'crew', 'keywords'):
    data[column] = data[column].apply(g)

In [9]:
data.head(1)

Unnamed: 0,genres,id,original_language,overview,poster_path,title,release_year,cast,crew,keywords
0,"[Animation, Comedy, Family]",862.0,en,"[Led, by, Woody,, Andy's, toys, live, happily,...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,1995.0,"[Tom, Hanks, Tim, Allen, Don, Rickles]","[John, Lasseter]","[jealousy, toy, boy, friendship, friends, riva..."


In [10]:
# Concatenate the token lists into a single `tags` column; pop() removes each
# source column from the frame as it is consumed.
data['tags'] = (
    data.pop('overview')
    + data.pop('genres')
    + data.pop('keywords')
    + data.pop('cast')
    + data.pop('crew')
)

In [11]:
# Collapse every token list into one space-separated string.
data['tags'] = data['tags'].map(" ".join)

In [12]:
# Prefix the tags with the language code and remove the source column.
data['tags'] = data.pop('original_language').astype(str) + ' ' + data['tags']

In [13]:
# Prefix the tags with the release year (as text) and remove the source column.
data['tags'] = data.pop('release_year').astype(str) + ' ' + data['tags']

In [14]:
data.head(1)

Unnamed: 0,id,poster_path,title,tags
0,862.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,"1995.0 en Led by Woody, Andy's toys live happi..."


Optionally, rows with duplicated titles can be removed (this step is currently disabled):

In [15]:
# Left disabled: the first line counts rows whose title repeats an earlier one,
# the second keeps only the first row for each title.
# NOTE(review): if re-enabled, the kept rows retain their original index labels,
# while `indices` is built from data.index and then used to address rows of
# cosine_sim by position - a reset_index(drop=True) would presumably be needed
# afterwards; confirm before enabling.
# data[['title']].duplicated().sum()
# data = data.iloc[data[['title']].drop_duplicates().index]

In [16]:
data.shape

(42102, 4)

Saving data

In [17]:
# Persist the preprocessed frame; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('data.pkl', 'wb') as data_file:
    pickle.dump(data, data_file)

---
# Modeling

In [52]:
tfidf = TfidfVectorizer(stop_words='english')

In [53]:
tfidf_matrix = tfidf.fit_transform(data['tags'])

In [54]:
tfidf_matrix = tfidf_matrix.astype(np.float32)

In [55]:
tfidf_matrix.shape

(42102, 104408)

In [56]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [57]:
cosine_sim.shape

(42102, 42102)

In [58]:
from sys import getsizeof
print(getsizeof(cosine_sim) / 1024**3, 'GB')

6.603369243443012 GB


In [67]:
cosine_sim = cosine_sim.astype(np.float16)

In [68]:
print(getsizeof(cosine_sim) / 1024**3, 'GB')

3.3016846776008606 GB


Saving the similarity matrix

In [74]:
# Persist the similarity matrix; the context manager guarantees this large
# file is flushed and closed even if pickling fails part-way through.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)

In [69]:
indices = pd.Series(data.index, index=data['title'])
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
The Burkittsville 7            42097
Caged Heat 3000                42098
Robin Hood                     42099
Century of Birthing            42100
Betrayal                       42101
Length: 42102, dtype: int64

### Logic

In [26]:
t = 'Avatar'

In [27]:
indices[t]

14259

In [28]:
# Look the row up from the title instead of hard-coding the number printed above.
indx = [indices[t]]

In [29]:
# Pair every movie's row number with its similarity to `t`; the row is looked
# up from the title rather than hard-coded.
s = list(enumerate(cosine_sim[indices[t]]))

In [30]:
s = sorted(s, key=lambda x: x[1], reverse=True)

In [31]:
s = s[1:6]
s

[(38627, 0.3242336),
 (1108, 0.32278615),
 (7279, 0.29778364),
 (1220, 0.29074025),
 (25462, 0.27526)]

In [32]:
r = 8.5
r /= 10
r

0.85

In [33]:
l = [(j[0], j[1] * r) for j in s]
l

[(38627, 0.2755985528230667),
 (1108, 0.27436822950839995),
 (7279, 0.25311609655618666),
 (1220, 0.24712921380996702),
 (25462, 0.23397100120782852)]

In [34]:
s = l

In [35]:
lst = [j for j in s]
lst

[(38627, 0.2755985528230667),
 (1108, 0.27436822950839995),
 (7279, 0.25311609655618666),
 (1220, 0.24712921380996702),
 (25462, 0.23397100120782852)]

In [36]:
lst.sort(key=lambda x: x[1], reverse=True)

In [37]:
lst

[(38627, 0.2755985528230667),
 (1108, 0.27436822950839995),
 (7279, 0.25311609655618666),
 (1220, 0.24712921380996702),
 (25462, 0.23397100120782852)]

## Recommender Model
### Inputs
* title: list of movie titles the user has rated
* rank: list of ratings (out of 10), one per title, in the same order

In [70]:
def get_recommendations(title, rank, top_n=5):
    """Recommend movies similar to the ones the user has rated.

    For each input movie, the ``top_n`` most similar other movies are read
    from the precomputed ``cosine_sim`` matrix and every similarity is
    weighted by the user's rating of that input movie (``rating / 10``).
    The candidates from all inputs are pooled, input movies themselves are
    left out, and the best-scoring titles are returned.

    Uses the notebook globals ``indices`` (title -> row number),
    ``cosine_sim`` (one row of similarities per movie) and ``data`` (frame
    with a ``title`` column).

    Parameters
    ----------
    title : list of str
        Titles of the movies the user rated. When several movies share a
        title, the first one is used.
    rank : list of numbers
        Rating out of 10 for each title, in the same order. Extra trailing
        ratings are ignored.
    top_n : int, default 5
        Number of neighbours considered per input movie and maximum number
        of titles returned.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` recommended titles, best first, without repeats.

    Raises
    ------
    KeyError
        If a title is not present in ``indices``.
    ValueError
        If ``rank`` has fewer entries than ``title``.
    """
    if len(rank) < len(title):
        raise ValueError('rank must contain one rating for every title')

    # Resolve every title to a single row number.
    idx = []
    for m in title:
        match = indices[m]
        # A title shared by several movies yields a Series; use the first hit.
        # .iloc is explicit about being positional (plain [0] on a
        # label-indexed Series is deprecated).
        idx.append(int(match.iloc[0]) if isinstance(match, pd.Series) else int(match))
    liked = set(idx)

    # Best weighted score seen for each candidate row, so a movie suggested
    # by several inputs is listed once instead of once per input.
    best_score = {}
    for i, rating in zip(idx, rank):
        row = np.asarray(cosine_sim[i], dtype=np.float32)
        # Stable descending order: ties keep their original row order.
        order = np.argsort(-row, kind='stable')
        # Remove the movie itself by row number rather than assuming it sorts
        # first - another row can tie with it at the same similarity.
        neighbours = order[order != i][:top_n]
        for j in neighbours:
            j = int(j)
            if j in liked:
                continue
            score = float(row[j]) * rating / 10
            if score > best_score.get(j, float('-inf')):
                best_score[j] = score

    # Highest weighted similarity first; sorted() is stable, so equal scores
    # stay in the order they were first encountered.
    movie_indices = sorted(best_score, key=best_score.get, reverse=True)
    return data['title'].iloc[movie_indices[:top_n]]

In [71]:
for i in get_recommendations(['Toy Story', 'Batman Begins'], [9, 1]):
    print(i)

Toy Story 2
Toy Story 3
Small Fry
Small Soldiers
Silent Night, Deadly Night 5: The Toy Maker


In [72]:
for i in get_recommendations(['Toy Story', 'Batman Begins'], [7, 6]):
    print(i)

Toy Story 2
Toy Story 3
The Dark Knight
The Dark Knight Rises
Batman


In [75]:
for i in get_recommendations(['Toy Story'], [10]):
    print(i)

Toy Story 2
Toy Story 3
Small Fry
Small Soldiers
Silent Night, Deadly Night 5: The Toy Maker
