In [10]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [11]:
df = pd.read_csv(r"datasets\csv_file.csv")

In [12]:
df.head(3)

Unnamed: 0,id,title,overview,cast,crew,keywords,genres
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",TomHanks TimAllen DonRickles,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,Animation Comedy Family
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,RobinWilliams JonathanHyde KirstenDunst,JoeJohnston,boardgame disappearance basedonchildren'sbook ...,Adventure Fantasy Family
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,WalterMatthau JackLemmon Ann-Margret,HowardDeutch,fishing bestfriend duringcreditsstinger oldmen,Romance Comedy


In [13]:
df=df.head(5000)

In [14]:
df.isnull().sum()

id            0
title         0
overview      0
cast         76
crew         28
keywords    637
genres      101
dtype: int64

In [15]:
df.dropna(inplace=True)

In [16]:
df.duplicated().sum()

2

In [17]:
df.drop_duplicates(keep=False, inplace=True)

In [18]:
df.duplicated().sum()

0

In [19]:
df.shape

(4322, 7)

In [20]:
df['tags'] = df['keywords'] + df['genres'] + df['cast'] + df['crew']

In [21]:
df = df.drop(['cast', 'crew', 'keywords', 'genres'], axis=1)

In [22]:
df.head(3)

Unnamed: 0,id,title,overview,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,boardgame disappearance basedonchildren'sbook ...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,fishing bestfriend duringcreditsstinger oldmen...


In [23]:
# id = tmbdid

In [24]:
df['overview'] = df['overview'].apply(lambda x:x.split())

In [25]:
df['overview'] = df['overview'].apply(lambda x:" ".join(x))

In [26]:
df.head(3)

Unnamed: 0,id,title,overview,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,boardgame disappearance basedonchildren'sbook ...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,fishing bestfriend duringcreditsstinger oldmen...


In [27]:
df['tags'] = df['tags'] + df['overview']

In [28]:
df = df.drop(['overview'], axis=1)

In [29]:
df['tags'] = df['tags'].apply(lambda x:x.lower())

In [30]:
df

Unnamed: 0,id,title,tags
0,862,Toy Story,jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,boardgame disappearance basedonchildren'sbook ...
2,15602,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen...
3,31357,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...
...,...,...,...
4993,108267,The Lone Ranger and the Lost City of Gold,lonerangeraction adventure westernclaytonmoore...
4995,15999,Vampire Hunter D: Bloodlust,bountyhunter katana future vampire halfvampire...
4996,614,Wild Strawberries,adultery identity dream journeyinthepast profe...
4998,121703,Cattle Queen of Montana,montana nativeamerican cattleranch landgrabwes...


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
countvectorizer = CountVectorizer(stop_words='english', max_features=5000)

In [32]:
vectors = countvectorizer.fit_transform(df['tags']).toarray()
# toarray() because default result will be in a sparse matrix form

In [33]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
countvectorizer.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1920s',
 '1930',
 '1930s',
 '1934',
 '1940s',
 '1944',
 '1950s',
 '1955',
 '1957',
 '1959',
 '1960',
 '1960s',
 '1965',
 '1970',
 '1970s',
 '1980',
 '1980s',
 '1986',
 '1990',
 '1990s',
 '1992',
 '1994',
 '1996',
 '1999',
 '19th',
 '20',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '40',
 '50',
 '50s',
 '60',
 '60s',
 '70',
 '80',
 '90s',
 'abandoned',
 'abducted',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abroad',
 'abruptly',
 'absence',
 'abuse',
 'abused',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accidents',
 'acclaimed',
 'accompanied',
 'account',
 'accountant',
 'accused',
 'ace',
 'acid',
 'acquired',
 'act',
 'acting',
 'action',
 'actions',
 'activist',
 'activities',
 'actor',
 'actorcomedy',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'ad',
 'ada',
 'adam',
 'adams',
 'adaptation',
 'add',
 'addict

In [35]:
import nltk
from nltk.stem.porter import PorterStemmer
port = PorterStemmer()

In [36]:
def stem(text):
    y = []
    for i in text.split():
        y.append(port.stem(i))
    return " ".join(y)

In [37]:
df['tags'] = df['tags'].apply(stem)

In [38]:
df

Unnamed: 0,id,title,tags
0,862,Toy Story,jealousi toy boy friendship friend rivalri boy...
1,8844,Jumanji,boardgam disappear basedonchildren'sbook newho...
2,15602,Grumpier Old Men,fish bestfriend duringcreditssting oldmenrom c...
3,31357,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...
...,...,...,...
4993,108267,The Lone Ranger and the Lost City of Gold,lonerangeract adventur westernclaytonmoor jays...
4995,15999,Vampire Hunter D: Bloodlust,bountyhunt katana futur vampir halfvampir adul...
4996,614,Wild Strawberries,adulteri ident dream journeyinthepast professo...
4998,121703,Cattle Queen of Montana,montana nativeamerican cattleranch landgrabwes...


In [39]:
df['tags'][0]

"jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolifeanim comedi familytomhank timallen donricklesjohnlasseterl by woody, andy' toy live happili in hi room until andy' birthday bring buzz lightyear onto the scene. afraid of lose hi place in andy' heart, woodi plot against buzz. but when circumst separ buzz and woodi from their owner, the duo eventu learn to put asid their differences."

In [40]:
#in this problem, we cslculate cosine distance, not eucledian distance
#in higher dimensions, eucledian distance is not that accurate

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
sim = TfidfVectorizer(stop_words='english')

In [42]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [44]:
cosine_similarity(vectors)

array([[1.        , 0.02094729, 0.        , ..., 0.        , 0.        ,
        0.04643635],
       [0.02094729, 1.        , 0.0547791 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.0547791 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.03380617],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.04643635, 0.        , 0.        , ..., 0.03380617, 0.        ,
        1.        ]])

In [45]:
cosine_similarity(vectors).shape

(4322, 4322)

In [46]:
similarity = cosine_similarity(vectors)

In [47]:
def recommend(movie):
    movie_index = df[df['title']==movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(df.iloc[i[0]].title)

In [48]:
recommend('Jumanji')

Dungeons & Dragons
eXistenZ
Sudden Death
Any Given Sunday
D3: The Mighty Ducks


In [49]:
recommend('Batman Begins')

IndexError: index 0 is out of bounds for axis 0 with size 0

In [50]:
import pickle

In [51]:
df=df.head(5000)
pickle.dump(df.to_dict(),open('movies_content_dict.pkl','wb'))

In [52]:
pickle.dump(similarity,open('similarity_content.pkl','wb'))

In [53]:
import bz2

In [55]:
def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f: 
        pickle.dump(data, f)
compressed_pickle('similarity_content', similarity) 