In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [96]:
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv') 
movies = movies.merge(credits,on='title')
movies.dropna(inplace=True)

In [97]:
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)

In [98]:
movies = movies[movies['release_year']>1970]

In [99]:
movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew']]

In [100]:
movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['release_year'] = movies['release_year'].astype(str).apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['release_year'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

In [101]:
new = new[(new['tags'].notnull())].reset_index(drop=True)
new['tags'] = new['tags'].apply(stemSentence_porter)
new = new.reset_index(drop=True)

In [102]:
new

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd centuri , a parapleg marin is disp..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa , long believ to be dead , ha..."
2,206647,Spectre,a cryptic messag from bond ’ s past send him o...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weari , former militari c..."
...,...,...,...
1476,2292,Clerks,conveni and video store clerk dant and randal ...
1477,255266,Dry Spell,sasha tri to get her soon-to-b ex husband kyle...
1478,157185,Tin Can Man,"recent dump by hi girlfirend for anoth man , w..."
1479,14337,Primer,friends/fledgl entrepreneur invent a devic in ...


In [103]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(new['tags'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in new.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], new['title'][i]) for i in similar_indices]

    results[row['title']] = similar_items[1:]
    
print('done!')


done!


In [104]:
def item(title):
    return new.loc[new['title'] == title]['title'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(title, num):
    print("Recommending " + str(num) + " products similar to " + item(title) + "...")
    print("-------")
    recs = results[title][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [105]:
recommend(title="""Avatar""", num=5)

Recommending 5 products similar to Avatar...
-------
Recommended: X-Men: Days of Future Past (score:0.037342202491879695)
Recommended: Aliens in the Attic (score:0.03645212615666184)
Recommended: Battle: Los Angeles (score:0.03560992173935679)
Recommended: Meet Dave (score:0.033647779020400885)
Recommended: Man of Steel (score:0.03149233272251119)


In [110]:
new.iloc[0,-1]

'in the 22nd centuri , a parapleg marin is dispatch to the moon pandora on a uniqu mission , but becom torn between follow order and protect an alien civil . action adventur fantasi sciencefict 2009 cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron '

In [120]:
print("Number of tokens:", tfidf_matrix.shape[1])

Number of tokens: 138534
