In [22]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def stemSentence_porter(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem.

    The text is split with ``word_tokenize``; each stemmed token is followed
    by one space, so any non-empty result ends with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

def convert(text):
    """Parse a stringified list of dicts and return each entry's ``'name'`` value, in order."""
    return [entry['name'] for entry in ast.literal_eval(text)]


def fetch_director(text):
    """Parse a stringified list of crew dicts and return the names whose ``'job'`` is ``'Director'``."""
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]


def collapse(L):
    """Return a new list with every space character removed from each string in ``L``."""
    return [entry.replace(" ", "") for entry in L]

In [23]:
# Load the two CSV files (movie metadata and credits) from the local
# `new_data` directory; paths are relative to the notebook's working directory.
movies = pd.read_csv(filepath_or_buffer='new_data/tmdb_5000_movies.csv')
credits = pd.read_csv(filepath_or_buffer='new_data/tmdb_5000_credits.csv')

In [24]:
# Join the credits columns onto the movie rows using the shared `title` column.
# NOTE(review): if titles are not unique in both files this join produces
# extra rows per repeated title - confirm, or join on an id column instead.
movies = pd.merge(movies, credits, on='title')

In [25]:
# Keep only the identifier, the title and the columns that feed the tags.
movies = movies[[
    'movie_id',
    'title',
    'overview',
    'genres',
    'keywords',
    'cast',
    'crew',
]]

In [26]:
# Drop every row that has a missing value in any of the kept columns.
# Rebinding the result (instead of `inplace=True`) avoids mutating a frame
# that was itself produced by a column selection, which pandas may flag with
# SettingWithCopyWarning.
movies = movies.dropna()

In [27]:
# Turn the stringified list columns into Python lists of names.
for _col in ('genres', 'keywords', 'cast'):
    movies[_col] = movies[_col].apply(convert)

# Keep at most the first three cast names, and only directors from the crew.
movies['cast'] = movies['cast'].apply(lambda names: names[:3])
movies['crew'] = movies['crew'].apply(fetch_director)

# Strip spaces inside each name so a multi-word name becomes a single token.
for _col in ('cast', 'crew', 'genres', 'keywords'):
    movies[_col] = movies[_col].apply(collapse)

# Split the overview on whitespace so it can be concatenated with the lists.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Element-wise list concatenation: one combined token list per movie.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# `new` keeps the remaining columns plus `tags`, joined into one string per row.
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
new['tags'] = new['tags'].apply(" ".join)

In [14]:
# Discard rows without tags and renumber the index from 0; later cells index
# the similarity matrix and the titles with the same integer.
new = new.dropna(subset=['tags']).reset_index(drop=True)

# Replace each tag string with its Porter-stemmed form.
new['tags'] = new['tags'].apply(stemSentence_porter)

In [15]:
# Display the prepared table (movie_id, title, stemmed tags); pandas elides the
# middle rows in the rendered output.
new

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd centuri , a parapleg marin is disp..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa , long believ to be dead , ha..."
2,206647,Spectre,a cryptic messag from bond ’ s past send him o...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weari , former militari c..."
...,...,...,...
4801,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4802,72766,Newlyweds,a newlyw coupl 's honeymoon is upend by the ar...
4803,231617,"Signed, Sealed, Delivered","`` sign , seal , deliv '' introduc a dedic qua..."
4804,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [16]:
# How many top-scoring rows to pull per movie before the movie itself is removed.
N_CANDIDATES = 99

# min_df=1 keeps every term, exactly as the former min_df=0 did, but stays
# inside the range accepted by scikit-learn's parameter validation (an integer
# min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(new['tags'])

# Pairwise dot products between all rows of the TF-IDF matrix.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional titles: row k of the similarity matrix corresponds to the k-th
# row of `new`, whatever the frame's index labels are.
titles = new['title'].to_numpy()

# Maps a title to a list of (score, other_title) pairs, best first.
# Rows that share a title overwrite one another in this dict.
results = {}

for idx, title in enumerate(titles):
    scores = cosine_similarities[idx]
    ranked = scores.argsort()[::-1][:N_CANDIDATES]
    # Exclude the movie itself by index rather than assuming it is ranked
    # first; with tied scores another row can take the first position.
    results[title] = [(scores[i], titles[i]) for i in ranked if i != idx]

print('done!')


done!


In [17]:
def item(title):
    """Return the title stored in ``new`` for the first row whose title equals ``title``.

    The full title is returned. Cutting it at the first ``' - '`` (as was done
    previously) shortens any title that contains that separator, so the
    printed name no longer matches the movie.

    Raises:
        IndexError: if no row of ``new`` has that title.
    """
    matches = new.loc[new['title'] == title, 'title'].tolist()
    return matches[0]

# Pure lookup: the similarity lists were precomputed into `results`.
def recommend(title, num):
    """Print the first ``num`` precomputed recommendations for ``title``.

    Each line shows the recommended title (via ``item``) and its score.
    """
    print(f"Recommending {num!s} products similar to {item(title)}...")
    print("-------")
    for score, rec_title in results[title][:num]:
        print(f"Recommended: {item(rec_title)} (score:{score!s})")

In [18]:
recommend(title="""Avatar""", num=5)

Recommending 5 products similar to Avatar...
-------
Recommended: Aliens (score:0.04180438375188731)
Recommended: X-Men: Days of Future Past (score:0.03296142681912127)
Recommended: Star Trek Into Darkness (score:0.0314161917949587)
Recommended: Falcon Rising (score:0.03138599745672438)
Recommended: Meet Dave (score:0.031214354837152573)


In [33]:
import os

import openai

# Read the API key from the environment instead of embedding it in the
# notebook. A key committed in source is readable by anyone with access to the
# file; the previously hardcoded key should be revoked.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

# Request embeddings for the first 100 tag strings.
resp = openai.Embedding.create(
    input=new['tags'].values.tolist()[0:100],
    engine="text-similarity-davinci-001")

# embedding_a = resp['data'][0]['embedding']
# embedding_b = resp['data'][1]['embedding']

# similarity_score = np.dot(embedding_a, embedding_b)

In [34]:
# Pull the embedding vectors of the first two inputs out of the API response.
embedding_a, embedding_b = (resp['data'][k]['embedding'] for k in (0, 1))

# Dot product of the two vectors.
# NOTE(review): this equals cosine similarity only if the vectors are unit length - confirm.
similarity_score = np.dot(embedding_a, embedding_b)
similarity_score

0.7761406283433965

In [30]:
# Show every tag string as a plain Python list.
# NOTE(review): the saved output below contains unstemmed text, unlike the
# stemmed table shown earlier, so this cell appears to have been executed
# before the stemming cell - confirm the intended run order.
new['tags'].values.tolist()

['In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron',
 "Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. Adventure Fantasy Action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger JohnnyDepp OrlandoBloom KeiraKnightley GoreVerbinski",
 'A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M 

In [35]:
# Display the raw embedding vector of the first input (a long list of floats).
embedding_a

[-0.0060567427426576614,
 0.01715654879808426,
 -0.009104120545089245,
 -0.004941693972796202,
 -0.005394682753831148,
 0.028281694278120995,
 -0.01602882891893387,
 0.005005049053579569,
 0.010415570810437202,
 -0.0035922315437346697,
 -0.0033071336802095175,
 -0.003275456139817834,
 -0.0017074187053367496,
 -0.00865430012345314,
 0.004038884770125151,
 0.011682671494781971,
 -0.012671010568737984,
 0.0061802854761481285,
 -0.017042508348822594,
 0.006563583388924599,
 -0.0015331922331824899,
 -0.0027068445924669504,
 0.009883387945592403,
 -0.006440041121095419,
 0.010935082100331783,
 0.0005480212275870144,
 0.02128729782998562,
 0.005255301482975483,
 -0.015800749883055687,
 -0.0018388803582638502,
 0.002207923447713256,
 0.01011780183762312,
 4.216451998217963e-06,
 -0.002200004179030657,
 -0.0067156353034079075,
 -0.0012433428782969713,
 0.004181433469057083,
 -0.010333209298551083,
 0.017333941534161568,
 0.007710309699177742,
 -0.011213844642043114,
 -0.010086123831570148,
 0.0