In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [4]:
movies = pd.read_csv('new_data/tmdb_5000_movies.csv')
credits = pd.read_csv('new_data/tmdb_5000_credits.csv') 

In [5]:
movies = movies.merge(credits,on='title')

In [6]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [7]:
movies.dropna(inplace=True)

In [8]:
movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

In [9]:
new = new[(new['tags'].notnull())].reset_index(drop=True)
new['tags'] = new['tags'].apply(stemSentence_porter)

In [10]:
new

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd centuri , a parapleg marin is disp..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa , long believ to be dead , ha..."
2,206647,Spectre,a cryptic messag from bond ’ s past send him o...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weari , former militari c..."
...,...,...,...
4801,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4802,72766,Newlyweds,a newlyw coupl 's honeymoon is upend by the ar...
4803,231617,"Signed, Sealed, Delivered","`` sign , seal , deliv '' introduc a dedic qua..."
4804,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [11]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(new['tags'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in new.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], new['title'][i]) for i in similar_indices]

    results[row['title']] = similar_items[1:]
    
print('done!')


done!


In [12]:
def item(title):
    return new.loc[new['title'] == title]['title'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(title, num):
    print("Recommending " + str(num) + " products similar to " + item(title) + "...")
    print("-------")
    recs = results[title][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [13]:
recommend(title="""Avatar""", num=5)

Recommending 5 products similar to Avatar...
-------
Recommended: Aliens (score:0.04180438375188731)
Recommended: X-Men: Days of Future Past (score:0.03296142681912127)
Recommended: Star Trek Into Darkness (score:0.0314161917949587)
Recommended: Falcon Rising (score:0.03138599745672438)
Recommended: Meet Dave (score:0.031214354837152573)


In [14]:
import openai

openai.api_key = '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'
resp = openai.Embedding.create(
    input=new['tags'].values.tolist()[0:100],
    engine="text-similarity-davinci-001")

# embedding_a = resp['data'][0]['embedding']
# embedding_b = resp['data'][1]['embedding']

# similarity_score = np.dot(embedding_a, embedding_b)

In [15]:
embedding_a = resp['data'][0]['embedding']
embedding_b = resp['data'][1]['embedding']

similarity_score = np.dot(embedding_a, embedding_b)
similarity_score

0.8075420538987348

In [16]:
new['tags'].values.tolist()

['in the 22nd centuri , a parapleg marin is dispatch to the moon pandora on a uniqu mission , but becom torn between follow order and protect an alien civil . action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron ',
 "captain barbossa , long believ to be dead , ha come back to life and is head to the edg of the earth with will turner and elizabeth swann . but noth is quit as it seem . adventur fantasi action ocean drugabus exoticisland eastindiatradingcompani loveofone'slif traitor shipwreck strongwoman ship allianc calypso afterlif fighter pirat swashbuckl aftercreditssting johnnydepp orlandobloom keiraknightley goreverbinski ",
 'a cryptic messag from bond ’ s past send him on a trail to uncov a sinist organ . while m battl polit forc to keep the secret servic aliv , bond peel ba

In [17]:
len(embedding_a)

12288

In [18]:
embedding_a

[-0.0030857156962156296,
 0.009352189488708973,
 -0.007077503949403763,
 -0.009554947726428509,
 -0.005566326901316643,
 0.025420721620321274,
 -0.006225288845598698,
 -0.001084277406334877,
 0.009003700688481331,
 -0.004004460759460926,
 -0.006988797802478075,
 -0.0073753041215240955,
 -0.003077795496210456,
 -0.007432329468429089,
 0.014839314855635166,
 0.01837489940226078,
 -0.011918339878320694,
 0.007330950815230608,
 -0.016841545701026917,
 0.005860959179699421,
 -0.004159696865826845,
 -5.9946127294097096e-05,
 0.007951895706355572,
 -0.004612733144313097,
 0.013305961154401302,
 -0.0007147993892431259,
 0.018755069002509117,
 0.0008419188670814037,
 -0.023456508293747902,
 -0.003985452000051737,
 -0.0038999137468636036,
 0.008819951675832272,
 -0.0018168973037973046,
 0.0010407161898911,
 -0.005788092967122793,
 0.00181056116707623,
 0.0016822536708787084,
 -0.012729369103908539,
 0.01047369185835123,
 0.014319748617708683,
 -0.009751368314027786,
 -0.003700324334204197,
 0.01

In [19]:
len(embedding_b)

12288

In [20]:
embedding_b

[-0.00917407963424921,
 0.01258105132728815,
 -0.01440190989524126,
 -0.008044767193496227,
 -0.007759266998618841,
 0.01676204614341259,
 -0.0006832183571532369,
 -0.005449885502457619,
 0.004108033608645201,
 -0.00650306511670351,
 -0.014300398528575897,
 -0.0022760727442801,
 -0.0025885370559990406,
 -0.003863772377371788,
 0.012853862717747688,
 0.006052608601748943,
 -0.0027027372270822525,
 0.004637795500457287,
 -0.015150555409491062,
 0.008476190268993378,
 -0.001432260381989181,
 -0.0011840336956083775,
 0.012650840915739536,
 -0.0055323634296655655,
 0.0038161887787282467,
 -0.002547298092395067,
 0.012669874355196953,
 0.007645066827535629,
 -0.017155403271317482,
 0.0014211576199159026,
 -0.000528175791259855,
 0.001960436115041375,
 -0.0005686216754838824,
 -0.005116801708936691,
 -0.0028756235260516405,
 -0.0018954055849462748,
 0.006509409286081791,
 -0.01050007063895464,
 0.0030881627462804317,
 -0.0007391288527287543,
 -0.0014378117630258203,
 0.0008596734842285514,
 -