In [160]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
import nltk
# Fetch the NLTK "words" corpus (the download is skipped when the package is
# already up to date) and keep its entries in a set for fast membership tests.
nltk.download('words')
words = set(nltk.corpus.words.words())

def stemSentence_porter(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is split with `word_tokenize`; each stem is followed by a
    single space, so any non-empty result ends with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

def convert(text):
    """Parse `text` as a Python literal list of dicts and return their 'name' values.

    The order of the parsed list is preserved. Raises whatever
    `ast.literal_eval` raises for malformed input, and `KeyError` when an
    entry has no 'name' key.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


def fetch_director(text):
    """Parse `text` as a Python literal list of dicts and return the directors' names.

    Only entries whose 'job' value is exactly 'Director' are kept, in their
    original order.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]


def collapse(L):
    """Return a new list with every space character removed from each string in `L`."""
    return [item.replace(" ", "") for item in L]

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\czhao\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [161]:
# !pip install -U sentence-transformers

In [162]:
# Load the TMDB 5000 movie metadata and the matching credits table.
movies = pd.read_csv('new_data/tmdb_5000_movies.csv')
credits = pd.read_csv('new_data/tmdb_5000_credits.csv')

# NOTE(review): joining on 'title' pairs up any rows that merely share a
# title; confirm titles are unique in both files or join on the movie id.
#
# The working frame is built as an explicit copy. Calling
# dropna(inplace=True) on a column slice and then assigning columns to it is
# the pattern that triggers pandas' SettingWithCopyWarning.
movies = (
    movies.merge(credits, on='title')
    .loc[:, ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .copy()
)

# The list-like columns are stored as Python-literal strings; turn them into
# plain lists of names.
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
# Keep only the first three cast entries of each movie.
movies['cast'] = movies['cast'].apply(convert).apply(lambda names: names[:3])
movies['crew'] = movies['crew'].apply(fetch_director)

# For each director name keep only the tokens that are either in the NLTK
# word list (case-insensitive) or not purely alphabetic.
movies['crew'] = movies['crew'].apply(
    lambda names: [
        " ".join(
            token
            for token in nltk.wordpunct_tokenize(name)
            if token.lower() in words or not token.isalpha()
        )
        for name in names
    ]
)

# Disabled alternative: strip spaces inside multi-word names.
# movies['cast'] = movies['cast'].apply(collapse)
# movies['crew'] = movies['crew'].apply(collapse)
# movies['genres'] = movies['genres'].apply(collapse)
# movies['keywords'] = movies['keywords'].apply(collapse)

# Concatenate all token lists into a single 'tags' list per movie, then
# flatten it to one space-separated string.
movies['overview'] = movies['overview'].str.split()
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
new['tags'] = new['tags'].apply(" ".join)
new = new[new['tags'].notnull()].reset_index(drop=True)
# Disabled alternative: stem the tags before embedding.
# new['tags'] = new['tags'].apply(stemSentence_porter)

In [163]:
# Sentence-embedding checkpoint used by the cells below. The commented-out
# lines are alternative checkpoints that can be swapped in instead.
# model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('all-mpnet-base-v2')
model = SentenceTransformer('stsb-bert-large')

In [110]:
descriptions = new['tags'].tolist()
# Encode the whole corpus in a single call so the library can batch the
# forward passes instead of running the model once per description.
# The result is one 2-D array with a row per description, so
# des_embeddings[i] is still the embedding vector of descriptions[i].
des_embeddings = model.encode(descriptions, show_progress_bar=True)

In [111]:
# Length of a single description embedding vector.
len(des_embeddings[0])

1024

In [167]:
def recommend(query, model, top_k=5, embeddings=None):
    """Return the positions of the descriptions most similar to `query`.

    Parameters
    ----------
    query : str
        Free-text description to search with.
    model
        Model used to embed the query; it is switched to eval mode first.
    top_k : int, default 5
        Number of positions to return. Fewer are returned when the corpus
        has fewer than `top_k` entries.
    embeddings : optional
        Corpus embeddings to compare against. When omitted, the notebook
        global `des_embeddings` is used. They should come from the same
        model as the query embedding.

    Returns
    -------
    list[int]
        Row positions into the corpus, ordered from highest to lowest
        cosine similarity.
    """
    if embeddings is None:
        embeddings = des_embeddings

    model.eval()
    query_embedd = model.encode(query)
    # Cosine similarity of the query against every corpus embedding.
    cosine_scores = util.pytorch_cos_sim(query_embedd, embeddings)
    # A single query gives one row of scores; rank that row, best first.
    ranked = torch.argsort(cosine_scores, dim=-1, descending=True)[0]
    return ranked[:top_k].tolist()

In [168]:
query_show_des = 'moon Pandora space colony society Sam 3d'
recommendded_results = recommend(query_show_des, model)

# Print the matching rows of `new`, in the order recommend() returned them.
for row_idx in recommendded_results:
    print(new.iloc[row_idx])

movie_id                                                   62
title                                   2001: A Space Odyssey
tags        Humanity finds a mysterious object buried bene...
Name: 2970, dtype: object
movie_id                                                19995
title                                                  Avatar
tags        In the 22nd century, a paraplegic Marine is di...
Name: 0, dtype: object
movie_id                                                10153
title                                                  Sphere
tags        The OSSA discovers a spacecraft thought to be ...
Name: 549, dtype: object
movie_id                                                17431
title                                                    Moon
tags        With only three weeks left in his three year c...
Name: 3628, dtype: object
movie_id                                                50357
title                                               Apollo 18
tags        Officially, Apollo 1

In [125]:
# NOTE(review): 'review' is an exact copy of 'tags', so any model trained to
# map tags onto reviews sees identical input and target text. Replace this
# with real review text when it is available.
new['review'] = new['tags']
new.dtypes

movie_id     int64
title       object
tags        object
review      object
dtype: object

In [157]:
def train(model, df, batch_size=32, learning_rate=0.01, epochs=30):
    """Fine-tune `model` so that 'tags' and 'review' embeddings move closer together.

    Each mini-batch takes the texts in `df['tags']` and `df['review']`, embeds
    both with `model`, and minimises the mean-squared error between the two
    sets of sentence embeddings using SGD.

    The embeddings are produced by a real forward pass (`model.tokenize`
    followed by `model(features)`) rather than `model.encode`. `encode`
    returns detached NumPy arrays, so a loss built from its output cannot
    propagate gradients back to the model parameters and the optimizer step
    would change nothing.

    Parameters
    ----------
    model
        Sentence-embedding model to update in place. It is moved to the
        notebook-level `device` and put in training mode.
    df : pandas.DataFrame
        Must contain string columns 'tags' and 'review'.
    batch_size : int, default 32
    learning_rate : float, default 0.01
    epochs : int, default 30

    Returns
    -------
    list[float]
        Mean batch loss for every epoch (0.0 for an epoch with no batches).
    """
    model.to(device)
    model.train()
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    plots = df['tags'].tolist()
    reviews = df['review'].tolist()

    def _embed(texts):
        # Tokenize, move the inputs next to the model, and run a forward pass
        # that keeps the autograd graph attached to the model parameters.
        features = util.batch_to_device(model.tokenize(texts), device)
        return model(features)['sentence_embedding']

    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        n_batches = 0
        for start in range(0, len(plots), batch_size):
            stop = start + batch_size

            optimizer.zero_grad()
            embedding_plots = _embed(plots[start:stop])
            embedding_reviews = _embed(reviews[start:stop])

            loss = criterion(embedding_plots, embedding_reviews)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        epoch_losses.append(running_loss / n_batches if n_batches else 0.0)

    return epoch_losses

In [166]:
# Run train() on the 'tags' / 'review' columns of `new` with its default
# batch size, learning rate and epoch count.
train(model, new)

In [None]:
# The corpus embeddings were computed before train() was run on the model.
# Re-encode them so the query vector and the corpus vectors come from the
# same model weights.
model.eval()
des_embeddings = model.encode(new['tags'].tolist())

query_show_des = 'moon Pandora space colony society Sam 3d'
recommendded_results = recommend(query_show_des, model)

for index in recommendded_results:
    print(new.iloc[index,:])