In [1]:
# !pip install -U sentence-transformers

In [2]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import evaluation
from sentence_transformers.cross_encoder import CrossEncoder
import ast
from sentence_transformers import models, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModelWithLMHead
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
import nltk
import re
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
nltk.download('words')
words = set(nltk.corpus.words.words())

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z0-9,.’]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    
    return text

def get_cosine_sim(model, df):
    scores = []
    for row in zip(df['tags'], df['reviews']):
        scores.append(util.cos_sim(model.encode(row[0]), model.encode(row[1])))
    return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\czhao\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


device(type='cuda')

In [3]:
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv')
movies = movies.merge(credits,on='title')

reviews = pd.read_csv('../crawled_data/2022-11-19_movie_info_with_reviews.csv')
reviews = reviews[['id','reviews']]
reviews['reviews'] = reviews['reviews'].apply(lambda x: list(map(clean_text, x.split("\',"))))
movies = movies.merge(reviews,on='id', how='left')

movies.dropna(inplace=True)
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)
movies = movies[movies['release_year']>1970]

movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew','reviews']]
movies.dropna(inplace=True)

movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['crew'] = movies['crew'].apply(lambda x: [" ".join(\
        w for w in nltk.wordpunct_tokenize(i) \
         if w.lower() in words or not w.isalpha()) for i in x])

In [3]:
storyline = pd.read_csv('../tmdb_5000_data/2022-12-04movie_with_imdb_storyline.csv', index_col=0) 
# storyline['storyline'] = storyline['storyline'].apply(lambda x: list(map(clean_text, x.split("\',"))))

movies = movies.merge(storyline[['movie_id','title','storyline']], how='left', left_on=['movie_id','title'], right_on = ['movie_id','title'])
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['storyline'] = movies['storyline'].apply(lambda x:str(x).split())

movies['release_year'] = movies['release_year'].astype(str).apply(lambda x:x.split())
# movies['tags'] = movies['overview'] + movies['genres'] + movies['release_year'] + movies['keywords']
movies['tags'] = movies['overview'] + pd.Series(np.repeat(' The genre is ', len(movies), axis =0), index = movies.index).apply(lambda x:x.split()) \
+ movies['genres'] + pd.Series(np.repeat('.', len(movies), axis =0), index = movies.index).apply(lambda x:x.split())  \
+ pd.Series(np.repeat(' The movie was released in ', len(movies), axis =0), index = movies.index).apply(lambda x:x.split()) + pd.Series(np.repeat('.', len(movies), axis =0), index = movies.index).apply(lambda x:x.split()) \
+ movies['release_year'] + pd.Series(np.repeat(' The charactistics of the movie are ', len(movies), axis =0), index = movies.index).apply(lambda x:x.split()) \
+ movies['keywords'] + pd.Series(np.repeat('.', len(movies), axis =0), index = movies.index).apply(lambda x:x.split())  + movies['storyline']

new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year','storyline'])


In [51]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new = new[(new['tags'].notnull())].reset_index(drop=True)
new = new[new["reviews"].str.len() != 0].reset_index(drop=True)

In [52]:
new = new[new["reviews"].str.len() > 1]

In [54]:
new = new.explode(['reviews'])
# new = new[(new['reviews']!='')].reset_index(drop=True)
# new.to_csv("../tmdb_5000_data/Cleaned_Filtered_Plots_Reviews")

In [55]:
new

Unnamed: 0,movie_id,title,reviews,tags
0,19995,Avatar,Cameron’s epic can still thrill the audience w...,"In the 22nd century, a paraplegic Marine is di..."
0,19995,Avatar,Avatar still elicits much of the same wide eye...,"In the 22nd century, a paraplegic Marine is di..."
0,19995,Avatar,The emotional stakes presented in the final ba...,"In the 22nd century, a paraplegic Marine is di..."
0,19995,Avatar,"Thirteen years after its release, Avatar still...","In the 22nd century, a paraplegic Marine is di..."
0,19995,Avatar,A meaningful blockbuster that fails to play ig...,"In the 22nd century, a paraplegic Marine is di..."
...,...,...,...,...
1512,126186,Shanghai Calling,Shanghai Calling doesnt aspire to fresh insigh...,When ambitious New York attorney Sam is sent t...
1512,126186,Shanghai Calling,If you prefer your social commentary in the fo...,When ambitious New York attorney Sam is sent t...
1512,126186,Shanghai Calling,Shanghai Calling eventually reveals itself to ...,When ambitious New York attorney Sam is sent t...
1512,126186,Shanghai Calling,"Through it all, Henney is an appealing screen ...",When ambitious New York attorney Sam is sent t...


In [56]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")
model = model.to(device)
def get_sentiment(text):
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt').to(device)

  output = model.generate(input_ids=input_ids,
               max_length=2)
  
  dec = [tokenizer.decode(ids) for ids in output]
  label = dec[0]
  return label



In [57]:
new['review_sentiment'] = new['reviews'].apply(lambda x: get_sentiment(x)[6:])

In [58]:
new['sentiment_score'] = new['review_sentiment'].map({'positive': 1, 'negative': 0})

In [68]:
new = new[(new['reviews'].str.len() > 1)].reset_index(drop=True)

In [70]:
new

Unnamed: 0,movie_id,title,reviews,tags,review_sentiment,sentiment_score
0,19995,Avatar,Cameron’s epic can still thrill the audience w...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
1,19995,Avatar,Avatar still elicits much of the same wide eye...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
2,19995,Avatar,The emotional stakes presented in the final ba...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
3,19995,Avatar,"Thirteen years after its release, Avatar still...","In the 22nd century, a paraplegic Marine is di...",positive,1.0
4,19995,Avatar,A meaningful blockbuster that fails to play ig...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
...,...,...,...,...,...,...
10947,126186,Shanghai Calling,Shanghai Calling doesnt aspire to fresh insigh...,When ambitious New York attorney Sam is sent t...,positive,1.0
10948,126186,Shanghai Calling,If you prefer your social commentary in the fo...,When ambitious New York attorney Sam is sent t...,negative,0.0
10949,126186,Shanghai Calling,Shanghai Calling eventually reveals itself to ...,When ambitious New York attorney Sam is sent t...,negative,0.0
10950,126186,Shanghai Calling,"Through it all, Henney is an appealing screen ...",When ambitious New York attorney Sam is sent t...,negative,0.0


In [71]:
#new = new.drop(['Unnamed: 0'], axis=1)

new['grouped_reviews'] = new.groupby(['movie_id','title','tags','review_sentiment','sentiment_score'])['reviews'].transform(lambda x: '.'.join(x)).drop_duplicates()

In [72]:
new = new.dropna().drop(['reviews'], axis=1)
new

Unnamed: 0,movie_id,title,tags,review_sentiment,sentiment_score,grouped_reviews
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",positive,1.0,Cameron’s epic can still thrill the audience w...
11,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",negative,0.0,Five hundred million dollars wasted ..The leve...
18,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,positive,1.0,It still might not be quite the conclusion we ...
23,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,negative,0.0,Even with a twisting mystery and numerous new ...
31,49529,John Carter,"John Carter is a war-weary, former military ca...",positive,1.0,John Carter is a good summer movie in March bu...
...,...,...,...,...,...,...
10928,2292,Clerks,Convenience and video store clerks Dante and R...,negative,0.0,...the films inherent deficiencies are general...
10930,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,positive,1.0,Time travel may provide the paradoxical mechan...
10931,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,negative,0.0,The storytelling is so confusing and the multi...
10943,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,negative,0.0,A star is born in Daniel Henney in the predict...


In [73]:
new.to_csv("../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv",index=False)

In [4]:
new = pd.read_csv('../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv') 

In [5]:
new

Unnamed: 0,movie_id,title,tags,review_sentiment,sentiment_score,grouped_reviews
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",positive,1.0,Cameron’s epic can still thrill the audience w...
1,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",negative,0.0,Five hundred million dollars wasted ..The leve...
2,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,positive,1.0,It still might not be quite the conclusion we ...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,negative,0.0,Even with a twisting mystery and numerous new ...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",positive,1.0,John Carter is a good summer movie in March bu...
...,...,...,...,...,...,...
1723,2292,Clerks,Convenience and video store clerks Dante and R...,negative,0.0,...the films inherent deficiencies are general...
1724,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,positive,1.0,Time travel may provide the paradoxical mechan...
1725,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,negative,0.0,The storytelling is so confusing and the multi...
1726,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,negative,0.0,A star is born in Daniel Henney in the predict...


In [32]:
# microsoft/mpnet-base
# sentence-transformers/stsb-bert-large
word_embedding_model = models.Transformer('sentence-transformers/all-mpnet-base-v2').to(device)

In [33]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 
                                  pooling_mode_mean_tokens=True, 
                                  pooling_mode_cls_token=False, 
                                  pooling_mode_max_tokens=False
                              )

In [34]:
# update the model
sent_model = SentenceTransformer(modules=[word_embedding_model, pooling_model]).to(device)

In [35]:
train_examples = []

for index, row in enumerate(zip(new['tags'], new['grouped_reviews'], new['sentiment_score'])):
#     train_examples.append(InputExample(texts=[row[0], row[1]], label = [scores[index]]))
    train_examples.append(InputExample(texts=[row[0], row[1]], label = row[2]))    

In [36]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=3)
# ContrastiveLoss
# MarginMSELoss
# OnlineContrastiveLoss
# TripletLoss
train_loss = losses.OnlineContrastiveLoss(sent_model)

In [37]:
# dev_mse = evaluation.EmbeddingSimilarityEvaluator(new['tags'].values, new['reviews'].values,\
#                                                   scores=scores)
dev_mse = evaluation.BinaryClassificationEvaluator(new['tags'].values, \
                          new['grouped_reviews'].values, \
                          new['sentiment_score'].values)

In [38]:
# dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values,teacher_model=sent_model)

In [39]:
sent_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, output_path='result')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/576 [00:00<?, ?it/s]

In [40]:
torch.save(sent_model.state_dict(), 'best_sent_model.pt')

In [None]:
checkpoint = torch.load('best_sent_model.pt', device)
sent_model.load_state_dict(checkpoint)
sent_model.to(device)

In [41]:
retrieve_df = new[['movie_id','title','tags']].drop_duplicates().reset_index(drop=True)

descriptions = retrieve_df['tags']
des_embeddings = []
for i,des in enumerate(descriptions):
    des_embeddings.append(sent_model.encode(des))

In [42]:
def recommend(query, model):
    #Compute cosine-similarities with all embeddings 
    model.eval()
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top10_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:10]
    return top10_matches

In [43]:
query_show_des = '3D Top notch education and entertainment for dinosaurs'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(retrieve_df.iloc[index,1])

Walking With Dinosaurs
The Good Dinosaur
Jurassic Park
Jurassic World
U2 3D
The Lego Movie
Sanctum
Ice Age
Khumba
The Secret of Kells


In [None]:
Walking With Dinosaurs
The Good Dinosaur
Jurassic Park
Jurassic World
Thunder and the House of Magic
Sanctum
Yogi Bear
The Secret of Kells
ParaNorman
Shrek Forever After

In [44]:
query_show_des = 'alien attack earth'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(retrieve_df.iloc[index,1])

The Darkest Hour
Pixels
Left Behind
Bowling for Columbine
Space Cowboys
They Live
District 9
Alien
Edge of Tomorrow
Survival of the Dead


In [None]:
The Darkest Hour
They Live
Taxi to the Dark Side
The Terminator
The Lazarus Effect
Riddick
Resident Evil
Meet the Spartans
30 Days of Night
Final Destination 5

In [45]:
query_show_des = 'superhero fight trans genetic monster'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(retrieve_df.iloc[index,1])

Monsters vs Aliens
Fantastic Four
Meet the Spartans
The Lego Movie
Clash of the Titans
Osmosis Jones
Boogeyman
Teenage Mutant Ninja Turtles
The Terminator
Minions


In [None]:
Clash of the Titans
Spider-Man
Hellboy
Blade II
Osmosis Jones
The Amazing Spider-Man
Prom Night
Watchmen
Boogeyman
American Hero

In [46]:
query_show_des = 'glory space fighting in the solar system'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(retrieve_df.iloc[index,1])

300
Space Cowboys
Red Cliff
Letters from Iwo Jima
Beneath Hill 60
Edge of Tomorrow
Pixels
Riddick
Apocalypse Now
The Darkest Hour


In [None]:
300
Red Cliff
Return of the Jedi
Letters from Iwo Jima
Beneath Hill 60
Riddick
Edge of Tomorrow
Apocalypse Now
Spaceballs
Space Cowboys

In [47]:
query_show_des = 'escape prison'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(retrieve_df.iloc[index,1])

Bronson
Taxi to the Dark Side
The Green Mile
Death Race
Escape Plan
Sympathy for Lady Vengeance
The Act of Killing
Man on Wire
Free State of Jones
Side Effects
