In [42]:
# Install into the environment of the running kernel: `%pip` targets the
# kernel's interpreter, whereas `!pip` runs whatever `pip` is first on PATH.
# %pip install -U sentence-transformers
%pip install gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [45]:
import ast
import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import evaluation
from sentence_transformers import models, losses, InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer
#from GPUtil import showUtilization as gpu_usage
#from numba import cuda

def free_gpu_cache():
    """Release cached GPU memory and print utilisation before and after.

    GPUtil and numba are optional: their imports are commented out at the top
    of the notebook, so they are imported here and skipped when unavailable
    instead of raising ``NameError``.

    Returns:
        None. Progress is reported on stdout.
    """
    try:
        from GPUtil import showUtilization as gpu_usage
    except ImportError:
        gpu_usage = None
    try:
        from numba import cuda
    except ImportError:
        cuda = None

    print("Initial GPU Usage")
    if gpu_usage is not None:
        gpu_usage()

    torch.cuda.empty_cache()

    # Resetting the CUDA context only makes sense when a CUDA device exists.
    if cuda is not None and torch.cuda.is_available():
        cuda.select_device(0)
        cuda.close()
        cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    if gpu_usage is not None:
        gpu_usage()
# Download the NLTK "words" corpus and keep its entries in a set so that the
# membership tests used later in the notebook are cheap.
nltk.download('words')
words = {*nltk.corpus.words.words()}

def stemSentence_porter(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is tokenised with ``word_tokenize``; each stem is followed by
    a single space, so a non-empty result ends with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

def convert(text):
    """Parse a Python-literal list of dicts and return each dict's 'name'.

    Args:
        text: String holding a literal such as "[{'id': 1, 'name': 'Action'}]".

    Returns:
        List of the 'name' values, in input order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


def fetch_director(text):
    """Return the names of all entries whose 'job' is exactly 'Director'.

    Args:
        text: String holding a Python-literal list of dicts, each with 'job'
            and 'name' keys.

    Returns:
        List of matching 'name' values, in input order.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]


def collapse(L):
    """Return a new list with every space character removed from each string."""
    return [item.replace(" ", "") for item in L]

def clean_text(text):
    """Normalise a review string without changing its letter case.

    Steps, in order:
      1. delete every apostrophe (');
      2. replace any character other than ASCII letters, digits, ',', '.'
         and the typographic apostrophe (’) with a space;
      3. collapse runs of whitespace and strip both ends.
    """
    without_apostrophes = re.sub("\'", "", text)
    allowed_only = re.sub("[^a-zA-Z0-9,.’]", " ", without_apostrophes)
    return ' '.join(allowed_only.split())

def get_cosine_sim(model, df):
    """Score each row of `df` by the cosine similarity of its two texts.

    Args:
        model: Object exposing ``encode(text)``.
        df: Frame with 'tags' and 'reviews' columns.

    Returns:
        List with one ``util.cos_sim`` result per row, in row order.
    """
    return [
        util.cos_sim(model.encode(tags), model.encode(reviews))
        for tags, reviews in zip(df['tags'], df['reviews'])
    ]

# Pick the compute device, preferring Apple MPS, then CUDA, then CPU.
# `torch.backends.mps` is looked up defensively because torch builds without
# that backend raise AttributeError on direct access.
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

[nltk_data] Downloading package words to
[nltk_data]     /Users/zijianchen/nltk_data...
[nltk_data]   Package words is already up-to-date!


device(type='mps')

In [None]:
# Load the TMDB movie and credit tables and join them.
# NOTE(review): the join key is 'title' only; if two films share a title the
# merge yields extra cross-matched rows — confirm whether an id-based key exists.
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv')
movies = movies.merge(credits,on='title')

# Crawled reviews: each raw string is split on the two-character sequence `',`
# and every piece is normalised with clean_text, giving a list per movie.
reviews = pd.read_csv('../crawled_data/2022-11-19_movie_info_with_reviews.csv')
reviews = reviews[['id','reviews']]
reviews['reviews'] = reviews['reviews'].apply(lambda x: list(map(clean_text, x.split("\',"))))
movies = movies.merge(reviews,on='id', how='left')

# dropna() without `subset` removes a row if ANY column is missing, which
# includes movies left without reviews by the left merge above.
# NOTE(review): this also drops rows for NaNs in columns unused later — confirm intended.
movies.dropna(inplace=True)
# Keep only films released after 1970 (year taken from the 'YYYY-...' prefix).
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)
movies = movies[movies['release_year']>1970]

movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew','reviews']]
movies.dropna(inplace=True)

# Turn the stringified lists of dicts into plain lists of names.
movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

# Keep the first three cast names only.
movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

# Reduce the crew column to director names.
movies['crew'] = movies['crew'].apply(fetch_director)


# Within each director name keep only tokens that are in the `words` set
# (lower-cased) or that are not purely alphabetic.
movies['crew'] = movies['crew'].apply(lambda x: [" ".join(\
        w for w in nltk.wordpunct_tokenize(i) \
         if w.lower() in words or not w.isalpha()) for i in x])

In [None]:
storyline = pd.read_csv('../tmdb_5000_data/2022-12-04movie_with_imdb_storyline.csv', index_col=0)

# Attach the storyline to each movie; rows without a match get NaN (left join).
movies = movies.merge(storyline[['movie_id','title','storyline']], how='left', left_on=['movie_id','title'], right_on = ['movie_id','title'])

# Turn every free-text component into a list of tokens so they can be concatenated.
movies['overview'] = movies['overview'].apply(lambda x: x.split())
# A missing storyline contributes no tokens (previously it became the literal token "nan").
movies['storyline'] = movies['storyline'].apply(lambda x: [] if pd.isna(x) else str(x).split())
movies['release_year'] = movies['release_year'].astype(str).apply(lambda x: x.split())


def _build_tags(overview, genres, release_year, keywords, storyline_tokens):
    """Concatenate the token lists of one movie into a single tag token list.

    Layout: overview, "The genre is <genres> .", "The movie was released in
    <year> .", "The characteristics of the movie are <keywords> .", storyline.
    The full stop now follows the release year instead of preceding it.
    """
    return (
        overview
        + ['The', 'genre', 'is'] + genres + ['.']
        + ['The', 'movie', 'was', 'released', 'in'] + release_year + ['.']
        + ['The', 'characteristics', 'of', 'the', 'movie', 'are'] + keywords + ['.']
        + storyline_tokens
    )


movies['tags'] = pd.Series(
    [
        _build_tags(*parts)
        for parts in zip(
            movies['overview'],
            movies['genres'],
            movies['release_year'],
            movies['keywords'],
            movies['storyline'],
        )
    ],
    index=movies.index,
    dtype=object,
)

# Only the identifying columns, the tags and the reviews are needed downstream.
new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year','storyline'])


In [None]:
# Flatten each token list into one space-separated string.
new['tags'] = new['tags'].map(" ".join)

# Keep rows that have tags, then rows whose review list is not empty.
has_tags = new['tags'].notnull()
new = new[has_tags].reset_index(drop=True)
has_reviews = new["reviews"].str.len() != 0
new = new[has_reviews].reset_index(drop=True)

In [None]:
# Keep only movies whose 'reviews' entry has length greater than one.
new = new.loc[new["reviews"].str.len() > 1]

In [None]:
# One row per individual review; the other columns are repeated and the
# original index labels are kept (so they are no longer unique).
new = new.explode('reviews')

In [None]:
# Display the frame after exploding reviews (one row per review).
new

In [None]:
# T5 checkpoint fine-tuned for IMDB sentiment. T5 is an encoder-decoder model,
# so it is loaded through AutoModelForSeq2SeqLM; AutoModelWithLMHead is
# deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")
model = model.to(device)
def get_sentiment(text):
    """Generate the sentiment label text for one review.

    Uses the module-level `tokenizer`, `model` and `device`. The input is
    `text` with '</s>' appended; generation is capped at ``max_length=2``.

    Returns:
        The decoded string of the first generated sequence, special tokens
        included (the calling cell slices off its first six characters).
    """
    encoded = tokenizer.encode(text + '</s>', return_tensors='pt').to(device)
    generated = model.generate(input_ids=encoded, max_length=2)
    decoded = [tokenizer.decode(sequence) for sequence in generated]
    return decoded[0]

In [None]:
# Label every review; the first six characters of the decoded output are
# dropped (presumably a leading special-token prefix — confirm against the model output).
new['review_sentiment'] = [get_sentiment(review)[6:] for review in new['reviews']]

In [None]:
# Numeric label: 1 for 'positive', 0 for 'negative'; any other text maps to NaN.
sentiment_to_score = {'positive': 1, 'negative': 0}
new['sentiment_score'] = new['review_sentiment'].map(sentiment_to_score)

In [None]:
# Drop reviews of length one or less and renumber the rows from zero.
long_enough = new['reviews'].str.len() > 1
new = new[long_enough].reset_index(drop=True)

In [None]:
# Display the frame after sentiment labelling and filtering.
new

In [None]:
# Columns that identify one (movie, sentiment) group.
group_keys = ['movie_id', 'title', 'tags', 'review_sentiment', 'sentiment_score']

# Join all reviews of a group into one '.'-separated string, broadcast to every row of the group.
joined_reviews = new.groupby(group_keys)['reviews'].transform(lambda x: '.'.join(x))

# Keep the joined text on the first row of each group only, NaN elsewhere, so
# a later dropna() leaves one row per group. Deduplicating on the group keys
# (rather than on the joined text) keeps groups whose text happens to coincide.
first_row_of_group = ~new.duplicated(subset=group_keys)
new['grouped_reviews'] = joined_reviews.where(first_row_of_group)

In [None]:
# Rows with any missing value are removed, then the per-review text column
# is dropped in favour of 'grouped_reviews'.
new = new.dropna().drop(columns=['reviews'])
new

In [None]:
# Persist the grouped frame (without the index) so later cells can reload it.
new.to_csv(
    path_or_buf="../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv",
    index=False,
)

In [19]:
# Reload the grouped reviews written by the previous cell.
new = pd.read_csv(filepath_or_buffer='../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv')

In [20]:
# Display the frame reloaded from the grouped-reviews CSV.
new

Unnamed: 0,movie_id,title,tags,review_sentiment,sentiment_score,grouped_reviews
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",positive,1.0,Cameron’s epic can still thrill the audience w...
1,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",negative,0.0,Five hundred million dollars wasted ..The leve...
2,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,positive,1.0,It still might not be quite the conclusion we ...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,negative,0.0,Even with a twisting mystery and numerous new ...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",positive,1.0,John Carter is a good summer movie in March bu...
...,...,...,...,...,...,...
1723,2292,Clerks,Convenience and video store clerks Dante and R...,negative,0.0,...the films inherent deficiencies are general...
1724,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,positive,1.0,Time travel may provide the paradoxical mechan...
1725,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,negative,0.0,The storytelling is so confusing and the multi...
1726,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,negative,0.0,A star is born in Daniel Henney in the predict...


In [21]:
# microsoft/mpnet-base
# sentence-transformers/stsb-bert-large
# Transformer backbone for the sentence encoder, moved to the selected device.
word_embedding_model = models.Transformer('sentence-transformers/all-mpnet-base-v2')
word_embedding_model = word_embedding_model.to(device)

In [22]:
# Mean pooling over token embeddings; CLS and max pooling are switched off.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [23]:
# Assemble the sentence encoder from backbone + pooling and move it to the device.
sent_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
sent_model = sent_model.to(device)

In [24]:
# One training pair per row: (tags, grouped reviews) labelled with the sentiment score.
train_examples = [
    InputExample(texts=[tags, grouped_reviews], label=score)
    for tags, grouped_reviews, score in zip(
        new['tags'], new['grouped_reviews'], new['sentiment_score']
    )
]

In [25]:
# Loss for the fine-tuning run (alternatives tried earlier: ContrastiveLoss,
# MarginMSELoss, TripletLoss).
train_loss = losses.OnlineContrastiveLoss(model=sent_model)

# Shuffled mini-batches of three pairs.
train_dataloader = DataLoader(train_examples, batch_size=3, shuffle=True)

In [26]:
# Binary evaluator over the same (tags, grouped reviews, score) triples.
# NOTE(review): in the visible cells it is not passed to `fit` — confirm whether it should be.
dev_mse = evaluation.BinaryClassificationEvaluator(
    sentences1=new['tags'].values,
    sentences2=new['grouped_reviews'].values,
    labels=new['sentiment_score'].values,
)

In [27]:
# dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values,teacher_model=sent_model)

In [28]:
# Fine-tune for a single epoch; the trained model is written to ./result.
sent_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    output_path='result',
)

Epoch:   0%|                                              | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                        | 0/576 [00:00<?, ?it/s][A
Iteration:   0%|                                | 1/576 [00:03<28:49,  3.01s/it][A
Iteration:   0%|                                | 2/576 [00:05<26:24,  2.76s/it][A
Iteration:   1%|▏                               | 3/576 [00:08<26:55,  2.82s/it][A
Iteration:   1%|▏                               | 4/576 [00:12<30:43,  3.22s/it][A
Iteration:   1%|▎                               | 5/576 [00:16<32:06,  3.37s/it][A
Iteration:   1%|▎                               | 6/576 [00:20<31:57,  3.36s/it][A
Iteration:   1%|▍                               | 7/576 [00:22<30:33,  3.22s/it][A
Iteration:   1%|▍                               | 8/576 [00:26<31:37,  3.34s/it][A
Iteration:   2%|▌                               | 9/576 [00:31<33:11,  3.51s/it][A
Iteration:   2%|▌                              | 10/576 [00:35<33:34,  3.56s/it

Iteration:  17%|█████▏                         | 96/576 [05:37<26:24,  3.30s/it][A
Iteration:  17%|█████▏                         | 97/576 [05:40<26:17,  3.29s/it][A
Iteration:  17%|█████▎                         | 98/576 [05:44<26:29,  3.33s/it][A
Iteration:  17%|█████▎                         | 99/576 [05:48<26:45,  3.37s/it][A
Iteration:  17%|█████▏                        | 100/576 [05:51<26:42,  3.37s/it][A
Iteration:  18%|█████▎                        | 101/576 [05:55<26:34,  3.36s/it][A
Iteration:  18%|█████▎                        | 102/576 [05:57<26:19,  3.33s/it][A
Iteration:  18%|█████▎                        | 103/576 [06:01<26:32,  3.37s/it][A
Iteration:  18%|█████▍                        | 104/576 [06:05<26:40,  3.39s/it][A
Iteration:  18%|█████▍                        | 105/576 [06:09<26:45,  3.41s/it][A
Iteration:  18%|█████▌                        | 106/576 [06:13<26:49,  3.42s/it][A
Iteration:  19%|█████▌                        | 107/576 [06:14<26:06,  3.34s

Iteration:  34%|██████████                    | 193/576 [10:45<21:21,  3.35s/it][A
Iteration:  34%|██████████                    | 194/576 [10:47<20:47,  3.27s/it][A
Iteration:  34%|██████████▏                   | 195/576 [10:52<21:21,  3.36s/it][A
Iteration:  34%|██████████▏                   | 196/576 [10:55<21:15,  3.36s/it][A
Iteration:  34%|██████████▎                   | 197/576 [10:59<21:07,  3.34s/it][A
Iteration:  34%|██████████▎                   | 198/576 [11:03<21:15,  3.37s/it][A
Iteration:  35%|██████████▎                   | 199/576 [11:06<21:04,  3.35s/it][A
Iteration:  35%|██████████▍                   | 200/576 [11:09<21:07,  3.37s/it][A
Iteration:  35%|██████████▍                   | 201/576 [11:13<21:11,  3.39s/it][A
Iteration:  35%|██████████▌                   | 202/576 [11:15<20:46,  3.33s/it][A
Iteration:  35%|██████████▌                   | 203/576 [11:19<21:00,  3.38s/it][A
Iteration:  35%|██████████▋                   | 204/576 [11:22<20:43,  3.34s

Iteration:  50%|███████████████               | 290/576 [16:13<16:38,  3.49s/it][A
Iteration:  51%|███████████████▏              | 291/576 [16:16<16:32,  3.48s/it][A
Iteration:  51%|███████████████▏              | 292/576 [16:19<16:13,  3.43s/it][A
Iteration:  51%|███████████████▎              | 293/576 [16:22<16:11,  3.43s/it][A
Iteration:  51%|███████████████▎              | 294/576 [16:28<16:44,  3.56s/it][A
Iteration:  51%|███████████████▎              | 295/576 [16:31<16:29,  3.52s/it][A
Iteration:  51%|███████████████▍              | 296/576 [16:34<16:21,  3.51s/it][A
Iteration:  52%|███████████████▍              | 297/576 [16:37<16:01,  3.45s/it][A
Iteration:  52%|███████████████▌              | 298/576 [16:40<16:02,  3.46s/it][A
Iteration:  52%|███████████████▌              | 299/576 [16:42<15:32,  3.37s/it][A
Iteration:  52%|███████████████▋              | 300/576 [16:45<15:22,  3.34s/it][A
Iteration:  52%|███████████████▋              | 301/576 [16:50<15:48,  3.45s

Iteration:  67%|████████████████████▏         | 387/576 [21:18<10:16,  3.26s/it][A
Iteration:  67%|████████████████████▏         | 388/576 [21:22<10:24,  3.32s/it][A
Iteration:  68%|████████████████████▎         | 389/576 [21:28<10:44,  3.45s/it][A
Iteration:  68%|████████████████████▎         | 390/576 [21:32<10:46,  3.48s/it][A
Iteration:  68%|████████████████████▎         | 391/576 [21:35<10:38,  3.45s/it][A
Iteration:  68%|████████████████████▍         | 392/576 [21:39<10:38,  3.47s/it][A
Iteration:  68%|████████████████████▍         | 393/576 [21:42<10:31,  3.45s/it][A
Iteration:  68%|████████████████████▌         | 394/576 [21:44<10:19,  3.40s/it][A
Iteration:  69%|████████████████████▌         | 395/576 [21:46<10:03,  3.33s/it][A
Iteration:  69%|████████████████████▋         | 396/576 [21:50<10:06,  3.37s/it][A
Iteration:  69%|████████████████████▋         | 397/576 [21:53<09:54,  3.32s/it][A
Iteration:  69%|████████████████████▋         | 398/576 [21:57<09:57,  3.36s

Iteration:  84%|█████████████████████████▏    | 484/576 [26:25<04:44,  3.09s/it][A
Iteration:  84%|█████████████████████████▎    | 485/576 [26:29<04:44,  3.12s/it][A
Iteration:  84%|█████████████████████████▎    | 486/576 [26:33<04:42,  3.14s/it][A
Iteration:  85%|█████████████████████████▎    | 487/576 [26:35<04:34,  3.08s/it][A
Iteration:  85%|█████████████████████████▍    | 488/576 [26:38<04:32,  3.10s/it][A
Iteration:  85%|█████████████████████████▍    | 489/576 [26:41<04:31,  3.12s/it][A
Iteration:  85%|█████████████████████████▌    | 490/576 [26:43<04:22,  3.05s/it][A
Iteration:  85%|█████████████████████████▌    | 491/576 [26:46<04:18,  3.04s/it][A
Iteration:  85%|█████████████████████████▋    | 492/576 [26:48<04:10,  2.98s/it][A
Iteration:  86%|█████████████████████████▋    | 493/576 [26:52<04:10,  3.02s/it][A
Iteration:  86%|█████████████████████████▋    | 494/576 [26:55<04:09,  3.04s/it][A
Iteration:  86%|█████████████████████████▊    | 495/576 [26:58<04:05,  3.03s

In [29]:
# Save the fine-tuned weights as a plain state dict.
model_weights = sent_model.state_dict()
torch.save(model_weights, 'best_sent_model.pt')

In [30]:
# Restore the saved weights, mapping tensors onto the selected device.
checkpoint = torch.load('best_sent_model.pt', map_location=device)
sent_model.load_state_dict(checkpoint)
sent_model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [31]:
# One row per movie: the grouped frame repeats each movie once per sentiment.
retrieve_df = new[['movie_id','title','tags']].drop_duplicates().reset_index(drop=True)

descriptions = retrieve_df['tags']
# Encode all descriptions in one batched call. The result is a single 2-D
# array whose row i belongs to retrieve_df row i, instead of a Python list of
# per-movie arrays that is slow to turn into a tensor on every query.
des_embeddings = sent_model.encode(descriptions.tolist())

In [32]:
def recommend(query, model, top_k=10):
    """Rank the stored movie descriptions by cosine similarity to `query`.

    Args:
        query: Free-text description to search for.
        model: Sentence encoder exposing ``eval()`` and ``encode(text)``.
        top_k: Maximum number of matches to return (default 10, the value
            that used to be hard-coded).

    Returns:
        List of integer positions into the module-level `des_embeddings`
        (and therefore rows of `retrieve_df`), best match first. Shorter than
        `top_k` when fewer embeddings exist.

    Raises:
        ValueError: If `top_k` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    model.eval()
    query_embedd = model.encode(query)
    # Shape (1, n_movies): similarity of the query to every description.
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    ranked = torch.argsort(cosine_scores, dim=-1, descending=True)
    return ranked[0, :top_k].tolist()

In [33]:
# Example query: print the title of every recommended movie, best match first.
query_show_des = '3D Top notch education and entertainment for dinosaurs'
recommendded_results = recommend(query_show_des, sent_model)

for row_position in recommendded_results:
    print(retrieve_df.iat[row_position, 1])

Walking With Dinosaurs
Jurassic Park
The Good Dinosaur
Jurassic World
Yogi Bear
Thunder and the House of Magic
How to Train Your Dragon
The Secret of Kells
Finding Nemo
How to Train Your Dragon 2


  b = torch.tensor(b)


In [34]:
Walking With Dinosaurs
The Good Dinosaur
Jurassic Park
Jurassic World
Thunder and the House of Magic
Sanctum
Yogi Bear
The Secret of Kells
ParaNorman
Shrek Forever After

SyntaxError: invalid syntax (1182986673.py, line 1)

In [None]:
# Example query: print the title of every recommended movie, best match first.
query_show_des = 'alien attack earth'
recommendded_results = recommend(query_show_des, sent_model)

for row_position in recommendded_results:
    print(retrieve_df.iat[row_position, 1])

In [None]:
The Darkest Hour
They Live
Taxi to the Dark Side
The Terminator
The Lazarus Effect
Riddick
Resident Evil
Meet the Spartans
30 Days of Night
Final Destination 5

In [None]:
# Example query: print the title of every recommended movie, best match first.
query_show_des = 'superhero fight trans genetic monster'
recommendded_results = recommend(query_show_des, sent_model)

for row_position in recommendded_results:
    print(retrieve_df.iat[row_position, 1])

In [None]:
Clash of the Titans
Spider-Man
Hellboy
Blade II
Osmosis Jones
The Amazing Spider-Man
Prom Night
Watchmen
Boogeyman
American Hero

In [None]:
# Example query: print the title of every recommended movie, best match first.
query_show_des = 'glory space fighting in the solar system'
recommendded_results = recommend(query_show_des, sent_model)

for row_position in recommendded_results:
    print(retrieve_df.iat[row_position, 1])

In [None]:
300
Red Cliff
Return of the Jedi
Letters from Iwo Jima
Beneath Hill 60
Riddick
Edge of Tomorrow
Apocalypse Now
Spaceballs
Space Cowboys

In [35]:
# Example query: print the title of every recommended movie, best match first.
query_show_des = 'escape prison'
recommendded_results = recommend(query_show_des, sent_model)

for row_position in recommendded_results:
    print(retrieve_df.iat[row_position, 1])

Bronson
Taxi to the Dark Side
Escape Plan
Man on a Ledge
The Green Mile
American History X
Man on Wire
Sympathy for Lady Vengeance
Death Race
Side Effects


In [60]:
# Display the per-movie lookup table used to turn match positions into titles.
retrieve_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
2,49529,John Carter,"John Carter is a war-weary, former military ca..."
3,38757,Tangled,When the kingdom's most wanted-and most charmi...
4,767,Harry Potter and the Half-Blood Prince,"As Harry begins his sixth year at Hogwarts, he..."
...,...,...,...
905,158895,This Is Martin Bonner,"Two men, at opposite ends of the social spectr..."
906,42151,Down Terrace,After serving jail time for a mysterious crime...
907,2292,Clerks,Convenience and video store clerks Dante and R...
908,14337,Primer,Friends/fledgling entrepreneurs invent a devic...


In [71]:
def greet(sentence, justTitle):
    """Gradio callback: recommend movies for a free-text query.

    Args:
        sentence: Query text entered by the user.
        justTitle: When truthy, return titles only; otherwise each entry is
            "<title>: <tags>".

    Returns:
        List of strings, best match first.
    """
    matches = recommend(sentence, sent_model)
    if justTitle:
        return [retrieve_df.iloc[position, 1] for position in matches]
    return [
        retrieve_df.iloc[position, 1] + ': ' + retrieve_df.iloc[position, 2]
        for position in matches
    ]

def _format_recommendations(sentence, justTitle):
    """Render greet()'s list as one recommendation per line.

    A text component shows a list as its Python repr, so the entries are
    joined with newlines before being handed to the UI.
    """
    return "\n".join(greet(sentence, justTitle))


# Two-column demo: query + options on the left, recommendations on the right.
with gr.Blocks() as demo:
    gr.Markdown("Type below and then click **Run** to see the output.")
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(placeholder="Describe a scene/plot you would like to watch",label = "Query")
            justTitle = gr.Checkbox(label="Return movie titles only?")
        with gr.Column():
            opt = gr.TextArea(placeholder="The movies you might like will be displayed here", label ="Recommendations:")
        btn = gr.Button("Run")
        btn.click(fn=_format_recommendations, inputs=[inp,justTitle], outputs=[opt])


# debug=True keeps the cell running and prints server errors until interrupted.
demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


