In [161]:
# !pip install -U sentence-transformers

In [47]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import evaluation
from sentence_transformers.cross_encoder import CrossEncoder
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
import nltk
import re
nltk.download('words')
words = set(nltk.corpus.words.words())

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z0-9,.’]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    
    return text

def get_cosine_sim(model, df):
    scores = []
    for row in zip(df['tags'], df['reviews']):
        scores.append(util.cos_sim(model.encode(row[0]), model.encode(row[1])))
    return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\czhao\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [76]:
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv') 
movies = movies.merge(credits,on='title')

reviews = pd.read_csv('../crawled_data/2022-11-19_movie_info_with_reviews.csv')
reviews = reviews[['id','reviews']]
reviews['reviews'] = reviews['reviews'].apply(lambda x: list(map(clean_text, x.split("\',"))))
movies = movies.merge(reviews,on='id', how='left')

movies.dropna(inplace=True)
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)
movies = movies[movies['release_year']>1970]

movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew','reviews']]
movies.dropna(inplace=True)

movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['crew'] = movies['crew'].apply(lambda x: [" ".join(\
        w for w in nltk.wordpunct_tokenize(i) \
         if w.lower() in words or not w.isalpha()) for i in x])

# movies['cast'] = movies['cast'].apply(collapse)
# movies['crew'] = movies['crew'].apply(collapse)
# movies['genres'] = movies['genres'].apply(collapse)
# movies['keywords'] = movies['keywords'].apply(collapse)


# new['tags'] = new['tags'].apply(stemSentence_porter)


In [77]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['release_year'] = movies['release_year'].astype(str).apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['release_year'] + movies['keywords'] + movies['cast'] + movies['crew'] 
new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new = new[(new['tags'].notnull())].reset_index(drop=True)
new = new[new["reviews"].str.len() != 0].reset_index(drop=True)

In [78]:
new = new.explode(['reviews'])
new = new[new['reviews']!=''].reset_index(drop=True)
# new.to_csv("../tmdb_5000_data/Cleaned_Filtered_Plots_Reviews")

In [80]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

def get_sentiment(text):
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  output = model.generate(input_ids=input_ids,
               max_length=2)
  
  dec = [tokenizer.decode(ids) for ids in output]
  label = dec[0]
  return label
  
a = get_sentiment("I like a lot that film")



In [81]:
new['review_sentiment'] = new['reviews'].apply(lambda x: get_sentiment(x)[6:])

In [96]:
new['sentiment_score'] = new['review_sentiment'].map({'positive': 1, 'negative': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['sentiment_score'] = new['review_sentiment'].map({'positive': 1, 'negative': 0})


In [97]:
new

Unnamed: 0,movie_id,title,reviews,tags,review_sentiment,sentiment_score
0,19995,Avatar,Cameron’s epic can still thrill the audience w...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
0,19995,Avatar,Avatar still elicits much of the same wide eye...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
0,19995,Avatar,The emotional stakes presented in the final ba...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
0,19995,Avatar,"Thirteen years after its release, Avatar still...","In the 22nd century, a paraplegic Marine is di...",positive,1.0
0,19995,Avatar,A meaningful blockbuster that fails to play ig...,"In the 22nd century, a paraplegic Marine is di...",positive,1.0
...,...,...,...,...,...,...
1482,126186,Shanghai Calling,Shanghai Calling doesnt aspire to fresh insigh...,When ambitious New York attorney Sam is sent t...,positive,1.0
1482,126186,Shanghai Calling,If you prefer your social commentary in the fo...,When ambitious New York attorney Sam is sent t...,negative,0.0
1482,126186,Shanghai Calling,Shanghai Calling eventually reveals itself to ...,When ambitious New York attorney Sam is sent t...,negative,0.0
1482,126186,Shanghai Calling,"Through it all, Henney is an appealing screen ...",When ambitious New York attorney Sam is sent t...,negative,0.0


In [98]:
new.to_csv("../tmdb_5000_data/tmdb_sentiment_reviews.csv")

In [9]:
# model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('all-mpnet-base-v2')
# model = SentenceTransformer('stsb-bert-large')

In [10]:
# def train(model, df, batch_size=128, learning_rate=0.1, epochs=30):

#     model.to(device)
#     model.train()
#     criterion = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     plots = df['tags'].values
    
#     reviews = df['reviews'].values

#     for epoch in range(epochs):
#         for i in range(0, len(plots), batch_size):

#             batch_plots = plots[i:i+batch_size]
#             batch_reviews = reviews[i:i+batch_size]
            
#             embedding_plots = torch.tensor(model.encode(batch_plots), requires_grad=True).unsqueeze(1).to(device)
#             embedding_reviews = torch.tensor(model.encode(batch_reviews), requires_grad=True).unsqueeze(1).to(device)
            
#             loss = criterion(embedding_plots, embedding_reviews)
#             print(loss)
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()

In [11]:
# train(model, new)

In [12]:
# descriptions = new['tags'].tolist()
# des_embeddings = []
# for i,des in enumerate(descriptions):
#     des_embeddings.append(model.encode(des))

In [13]:
# def recommend(query, model):
#     #Compute cosine-similarities with all embeddings 
#     model.eval()
#     query_embedd = model.encode(query)
#     cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
#     top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:5]
#     return top5_matches

In [14]:
# query_show_des = 'moon Pandora space colony society Sam 3d'
# recommendded_results = recommend(query_show_des, model)

# for index in recommendded_results:
#     print(new.iloc[index,:])

In [91]:
from sentence_transformers import models, losses, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import evaluation

In [92]:
# microsoft/mpnet-base
# sentence-transformers/stsb-bert-large
word_embedding_model = models.Transformer('microsoft/mpnet-base').to(device)

Some weights of the model checkpoint at microsoft/mpnet-base were not used when initializing MPNetModel: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing MPNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.weight', 'mpnet.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [93]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 
                                  pooling_mode_mean_tokens=True, 
                                  pooling_mode_cls_token=False, 
                                  pooling_mode_max_tokens=False
                              )

In [94]:
# update the model
sent_model = SentenceTransformer(modules=[word_embedding_model, pooling_model]).to(device)

In [19]:
# model = CrossEncoder('cross-encoder/stsb-roberta-large')

In [20]:
# scores = model.predict(new[['tags','reviews']].values)

In [21]:
# scores.shape

In [22]:
train_examples = []

for index, row in enumerate(zip(new['tags'], new['reviews'])):
#     train_examples.append(InputExample(texts=[row[0], row[1]], label = [scores[index]]))
    train_examples.append(InputExample(texts=[row[0], row[1]]))    

In [23]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
# MultipleNegativesRankingLoss not trainable because gpu limitation
# MegaBatchMarginLoss 
# MSELoss need labels
train_loss = losses.MegaBatchMarginLoss(sent_model)

In [24]:
# dev_mse = evaluation.EmbeddingSimilarityEvaluator(new['tags'].values, new['reviews'].values,\
#                                                   scores=scores)
dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values, teacher_model=sent_model)

In [25]:
# dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values,teacher_model=sent_model)

In [26]:
sent_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3, warmup_steps=100, evaluator=dev_mse, output_path='result')

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/474 [00:00<?, ?it/s]

Iteration:   0%|          | 0/474 [00:00<?, ?it/s]

Iteration:   0%|          | 0/474 [00:00<?, ?it/s]

In [27]:
# scores_after_training = get_cosine_sim(sent_model, new)

In [28]:
# sum(scores_before_training)

In [29]:
# sum(scores_after_training)

In [30]:
descriptions = new['tags'].tolist()
des_embeddings = []
for i,des in enumerate(descriptions):
    des_embeddings.append(sent_model.encode(des))

In [31]:
def recommend(query, model):
    #Compute cosine-similarities with all embeddings 
    model.eval()
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:5]
    return top5_matches

In [32]:
query_show_des = '3D Top notch education and entertainment for dinosaurs'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(new.iloc[index,:])

movie_id                                                77951
title                                  Walking With Dinosaurs
reviews     Top notch education and entertainment for dino...
tags        Walking with Dinosaurs 3D is a film depicting ...
Name: 170, dtype: object
movie_id                                                15512
title                                      Monsters vs Aliens
reviews     Wonderfully inventive characters and highly po...
tags        When Susan Murphy is unwittingly clobbered by ...
Name: 28, dtype: object
movie_id                                                 9297
title                                           Monster House
reviews     here the setting is unambiguously autumnal, in...
tags        Monsters under the bed are scary enough, but w...
Name: 183, dtype: object
movie_id                                                50321
title                                         Mars Needs Moms
reviews     A decent flick that could have been so much mo

  b = torch.tensor(b)


In [45]:
query_show_des = 'Its Godzilla versus the paper pushers'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(new.iloc[index,:])

movie_id                                                 7278
title                                       Meet the Spartans
reviews     The worlds worst parody movie. , This is one o...
tags        From the creators of Scary Movie and Date Movi...
Name: 712, dtype: object
movie_id                                                13805
title                                          Disaster Movie
reviews     Nothing more than a 90 minute barrage of unfun...
tags        In DISASTER MOVIE, the filmmaking team behind ...
Name: 886, dtype: object
movie_id                                                82690
title                                          Wreck-It Ralph
reviews     Phil Johnston and Jennifer Lee’s script finds ...
tags        Wreck-It Ralph is the 9-foot-tall, 643-pound v...
Name: 72, dtype: object
movie_id                                                93456
title                                         Despicable Me 2
reviews     Nothing is surprising about Despicable Me 2, b

In [15]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [18]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 97% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  2% |  5% |


In [38]:
device

device(type='cuda')