In [45]:
# !pip install -U sentence-transformers

In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import evaluation
from sentence_transformers.cross_encoder import CrossEncoder
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
import nltk
import re
nltk.download('words')
words = set(nltk.corpus.words.words())

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z0-9,.’]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    
    return text

def get_cosine_sim(model, df):
    scores = []
    for row in zip(df['tags'], df['reviews']):
        scores.append(util.cos_sim(model.encode(row[0]), model.encode(row[1])))
    return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\czhao\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv') 
movies = movies.merge(credits,on='title')

reviews = pd.read_csv('../crawled_data/2022-11-19_movie_info_with_reviews.csv')
reviews = reviews[['id','reviews']]
reviews['reviews'] = reviews['reviews'].apply(lambda x: list(map(clean_text, x.split("\',"))))
movies = movies.merge(reviews,on='id', how='left')

movies.dropna(inplace=True)
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)
movies = movies[movies['release_year']>1970]

movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew','reviews']]
movies.dropna(inplace=True)

movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['crew'] = movies['crew'].apply(lambda x: [" ".join(\
        w for w in nltk.wordpunct_tokenize(i) \
         if w.lower() in words or not w.isalpha()) for i in x])

# movies['cast'] = movies['cast'].apply(collapse)
# movies['crew'] = movies['crew'].apply(collapse)
# movies['genres'] = movies['genres'].apply(collapse)
# movies['keywords'] = movies['keywords'].apply(collapse)


# new['tags'] = new['tags'].apply(stemSentence_porter)


In [3]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['release_year'] = movies['release_year'].astype(str).apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['release_year'] + movies['keywords'] + movies['cast'] + movies['crew'] 
new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new = new[(new['tags'].notnull())].reset_index(drop=True)
new = new[new["reviews"].str.len() != 0].reset_index(drop=True)

In [49]:
new = new.explode(['reviews'])
new = new[(new['reviews']!='') & (new['reviews'].str.len() >= 3)].reset_index(drop=True)
# new.to_csv("../tmdb_5000_data/Cleaned_Filtered_Plots_Reviews")

In [50]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

def get_sentiment(text):
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  output = model.generate(input_ids=input_ids,
               max_length=2)
  
  dec = [tokenizer.decode(ids) for ids in output]
  label = dec[0]
  return label



In [None]:
new['review_sentiment'] = new['reviews'].apply(lambda x: get_sentiment(x)[6:])

In [None]:
new['sentiment_score'] = new['review_sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
new.sentiment_score.unique()

array([1., 0.])

In [None]:
new = new[(new['reviews']!='') & (new['reviews'].str.len() >= 3)].reset_index(drop=True)

In [None]:
new.to_csv("../tmdb_5000_data/tmdb_sentiment_reviews.csv")

In [2]:
new = pd.read_csv('../tmdb_5000_data/tmdb_sentiment_reviews.csv') 

In [3]:
def concat_sentence(df):
    
    movie_id = df['movie_id'].iloc[0]

    reviews = ' '.join(df['reviews'])
    
    tags = df['tags'].iloc[0]
    
    review_sentiment = df['review_sentiment'].iloc[0]
    
    return movie_id, reviews, tags, review_sentiment

In [4]:
new = new.groupby(['title','sentiment_score']).apply(concat_sentence).reset_index()

In [5]:
new[['movie_id', 'reviews', 'tags', 'review_sentiment']] = pd.DataFrame(new[0].tolist(), index=new.index)

In [6]:
new

Unnamed: 0,title,sentiment_score,0,movie_id,reviews,tags,review_sentiment
0,10 Cloverfield Lane,1,"(333371, 10 Cloverfield Lane ratchets down on ...",333371,10 Cloverfield Lane ratchets down on the psych...,"After a car accident, Michelle awakens to find...",positive
1,12 Rounds,0,"(17134, Awfully forgettable, but Harlin makes ...",17134,"Awfully forgettable, but Harlin makes things g...",When New Orleans cop Danny Fisher prevents a b...,negative
2,12 Rounds,1,"(17134, 12 Rounds has a real sense of urgency ...",17134,12 Rounds has a real sense of urgency even if ...,When New Orleans cop Danny Fisher prevents a b...,positive
3,1408,0,"(3021, Una buena idea y unos 20 minutos inicia...",3021,Una buena idea y unos 20 minutos iniciales inq...,A man who specializes in debunking paranormal ...,negative
4,1408,1,"(3021, Directed in a style that suggests The E...",3021,Directed in a style that suggests The Evil Dea...,A man who specializes in debunking paranormal ...,positive
...,...,...,...,...,...,...,...
1755,Zookeeper,1,"(38317, Doesnt belabor its punch lines and it ...",38317,Doesnt belabor its punch lines and it maximize...,A comedy about a zookeeper who might be great ...,positive
1756,Zoolander 2,0,"(329833, The best thing about Zoolander 2 is t...",329833,The best thing about Zoolander 2 is that it is...,Derek and Hansel are modelling again when an o...,negative
1757,Zoolander 2,1,"(329833, The audience leaves the theater think...",329833,"The audience leaves the theater thinking, It t...",Derek and Hansel are modelling again when an o...,positive
1758,xXx,0,"(7451, For every action sequence, there are sh...",7451,"For every action sequence, there are shots ded...",Xander Cage is your standard adrenaline junkie...,negative


In [7]:
temp = new.groupby("title").count().reset_index()
drop_title_ls = temp[temp['movie_id']==1].title

In [8]:
new = new.drop(columns=[0])

In [9]:
new = new[~new.title.isin(drop_title_ls)]

In [10]:
def form_pos_neg_pair(df):
    
    neg_reviews = df['reviews'].iloc[0]

    pos_reviews = df['reviews'].iloc[1]
    
    tags = df['tags'].iloc[0]
    
    return neg_reviews, pos_reviews, tags

In [11]:
new = new.groupby(['title','movie_id']).apply(form_pos_neg_pair).reset_index()

new[['neg_reviews', 'pos_reviews', 'tags']] = pd.DataFrame(new[0].tolist(), index=new.index)

In [12]:
new = new.drop(columns=[0])

In [13]:
new

Unnamed: 0,title,movie_id,neg_reviews,pos_reviews,tags
0,12 Rounds,17134,"Awfully forgettable, but Harlin makes things g...",12 Rounds has a real sense of urgency even if ...,When New Orleans cop Danny Fisher prevents a b...
1,1408,3021,Una buena idea y unos 20 minutos iniciales inq...,Directed in a style that suggests The Evil Dea...,A man who specializes in debunking paranormal ...
2,17 Again,16996,A pleasantly predictable body swap comedy. Not...,"Yes, weve seen this shtick before, but Efrons ...","On the brink of a midlife crisis, 30-something..."
3,1911,76349,1911 meanders through its interminable two hou...,Students of revolutionary Chinese history can ...,"At the beginning of the 20th century, China is..."
4,2 Guns,136400,"2 Guns is conventional, and the sense of humor...","Despite not owning an ounce of originality, 2 ...",A DEA agent and an undercover Naval Intelligen...
...,...,...,...,...,...
813,Zambezia,133931,"The plot is a hokey, muddled mess about a youn...","Colorful, comic bird tale with lots of cartoon...",Set in a bustling bird city on the edge of the...
814,Zombieland,19908,"If, like I once did, you thought Shaun of the ...",The zombie comedy was hardly fresh territory w...,Columbus has made a habit of running from what...
815,Zookeeper,38317,Kevin Jamess zoo man shovels up a steaming pil...,Doesnt belabor its punch lines and it maximize...,A comedy about a zookeeper who might be great ...
816,Zoolander 2,329833,The best thing about Zoolander 2 is that it is...,"The audience leaves the theater thinking, It t...",Derek and Hansel are modelling again when an o...


In [14]:
from sentence_transformers import models, losses, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import evaluation

In [15]:
# microsoft/mpnet-base
# sentence-transformers/stsb-bert-large
word_embedding_model = models.Transformer('microsoft/mpnet-base').to(device)

Some weights of the model checkpoint at microsoft/mpnet-base were not used when initializing MPNetModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing MPNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.weight', 'mpnet.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [16]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 
                                  pooling_mode_mean_tokens=True, 
                                  pooling_mode_cls_token=False, 
                                  pooling_mode_max_tokens=False
                              )

In [17]:
# update the model
sent_model = SentenceTransformer(modules=[word_embedding_model, pooling_model]).to(device)

In [18]:
train_examples = []

for index, row in enumerate(zip(new['tags'], new['pos_reviews'], new['neg_reviews'])):
#     train_examples.append(InputExample(texts=[row[0], row[1]], label = [scores[index]]))
    train_examples.append(InputExample(texts=[row[0], row[1], row[2]]))    

In [19]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
# ContrastiveLoss
# MarginMSELoss
# OnlineContrastiveLoss
# TripletLoss
train_loss = losses.TripletLoss(sent_model)

In [20]:
# dev_mse = evaluation.EmbeddingSimilarityEvaluator(new['tags'].values, new['reviews'].values,\
#                                                   scores=scores)
# dev_mse = evaluation.BinaryClassificationEvaluator(new['tags'].values, \
#                           new['reviews'].values, \
#                           new['sentiment_score'].values)

In [21]:
# dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values,teacher_model=sent_model)

In [22]:
sent_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, output_path='result')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/409 [00:00<?, ?it/s]

In [23]:
descriptions = new['tags'].unique().tolist()
des_embeddings = []
for i,des in enumerate(descriptions):
    des_embeddings.append(sent_model.encode(des))

In [24]:
def recommend(query, model):
    #Compute cosine-similarities with all embeddings 
    model.eval()
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:5]
    return top5_matches

In [29]:
query_show_des = '3D Top notch education and entertainment for dinosaurs'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(new.iloc[index,:])

title                                                  Grown Ups
movie_id                                                   38365
neg_reviews    It’s a lame and juvenile exercise in banality....
pos_reviews    Grown Ups will make you happy. Granted, Sandle...
tags           After their high school basketball coach passe...
Name: 286, dtype: object
title                                                 The Intern
movie_id                                                  257211
neg_reviews    Meyers concocts a few madcap comedy situations...
pos_reviews    Although many critics panned The Intern, I bel...
tags           70-year-old widower Ben Whittaker has discover...
Name: 689, dtype: object
title                                          Jumping the Broom
movie_id                                                   57119
neg_reviews    This extremely formulaic time waster offers li...
pos_reviews    All of the characters are written as believabl...
tags           Two very different famili

In [26]:
query_show_des = 'purchases a large house that has a zoo'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(new.iloc[index,:])

title                                        Paranormal Activity
movie_id                                                   23827
neg_reviews    Enter the cinema expecting a clever little mov...
pos_reviews    Even when you know theyre coming, the scares s...
tags           After a young, middle class couple moves into ...
Name: 479, dtype: object
title                                                  Marmaduke
movie_id                                                   38579
neg_reviews    Marmaduke is exclusively manufactured for chil...
pos_reviews    full review at Movies for the Masses Fans of t...
tags           When Phil and Debbie Winslow relocate from the...
Name: 416, dtype: object
title                                                 Piranha 3D
movie_id                                                   43593
neg_reviews    You should go see Piranha 3D. Maybe on Friday ...
pos_reviews    Its a shame about the less than stellar CGI be...
tags           Each year the population 

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 97% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  2% |  5% |


In [None]:
device

device(type='cuda')