In [2]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 69.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 60.0 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 21.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 56.9 MB/s 
Building wheels for collected pa

In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import evaluation
from sentence_transformers.cross_encoder import CrossEncoder
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
import nltk
import re
nltk.download('words')
words = set(nltk.corpus.words.words())

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z0-9,.’]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    
    return text

def get_cosine_sim(model, df):
    scores = []
    for row in zip(df['tags'], df['reviews']):
        scores.append(util.cos_sim(model.encode(row[0]), model.encode(row[1])))
    return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\czhao\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
movies = pd.read_csv('../tmdb_5000_data/tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_data/tmdb_5000_credits.csv') 
movies = movies.merge(credits,on='title')

reviews = pd.read_csv('../crawled_data/2022-11-19_movie_info_with_reviews.csv')
reviews = reviews[['id','reviews']]
reviews['reviews'] = reviews['reviews'].apply(lambda x: list(map(clean_text, x.split("\',"))))
movies = movies.merge(reviews,on='id', how='left')

movies.dropna(inplace=True)
movies['release_year'] = movies.release_date.apply(lambda x: x.split("-")[0]).astype(int)
movies = movies[movies['release_year']>1970]

movies = movies[['movie_id','title','release_year','overview','genres','keywords','cast','crew','reviews']]
movies.dropna(inplace=True)

movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


movies['crew'] = movies['crew'].apply(lambda x: [" ".join(\
        w for w in nltk.wordpunct_tokenize(i) \
         if w.lower() in words or not w.isalpha()) for i in x])

# movies['cast'] = movies['cast'].apply(collapse)
# movies['crew'] = movies['crew'].apply(collapse)
# movies['genres'] = movies['genres'].apply(collapse)
# movies['keywords'] = movies['keywords'].apply(collapse)


# new['tags'] = new['tags'].apply(stemSentence_porter)


In [4]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['release_year'] = movies['release_year'].astype(str).apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['release_year'] + movies['keywords'] + movies['cast'] + movies['crew'] 
new = movies.drop(columns=['overview','genres','keywords','cast','crew','release_year'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new = new[(new['tags'].notnull())].reset_index(drop=True)
new = new[new["reviews"].str.len() != 0].reset_index(drop=True)

In [18]:
new = new.explode(['reviews'])
new = new[(new['reviews']!='') & (new['reviews'].str.len() >= 3)].reset_index(drop=True)
# new.to_csv("../tmdb_5000_data/Cleaned_Filtered_Plots_Reviews")

In [19]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

def get_sentiment(text):
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  output = model.generate(input_ids=input_ids,
               max_length=2)
  
  dec = [tokenizer.decode(ids) for ids in output]
  label = dec[0]
  return label



In [20]:
new['review_sentiment'] = new['reviews'].apply(lambda x: get_sentiment(x)[6:])

In [21]:
new['sentiment_score'] = new['review_sentiment'].map({'positive': 1, 'negative': 0})

In [22]:
new.sentiment_score.unique()

array([1, 0])

In [23]:
new = new[(new['reviews']!='')].reset_index(drop=True)

In [25]:
new

Unnamed: 0,movie_id,title,reviews,tags,review_sentiment,sentiment_score
0,19995,Avatar,Cameron’s epic can still thrill the audience w...,"In the 22nd century, a paraplegic Marine is di...",positive,1
1,19995,Avatar,Avatar still elicits much of the same wide eye...,"In the 22nd century, a paraplegic Marine is di...",positive,1
2,19995,Avatar,The emotional stakes presented in the final ba...,"In the 22nd century, a paraplegic Marine is di...",positive,1
3,19995,Avatar,"Thirteen years after its release, Avatar still...","In the 22nd century, a paraplegic Marine is di...",positive,1
4,19995,Avatar,A meaningful blockbuster that fails to play ig...,"In the 22nd century, a paraplegic Marine is di...",positive,1
...,...,...,...,...,...,...
10895,126186,Shanghai Calling,Shanghai Calling doesnt aspire to fresh insigh...,When ambitious New York attorney Sam is sent t...,positive,1
10896,126186,Shanghai Calling,If you prefer your social commentary in the fo...,When ambitious New York attorney Sam is sent t...,negative,0
10897,126186,Shanghai Calling,Shanghai Calling eventually reveals itself to ...,When ambitious New York attorney Sam is sent t...,negative,0
10898,126186,Shanghai Calling,"Through it all, Henney is an appealing screen ...",When ambitious New York attorney Sam is sent t...,negative,0


In [88]:
new = new.drop(['Unnamed: 0'], axis=1)

new['grouped_reviews'] = new.groupby(['movie_id','title','tags','review_sentiment','sentiment_score'])['reviews'].transform(lambda x: '.'.join(x)).drop_duplicates()

In [95]:
new = new.dropna().drop(['reviews'], axis=1)

In [20]:
new.to_csv("../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv",index=False)

In [21]:
new = pd.read_csv('../tmdb_5000_data/tmdb_sentiment_grouped_reviews.csv') 

In [22]:
new

Unnamed: 0,movie_id,title,tags,review_sentiment,sentiment_score,grouped_reviews
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",positive,1,Cameron’s epic can still thrill the audience w...
1,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",negative,0,Five hundred million dollars wasted ..The leve...
2,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,positive,1,It still might not be quite the conclusion we ...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,negative,0,Even with a twisting mystery and numerous new ...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",positive,1,John Carter is a good summer movie in March bu...
...,...,...,...,...,...,...
1755,157185,Tin Can Man,Recently dumped by his girlfirend for another ...,positive,1,"Unique and eerie enough to be worth seeing, ev..."
1756,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,positive,1,Time travel may provide the paradoxical mechan...
1757,14337,Primer,Friends/fledgling entrepreneurs invent a devic...,negative,0,The storytelling is so confusing and the multi...
1758,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,negative,0,A star is born in Daniel Henney in the predict...


In [7]:
from sentence_transformers import models, losses, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import evaluation

In [8]:
# microsoft/mpnet-base
# sentence-transformers/stsb-bert-large
word_embedding_model = models.Transformer('sentence-transformers/all-mpnet-base-v2').to(device)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [9]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 
                                  pooling_mode_mean_tokens=True, 
                                  pooling_mode_cls_token=False, 
                                  pooling_mode_max_tokens=False
                              )

In [10]:
# update the model
sent_model = SentenceTransformer(modules=[word_embedding_model, pooling_model]).to(device)

In [11]:
train_examples = []

for index, row in enumerate(zip(new['tags'], new['grouped_reviews'], new['sentiment_score'])):
#     train_examples.append(InputExample(texts=[row[0], row[1]], label = [scores[index]]))
    train_examples.append(InputExample(texts=[row[0], row[1]], label = row[2]))    

In [12]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
# ContrastiveLoss
# MarginMSELoss
# OnlineContrastiveLoss
# TripletLoss
train_loss = losses.OnlineContrastiveLoss(sent_model)

In [13]:
# dev_mse = evaluation.EmbeddingSimilarityEvaluator(new['tags'].values, new['reviews'].values,\
#                                                   scores=scores)
dev_mse = evaluation.BinaryClassificationEvaluator(new['tags'].values, \
                          new['grouped_reviews'].values, \
                          new['sentiment_score'].values)

In [14]:
# dev_mse = evaluation.MSEEvaluator(new['tags'].values, new['reviews'].values,teacher_model=sent_model)

In [15]:
sent_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, output_path='result')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/440 [00:00<?, ?it/s]

In [26]:
torch.save(sent_model.state_dict(), 'best_sent_model.pt')

In [23]:
descriptions = movies['overview'].apply(lambda x: " ".join(x)).tolist()
des_embeddings = []
for i,des in enumerate(descriptions):
    des_embeddings.append(sent_model.encode(des))

In [24]:
def recommend(query, model):
    #Compute cosine-similarities with all embeddings 
    model.eval()
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:5]
    return top5_matches

In [32]:
query_show_des = 'monster attack humans'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(movies.iloc[index,1])

Shark Night
The Darkest Hour
Devil
Victor Frankenstein
[REC]


In [27]:
query_show_des = 'purchases a large house that has a zoo'
recommendded_results = recommend(query_show_des, sent_model)

for index in recommendded_results:
    print(new.iloc[index,1])

Hanna
1408
The Reaping
Cloud Atlas
Iron Man 3


In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 97% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  2% |  5% |


In [None]:
device

device(type='cuda')