# Sentence Similarity
We attempt to measure the similarity between each review and the plot synopsis of the movie it reviews, and to use that similarity to predict whether the review is a spoiler.

In [1]:
# Set the CUDA device ordering and restrict the visible devices to index 1.
# These are environment variables, so they are set in the first cell.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


### Import Libraries

We used spaCy Sentence-BERT to calculate the similarity between the plot of the movie and the review.

In [2]:
import pandas as pd
import spacy
import spacy_sentence_bert
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split


In [3]:
# Load the 'en_stsb_roberta_large' Sentence-BERT model as a spaCy pipeline.
nlp=spacy_sentence_bert.load_model('en_stsb_roberta_large')
# Initial empty list; it is re-created right before each scoring loop below.
similarityValue = []



### Read the datasets

In [4]:
# Load the reviews; lines=True reads the file as one JSON object per line.
dataRew=pd.read_json("../Dataset/IMDB_reviews.json",lines=True)

In [5]:
# Load the movie details (one JSON object per line), including the plot synopsis.
dataMovie=pd.read_json('../Dataset/IMDB_movie_details.json',lines=True)

In [6]:
# Keep only the columns needed here: the join key, the review text and the label.
dataRewS=dataRew[["movie_id","review_text","is_spoiler"]]

In [7]:
# Preview the selected review columns.
dataRewS

Unnamed: 0,movie_id,review_text,is_spoiler
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",True
1,tt0111161,The Shawshank Redemption is without a doubt on...,True
2,tt0111161,I believe that this film is the best story eve...,True
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",True
4,tt0111161,At the heart of this extraordinary movie is a ...,True
...,...,...,...
573908,tt0139239,"Go is wise, fast and pure entertainment. Assem...",False
573909,tt0139239,"Well, what shall I say. this one´s fun at any ...",False
573910,tt0139239,"Go is the best movie I have ever seen, and I'v...",False
573911,tt0139239,Call this 1999 teenage version of Pulp Fiction...,False


Drop the movies whose plot synopsis is missing.

In [8]:
# Keep only the movies whose synopsis is not the empty string.
# .copy() makes the result an independent frame, so adding the 'last' column
# afterwards does not trigger pandas' SettingWithCopyWarning.
dataMovie = dataMovie[dataMovie["plot_synopsis"] != ''].copy()

We use the end of the plot synopsis (its last 512 characters) to calculate the similarity, because the ending is the part most likely to contain spoilers.

In [9]:
# Store the final 512 characters of every synopsis in a new 'last' column.
dataMovie['last'] = dataMovie['plot_synopsis'].map(lambda synopsis: synopsis[-512:])

In [10]:
# Preview the movie table, now including the 'last' column.
dataMovie

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,last
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",realising that Ryan is leading them away from ...
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,"arlier, and make a toast. Alas, the scotch has..."
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,"to the city to sell the goods, but once it is ..."
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,"Tracy, yet again, lying, cheating, seducing an..."
5,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Bruce Banner (Eric Bana) is a research scienti...,"ce's power is so immense, David cannot contain..."
...,...,...,...,...,...,...,...,...
1563,tt0120655,An abortion clinic worker with a special herit...,2h 10min,"[Adventure, Comedy, Drama]",7.3,1999-11-12,The film opens with a homeless man (Bud Cort) ...,ath and destruction caused by Bartleby and Lok...
1565,tt0276751,Twelve year old Marcus Brewer lives with his c...,1h 41min,"[Comedy, Drama, Romance]",7.1,2002-05-17,Will Freeman (Hugh Grant) is a 38-year-old bac...,"""But clearly, some men are part of island cha..."
1567,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher...","scaring her away, he makes sure that she is ne..."
1568,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ...",ies in the rain.Some time later Brandon is see...


In [11]:
# Keep only the synopsis ending and the join key for the merge with the reviews.
dataMovieS=dataMovie[["last","movie_id"]]

In [12]:
# Preview the reduced movie table.
dataMovieS

Unnamed: 0,last,movie_id
0,realising that Ryan is leading them away from ...,tt0105112
1,"arlier, and make a toast. Alas, the scotch has...",tt1204975
3,"to the city to sell the goods, but once it is ...",tt0040897
4,"Tracy, yet again, lying, cheating, seducing an...",tt0126886
5,"ce's power is so immense, David cannot contain...",tt0286716
...,...,...
1563,ath and destruction caused by Bartleby and Lok...,tt0120655
1565,"""But clearly, some men are part of island cha...",tt0276751
1567,"scaring her away, he makes sure that she is ne...",tt0289879
1568,ies in the rain.Some time later Brandon is see...,tt1723811


In [13]:
# Check the row count, dtypes and non-null counts of the review table.
dataRewS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     573913 non-null  object
 1   review_text  573913 non-null  object
 2   is_spoiler   573913 non-null  bool  
dtypes: bool(1), object(2)
memory usage: 9.3+ MB


### Merge the two datasets on `movie_id`

In [14]:
# Left-join every review with the synopsis ending of its movie. Reviews whose
# movie is absent from dataMovieS get a missing value in 'last'.
dataSimilar = pd.merge(dataRewS, dataMovieS, on="movie_id", how="left")

In [15]:
# Preview the merged review/synopsis table.
dataSimilar

Unnamed: 0,movie_id,review_text,is_spoiler,last
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",True,"described. Just as Andy said, there was a lar..."
1,tt0111161,The Shawshank Redemption is without a doubt on...,True,"described. Just as Andy said, there was a lar..."
2,tt0111161,I believe that this film is the best story eve...,True,"described. Just as Andy said, there was a lar..."
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",True,"described. Just as Andy said, there was a lar..."
4,tt0111161,At the heart of this extraordinary movie is a ...,True,"described. Just as Andy said, there was a lar..."
...,...,...,...,...
573908,tt0139239,"Go is wise, fast and pure entertainment. Assem...",False,"ough to return to work, happy that, despite th..."
573909,tt0139239,"Well, what shall I say. this one´s fun at any ...",False,"ough to return to work, happy that, despite th..."
573910,tt0139239,"Go is the best movie I have ever seen, and I'v...",False,"ough to return to work, happy that, despite th..."
573911,tt0139239,Call this 1999 teenage version of Pulp Fiction...,False,"ough to return to work, happy that, despite th..."


In [16]:
# Drop every row that contains a missing value, e.g. reviews that found no
# matching synopsis ending in the left merge.
dataSimilar=dataSimilar.dropna()

In [17]:
# Give the synopsis-ending column a more descriptive name.
dataSimilar=dataSimilar.rename(columns={'last':'endfilm'})

In [18]:
# Preview the cleaned table with the renamed 'endfilm' column.
dataSimilar

Unnamed: 0,movie_id,review_text,is_spoiler,endfilm
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",True,"described. Just as Andy said, there was a lar..."
1,tt0111161,The Shawshank Redemption is without a doubt on...,True,"described. Just as Andy said, there was a lar..."
2,tt0111161,I believe that this film is the best story eve...,True,"described. Just as Andy said, there was a lar..."
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",True,"described. Just as Andy said, there was a lar..."
4,tt0111161,At the heart of this extraordinary movie is a ...,True,"described. Just as Andy said, there was a lar..."
...,...,...,...,...
573908,tt0139239,"Go is wise, fast and pure entertainment. Assem...",False,"ough to return to work, happy that, despite th..."
573909,tt0139239,"Well, what shall I say. this one´s fun at any ...",False,"ough to return to work, happy that, despite th..."
573910,tt0139239,"Go is the best movie I have ever seen, and I'v...",False,"ough to return to work, happy that, despite th..."
573911,tt0139239,Call this 1999 teenage version of Pulp Fiction...,False,"ough to return to work, happy that, despite th..."


### Try first on a small dataset

In [19]:
# Take the first 1000 rows by position.
# NOTE(review): the name suggests these are spoiler reviews, but the slice does
# not filter on is_spoiler -- confirm the rows really are all spoilers.
dataReducedTrue = dataSimilar.iloc[:1000]

In [20]:
# Take rows 5200..6199 by position.
# NOTE(review): the name suggests these are non-spoiler reviews, but the slice
# does not filter on is_spoiler -- confirm against the data.
dataReducedFalse = dataSimilar.iloc[5200:6200]

In [21]:
# Preview the second slice.
dataReducedFalse

Unnamed: 0,movie_id,review_text,is_spoiler,endfilm
5200,tt0068646,Mindblowing piece of masterpiece. It can't be ...,False,"estions Michael about Connie's accusation, but..."
5201,tt0068646,"Yesterday, I was lucky to be able to watch 175...",False,"estions Michael about Connie's accusation, but..."
5202,tt0068646,"A wonderful film. I love the history, the acto...",False,"estions Michael about Connie's accusation, but..."
5203,tt0068646,Science fiction has been used as an indication...,False,"estions Michael about Connie's accusation, but..."
5204,tt0068646,RELEASED IN 1972 and directed by Francis Ford ...,False,"estions Michael about Connie's accusation, but..."
...,...,...,...,...
6195,tt0068646,I'm too considering as this movie has to be pr...,False,"estions Michael about Connie's accusation, but..."
6196,tt0068646,Perfect. That is the only word that can truly...,False,"estions Michael about Connie's accusation, but..."
6197,tt0068646,"The Godfather is simply a truly great film, th...",False,"estions Michael about Connie's accusation, but..."
6198,tt0068646,This film is Francis Ford Copollas best work t...,False,"estions Michael about Connie's accusation, but..."


In [22]:
# Stack the two slices row-wise into one small evaluation set. The original
# row labels are kept at this point.
dataReduced = pd.concat([dataReducedTrue, dataReducedFalse])

In [61]:
# Preview the combined small dataset.
dataReduced

Unnamed: 0,movie_id,review_text,is_spoiler,endfilm
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",True,"described. Just as Andy said, there was a lar..."
1,tt0111161,The Shawshank Redemption is without a doubt on...,True,"described. Just as Andy said, there was a lar..."
2,tt0111161,I believe that this film is the best story eve...,True,"described. Just as Andy said, there was a lar..."
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",True,"described. Just as Andy said, there was a lar..."
4,tt0111161,At the heart of this extraordinary movie is a ...,True,"described. Just as Andy said, there was a lar..."
...,...,...,...,...
1995,tt0068646,I'm too considering as this movie has to be pr...,False,"estions Michael about Connie's accusation, but..."
1996,tt0068646,Perfect. That is the only word that can truly...,False,"estions Michael about Connie's accusation, but..."
1997,tt0068646,"The Godfather is simply a truly great film, th...",False,"estions Michael about Connie's accusation, but..."
1998,tt0068646,This film is Francis Ford Copollas best work t...,False,"estions Michael about Connie's accusation, but..."


In [62]:
# Renumber the rows 0..n-1 so they can be addressed by consecutive labels.
dataReduced = dataReduced.reset_index(drop=True)

In [25]:
# Preview the small dataset after renumbering its index.
dataReduced

Unnamed: 0,movie_id,review_text,is_spoiler,endfilm
0,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",True,"described. Just as Andy said, there was a lar..."
1,tt0111161,The Shawshank Redemption is without a doubt on...,True,"described. Just as Andy said, there was a lar..."
2,tt0111161,I believe that this film is the best story eve...,True,"described. Just as Andy said, there was a lar..."
3,tt0111161,"**Yes, there are SPOILERS here**This film has ...",True,"described. Just as Andy said, there was a lar..."
4,tt0111161,At the heart of this extraordinary movie is a ...,True,"described. Just as Andy said, there was a lar..."
...,...,...,...,...
1995,tt0068646,I'm too considering as this movie has to be pr...,False,"estions Michael about Connie's accusation, but..."
1996,tt0068646,Perfect. That is the only word that can truly...,False,"estions Michael about Connie's accusation, but..."
1997,tt0068646,"The Godfather is simply a truly great film, th...",False,"estions Michael about Connie's accusation, but..."
1998,tt0068646,This film is Francis Ford Copollas best work t...,False,"estions Michael about Connie's accusation, but..."


In [64]:
# A review is predicted to be a spoiler when its similarity to the synopsis
# ending is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.30

results = []          # (predicted label, real label) for every review
similarityValue = []  # (similarity score, real label) for every review
film_docs = {}        # synopsis ending -> encoded doc

# Iterate over the columns directly instead of range(count()[0]) + .loc[i]:
# this avoids the deprecated positional Series lookup, does not depend on the
# index being 0..n-1, and skips building a row Series for every access.
for review_text, endfilm, is_spoiler in zip(
    dataReduced["review_text"], dataReduced["endfilm"], dataReduced["is_spoiler"]
):
    # Many reviews share the same movie, so encode each synopsis ending once.
    if endfilm not in film_docs:
        film_docs[endfilm] = nlp(endfilm)
    similarity = nlp(review_text).similarity(film_docs[endfilm])
    similarityValue.append((similarity, is_spoiler))
    results.append((bool(similarity > SIMILARITY_THRESHOLD), is_spoiler))

  for i in range(dataReduced.count()[0]):


In [67]:
# Tabulate the (predicted, real) label pairs collected by the scoring loop.
dataset = pd.DataFrame(results, columns=['Predicted', 'Real']) 


In [68]:
# Compute accuracy, F1, recall and precision of the predictions.
y_true = dataset['Real']
y_pred = dataset['Predicted']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)


print(f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}")

Accuracy: 0.726, F1: 0.7617391304347826, Precision: 0.6517857142857143, Recall: 0.9163179916317992


### Try on a big subset

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Draw a 250,000-review subset, stratified on the spoiler label. The fixed
# random_state makes the sample, and so the metrics computed from it,
# reproducible across kernel restarts.
big_data, second_part = train_test_split(
    dataSimilar,
    train_size=250000,
    stratify=dataSimilar['is_spoiler'],
    random_state=42,
)

In [29]:
# Preview the sampled subset (rows still carry their original labels).
big_data

Unnamed: 0,movie_id,review_text,is_spoiler,endfilm
119211,tt1375666,If you think that The Dark Knight was Christop...,False,"ks his passport, but allows him passage throug..."
474629,tt1190080,"Just went to the very first screening here, al...",False,"illed, Tenzin is wounded, and the ark is set a..."
106358,tt4574334,The Duffer Brothers are master chefs. I pictur...,True,"look as the Upside Down for a minute, before ..."
369933,tt0330373,"TO be honest, this was my favourite book of al...",True,m straight to discovering gillyweed in it. Har...
64516,tt0081398,"In a brief scene in 1964, an aging, overweight...",True,"a contender"" scene from On the Waterfront com..."
...,...,...,...,...
496619,tt0103644,Reprising her role as Lt. Ellen Ripley from th...,False,y's body is finally immersed in the flames.Emp...
509336,tt0110632,I am slowly trying to make my way through all ...,False,e must also die. Gale attempts various argumen...
120360,tt0947798,Movie full of surprises and in parts nearly ho...,False,and newly infatuated with Nina. He is smiling ...
117989,tt2356777,A series of two contrasting seasons (to date)....,False,"The first season focuses on two detectives, Ma..."


In [30]:
# Renumber the sampled rows 0..n-1.
big_data = big_data.reset_index(drop=True)

### Compute the similarity

Try first with a 0.30 threshold.

In [None]:
# A review is predicted to be a spoiler when its similarity to the synopsis
# ending is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.30

results = []          # (predicted label, real label) for every review
similarityValue = []  # (similarity score, real label) for every review
film_docs = {}        # synopsis ending -> encoded doc

# Iterate over the columns directly instead of range(count()[0]) + .loc[i]:
# this avoids the deprecated positional Series lookup, does not depend on the
# index being 0..n-1, and skips building a row Series for every access.
for review_text, endfilm, is_spoiler in zip(
    big_data["review_text"], big_data["endfilm"], big_data["is_spoiler"]
):
    # Many reviews share the same movie, so encode each synopsis ending once
    # rather than once per review.
    if endfilm not in film_docs:
        film_docs[endfilm] = nlp(endfilm)
    similarity = nlp(review_text).similarity(film_docs[endfilm])
    similarityValue.append((similarity, is_spoiler))
    results.append((bool(similarity > SIMILARITY_THRESHOLD), is_spoiler))

### Evaluate the predictions

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Tabulate the (predicted, real) label pairs collected by the scoring loop.
dataset = pd.DataFrame(results, columns=['Predicted', 'Real'])

In [58]:
# Compute accuracy, F1, recall and precision of the predictions.
y_true = dataset['Real']
y_pred = dataset['Predicted']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

In [59]:
# Report the metrics computed in the previous cell.
print(f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}")

Accuracy: 0.401284, F1: 0.41996349558808144, Precision: 0.2827297250760489, Recall: 0.8160787975541431


In [60]:
# Disabled code kept as a bare string literal: if executed, it would append the
# metrics line to ../Output/outputSentenceBERTfirst.txt.
"""with open("../Output/outputSentenceBERTfirst.txt", "a") as f:
    print(f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}",file=f) """

### Try to define a new threshold
Based on the average cosine similarity of each of the two classes.

In [40]:
# Turn the list of (similarity, real label) tuples into a DataFrame.
# NOTE: this rebinds `similarityValue` from a list to a DataFrame, so every
# later cell works with the DataFrame.
similarityValue= pd.DataFrame(similarityValue, columns=['cosine_values', 'Real'])

In [41]:
# Mean similarity of the reviews whose real label is True (spoilers).
spoiler_mask = similarityValue["Real"].eq(True)
similarityValue.loc[spoiler_mask, "cosine_values"].mean()

0.3980170037410408

In [42]:
# Mean similarity of the reviews whose real label is False (non-spoilers).
non_spoiler_mask = similarityValue["Real"].eq(False)
similarityValue.loc[non_spoiler_mask, "cosine_values"].mean()

0.37382817538169427

In [49]:
def computeSimilar(data, threshold=0.38):
    """Predict whether a similarity score marks a review as a spoiler.

    Args:
        data: Similarity score of a single review.
        threshold: Cut-off value. Defaults to 0.38, the value used in this
            notebook.

    Returns:
        True if ``data`` is strictly greater than ``threshold``, otherwise
        False. A score equal to the threshold is classified as False.
    """
    return bool(data > threshold)


In [50]:
# Classify every similarity score with computeSimilar and store the outcome.
similarityValue['Predicted'] = similarityValue['cosine_values'].map(computeSimilar)

In [55]:
# Compute accuracy, F1, recall and precision for the new threshold.
y_true = similarityValue['Real']
y_pred = similarityValue['Predicted']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

In [57]:
# Report the metrics obtained with the new threshold.
print(f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}")

Accuracy: 0.535692, F1: 0.38998344588380585, Precision: 0.29949873675204014, Recall: 0.5588120124100123


In [56]:
# Append the metrics line to the output file (mode "a" keeps earlier runs).
metrics_line = f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}"
with open("../Output/outputSentenceBERT.txt", "a") as out_file:
    print(metrics_line, file=out_file)

### Note
We did not apply the algorithm to the entire dataset because, even when testing on a subset, it proved practically impossible to define a similarity threshold that determines whether a review is a spoiler: the mean cosine similarity of the two classes is almost the same (about 0.40 for spoilers vs. 0.37 for non-spoilers).