# Sentence Similarity
We attempted to calculate the similarity between the review and the plot of the reviewed movie.

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

### Import Libraries

We used pandas to read the dataset, scikit-learn to compute the metrics, and spaCy Sentence-BERT to calculate the similarity between the plot of the movie and the review.

In [None]:
import spacy_sentence_bert 
import spacy
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Load the Sentence-BERT pipeline used to embed reviews and synopsis endings,
# and start with an empty container for the similarity scores.
SBERT_MODEL_NAME = 'en_stsb_roberta_large'
nlp = spacy_sentence_bert.load_model(SBERT_MODEL_NAME)
similarityValue = list()

### Read the datasets
We use the original dataset

In [None]:
# The reviews file is read as JSON Lines (one JSON object per line).
reviews_path = "../Dataset/IMDB_reviews.json"
dataRew = pd.read_json(reviews_path, lines=True)

In [None]:
# The movie-details file is read as JSON Lines (one JSON object per line).
movie_details_path = '../Dataset/IMDB_movie_details.json'
dataMovie = pd.read_json(movie_details_path, lines=True)

In [None]:
# Keep only the join key, the review text and the spoiler label.
review_columns = ["movie_id", "review_text", "is_spoiler"]
dataRewS = dataRew[review_columns]

In [None]:
dataRewS

Drop the movies whose plot synopsis is missing (an empty string).

In [None]:
# Keep only the movies whose plot synopsis is not the empty string.
has_synopsis = dataMovie["plot_synopsis"] != ''
dataMovie = dataMovie[has_synopsis]

We use the end of the plot synopsis to calculate the similarity because it is more likely to contain spoilers: we keep only the last 512 characters of each synopsis.

In [None]:
# Store the last 512 characters of each synopsis in a new `last` column.
# `assign` returns a new frame instead of writing a column into the
# boolean-filtered slice, which would raise SettingWithCopyWarning.
dataMovie = dataMovie.assign(
    last=dataMovie['plot_synopsis'].apply(lambda synopsis: synopsis[-512:])
)

In [None]:
dataMovie

In [None]:
# Keep only the synopsis ending and the join key.
movie_columns = ["last", "movie_id"]
dataMovieS = dataMovie[movie_columns]

In [None]:
dataMovieS

In [None]:
dataRewS.info()

### Merge the two datasets based on movie_id

In [None]:
# Left-join every review with the synopsis ending of its movie, matching on movie_id.
dataSimilar = pd.merge(
    left=dataRewS,
    right=dataMovieS,
    how="left",
    left_on="movie_id",
    right_on="movie_id",
)

In [None]:
dataSimilar

In [None]:
# Discard every row that has at least one missing value.
dataSimilar = dataSimilar.dropna(axis=0, how="any")

In [None]:
# Give the synopsis-ending column a more descriptive name.
column_renames = {'last': 'endfilm'}
dataSimilar = dataSimilar.rename(columns=column_renames)

In [None]:
dataSimilar

### Try first on small dataset
We first tested the model on a smaller dataset of 2000 rows, with 1000 positive examples and 1000 negative examples.

In [None]:
# First 1000 rows by position.
# NOTE(review): presumably these are the spoiler (positive) reviews — confirm against the row order.
dataReducedTrue = dataSimilar.iloc[0:1000]

In [None]:
# Rows at positions 5200..6199.
# NOTE(review): presumably these are the non-spoiler (negative) reviews — confirm against the row order.
dataReducedFalse = dataSimilar.iloc[5200:6200]

In [None]:
dataReducedFalse

In [None]:
# Stack the two 1000-row samples on top of each other (row-wise concatenation).
reduced_parts = [dataReducedTrue, dataReducedFalse]
dataReduced = pd.concat(reduced_parts, axis=0)

In [None]:
dataReduced

In [None]:
# Renumber the rows 0..n-1 so that they can be addressed by consecutive labels.
dataReduced = dataReduced.reset_index(drop=True)

In [None]:
dataReduced

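Try with a threshold of 0.30: a review is predicted to be a spoiler when its similarity with the end of the synopsis is greater than 0.30.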

In [None]:
# Score every review against the ending of its movie's synopsis and predict
# "spoiler" when the similarity is strictly greater than the threshold.
SIMILARITY_THRESHOLD = 0.30

results = []          # (predicted label, real label) pairs
similarityValue = []  # (similarity score, real label) pairs

# Walk the three columns in lockstep rather than `range(df.count()[0])` plus
# repeated `.loc[i][...]`: `count()[0]` indexes a label-indexed Series by
# position (deprecated in pandas 2.x), and each `.loc[i]` lookup rebuilds the
# whole row just to read one field.
for review_text, end_of_plot, is_spoiler in zip(
    dataReduced["review_text"], dataReduced["endfilm"], dataReduced["is_spoiler"]
):
    similarity = nlp(review_text).similarity(nlp(end_of_plot))
    similarityValue.append((similarity, is_spoiler))
    results.append((bool(similarity > SIMILARITY_THRESHOLD), is_spoiler))

In [None]:
# Tabulate the (predicted, real) pairs so that the metrics can read them by column.
prediction_columns = ['Predicted', 'Real']
dataset = pd.DataFrame(results, columns=prediction_columns)


In [None]:
# Compute accuracy, F1-score, recall and precision of the predictions
y_true = dataset['Real']
y_pred = dataset['Predicted']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

print(f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}")

The results seem encouraging; we can test the model on a dataset of 250,000 rows.

### Try on a Bigger subset
We tested it on a subset approximately half the size of the dataset.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Draw a 250,000-row subset stratified on the spoiler label; the remaining rows
# go to `second_part`. A fixed random_state makes the draw, and therefore every
# metric computed on it, reproducible after a kernel restart.
big_data, second_part = train_test_split(
    dataSimilar,
    train_size=250000,
    stratify=dataSimilar['is_spoiler'],
    random_state=42,
)

In [None]:
# Renumber the rows 0..n-1 so that they can be addressed by consecutive labels.
big_data = big_data.reset_index(drop=True)

### Compute the similarity

Try with the same threshold as before

In [None]:
# Score every review of the big subset against the ending of its movie's
# synopsis and predict "spoiler" when the similarity is strictly greater than
# the threshold.
SIMILARITY_THRESHOLD = 0.30

results = []          # (predicted label, real label) pairs
similarityValue = []  # (similarity score, real label) pairs

# Walk the three columns in lockstep rather than `range(df.count()[0])` plus
# repeated `.loc[i][...]`: `count()[0]` indexes a label-indexed Series by
# position (deprecated in pandas 2.x), and each `.loc[i]` lookup rebuilds the
# whole row just to read one field.
for review_text, end_of_plot, is_spoiler in zip(
    big_data["review_text"], big_data["endfilm"], big_data["is_spoiler"]
):
    similarity = nlp(review_text).similarity(nlp(end_of_plot))
    similarityValue.append((similarity, is_spoiler))
    results.append((bool(similarity > SIMILARITY_THRESHOLD), is_spoiler))

### Compute the metrics on the bigger subset

In [None]:
# Tabulate the (predicted, real) pairs so that the metrics can read them by column.
prediction_columns = ['Predicted', 'Real']
dataset = pd.DataFrame(results, columns=prediction_columns)

In [None]:
# Compute accuracy, F1-score, recall and precision of the predictions
y_true = dataset['Real']
y_pred = dataset['Predicted']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

In [None]:
# Report the four metrics on a single line.
metrics_summary = f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}"
print(metrics_summary)

### Try to define a new threshold
Based on the average cosine similarity of each of the two classes.

In [None]:
# Turn the collected (score, real label) pairs into a two-column table.
similarity_columns = ['cosine_values', 'Real']
similarityValue = pd.DataFrame(similarityValue, columns=similarity_columns)

In [None]:
# Mean similarity score of the rows whose real label is True.
similarityValue.loc[similarityValue["Real"] == True, "cosine_values"].mean()

In [None]:
# Mean similarity score of the rows whose real label is False.
similarityValue.loc[similarityValue["Real"] == False, "cosine_values"].mean()

In [None]:
def computeSimilar(data, threshold=0.38):
    """Return True when a similarity score is strictly above the threshold.

    Args:
        data: Similarity score of a single review.
        threshold: Decision cut-off. Defaults to 0.38, the value that was
            previously hard-coded in this function.

    Returns:
        bool: True if ``data > threshold``, otherwise False. NaN never
        compares greater, so it also yields False.
    """
    return bool(data > threshold)


In [None]:
# Label each review by passing its similarity score through computeSimilar.
cosine_scores = similarityValue['cosine_values']
similarityValue['Predicted'] = cosine_scores.apply(computeSimilar)

In [None]:
# Compute accuracy, F1-score, recall and precision of the re-thresholded predictions
y_true = similarityValue['Real']
y_pred = similarityValue['Predicted']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

In [None]:
# Report the four metrics on a single line.
metrics_summary = f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}"
print(metrics_summary)

Save the results.

In [None]:
# Append the metrics line to the output file; mode "a" keeps the lines
# written by earlier runs.
report_line = f"Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}"
with open("../Output/outputSentenceSim.txt", "a") as out_file:
    print(report_line, file=out_file)