# Compute sentence embeddings from movie synopses

In [9]:
import pandas as pd
import json
import numpy as np

from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import textract
from tqdm import tqdm
import nltk
import pickle

### US movies dataset

In [10]:
# Input data files, given relative to the notebook's working directory.
# The cells below read MOVIE_DATA_PATH and SUMMARIES_DATA_PATH; the other three
# paths are not used in the cells shown here — TODO confirm they are still needed.
CHARACTER_DATA_PATH = './data/character.metadata.tsv'
MOVIE_DATA_PATH = './data/movie.metadata.tsv'
SUMMARIES_DATA_PATH = './data/plot_summaries.txt'
NAME_DATA_PATH = './data/name.clusters.txt'
TYPE_DATA_PATH = './data/tvtropes.clusters.txt'

In [11]:
# Column names for the movie metadata file (tab-separated, read without a header row)
MOVIE_HEADER = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

movies = pd.read_csv(MOVIE_DATA_PATH, sep='\t', header=None, names=MOVIE_HEADER)

In [12]:
# Number of rows in the movie metadata table
movies.shape[0]

81741

In [13]:
def format_dict(x):
    """Join the values of a mapping into one comma-separated string.

    Parameters
    ----------
    x : dict
        Mapping whose values are kept; its keys are ignored.

    Returns
    -------
    str or float
        The values, each converted with ``str`` and joined by ``','`` in
        iteration order, or ``np.nan`` when ``x`` is empty.
    """
    # Guard clause: an empty mapping is reported as a missing value
    if len(x) == 0:
        return np.nan
    return ','.join(map(str, x.values()))
    

# Decode the JSON-encoded columns into comma-separated strings of their values.
# Re-running this cell finds the columns already converted: json.loads then raises
# JSONDecodeError on the plain strings, or TypeError on the NaN that format_dict
# stores for empty mappings. Both mean "already parsed", so both are caught.
try:
    for column in ('Movie_genres', 'Movie_countries', 'Movie_languages'):
        movies[column] = movies[column].apply(json.loads).apply(format_dict)
except (json.decoder.JSONDecodeError, TypeError):
    print('Data has already been parsed and modified.')

# Keep only American movies
# (astype(str) turns the NaN of movies without a country into 'nan' so that
# str.contains yields a plain boolean mask)
us_movies = movies[movies['Movie_countries'].astype(str).str.contains('United States of America')]
print("Number of US movies : {}".format(len(us_movies)))

Number of US movies : 34408


### Summaries dataset

In [14]:
# Plot summaries: tab-separated, read without a header row, one (movie ID, summary) pair per row
SUMMARY_COLUMNS = ['Wikipedia_movie_ID', 'Summary']
summaries = pd.read_csv(SUMMARIES_DATA_PATH, sep='\t', header=None, names=SUMMARY_COLUMNS)
# Preview three randomly chosen rows
summaries.sample(n=3)

Unnamed: 0,Wikipedia_movie_ID,Summary
14495,17871536,Twenty-one year old Valentine is a part-time ...
19343,15510926,"After being fired as a theater usher, Dan Quig..."
7428,28253687,Within an old dilapidated and seemingly automa...


### Merge the two datasets

In [15]:
# Inner join on the Wikipedia ID: keep only the US movies that have a plot summary
us_summaries = pd.merge(us_movies, summaries, on='Wikipedia_movie_ID', how='inner')
print("Number of US movies with summary : {}".format(us_summaries.shape[0]))

Number of US movies with summary : 20788


In [16]:
# Order the merged table by ascending Wikipedia movie ID
us_summaries = us_summaries.sort_values('Wikipedia_movie_ID', ascending=True)

In [18]:
# Show the first five rows of the sorted table
us_summaries.head(5)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Summary
3710,3217,/m/014hr,Army of Darkness,1992-10-09,21502796.0,81.0,English Language,United States of America,"Cult,Horror,Stop motion,Costume drama,Action/A...","After being pulled through a time portal, Ash ..."
687,3333,/m/0151l,The Birth of a Nation,1915,50000000.0,190.0,"Silent film,English Language",United States of America,"Silent film,Indie,Costume drama,Epic,Black-and...",The film follows two juxtaposed families: the...
12322,3746,/m/017n9,Blade Runner,1982-06-25,33139618.0,116.0,"Japanese Language,Cantonese,English Language,G...","United States of America,Hong Kong","Thriller,Cyberpunk,Science Fiction,Future noir...","{{Hatnote}} In Los Angeles, November 2019, ret..."
12478,3837,/m/018f8,Blazing Saddles,1974-02-07,119500000.0,93.0,"Yiddish Language,English Language",United States of America,"Western,Satire,Comedy","In the American Old West of 1874, construction..."
2537,3947,/m/0191n,Blue Velvet,1986-08,8551228.0,120.0,English Language,United States of America,"Thriller,Mystery,Crime Fiction",Jeffrey Beaumont returns to his logging home ...


### Vectorization of summaries

In [11]:
# sent_tokenize relies on the Punkt sentence-tokenizer data.
# Older NLTK releases load the 'punkt' package, while recent ones look for
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
# nltk.download returns False (it does not raise) when a package cannot be fetched.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\beynes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Name of the pretrained sentence-embedding model, kept in one place so it is easy to swap
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

**Note**

We first tried another sentence-embedding model, 'bert-base-nli-mean-tokens', but it appears to be deprecated.
Instead, we chose 'all-MiniLM-L6-v2' (see https://www.sbert.net/docs/pretrained_models.html), which offers a good trade-off between embedding accuracy and computation time.

In [21]:
def text2vec(text):
    """Embed a text sentence by sentence and average the sentence vectors.

    Uses the module-level ``model`` (a SentenceTransformer) and NLTK's
    ``sent_tokenize``.

    Parameters
    ----------
    text : str
        The whole text as one string.

    Returns
    -------
    mean_vector : numpy.ndarray
        Element-wise mean of all the sentence embeddings of ``text``.
    df_text_embeddings : pandas.Series
        One embedding vector per sentence, in sentence order.

    Raises
    ------
    ValueError
        If no sentence is found in ``text`` (e.g. an empty string); the mean
        of zero vectors is undefined and previously surfaced as a
        pandas-version-dependent 0/0 (NaN or ZeroDivisionError).
    """
    # Remove special characters: non-breaking spaces become plain spaces,
    # stray '\xc2' characters are dropped
    text = text.replace(u'\xa0', u' ').replace(u'\xc2', '')

    # Split the whole text into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        raise ValueError("text2vec: no sentence found in the input text")

    # Encode every sentence in a single call so the model can batch them,
    # instead of running one forward pass per sentence. Values may differ from
    # per-sentence encoding by floating-point rounding only.
    sentence_matrix = np.asarray(model.encode(sentences))

    # Keep the original return shape: a Series holding one vector per sentence
    df_text_embeddings = pd.Series(list(sentence_matrix))

    # Collapse all sentence vectors into a single vector by taking the mean
    mean_vector = sentence_matrix.mean(axis=0)

    return (mean_vector, df_text_embeddings)

In [22]:
# Maps each Wikipedia movie ID to the (mean vector, per-sentence embeddings)
# pair returned by text2vec
embeddings = {}

for index, movie in tqdm(us_summaries.iterrows(), total=us_summaries.shape[0]):
    try:
        embeddings[movie['Wikipedia_movie_ID']] = text2vec(movie['Summary'])
    except Exception as error:
        # Best effort: a summary that cannot be embedded is reported and skipped.
        # Catching Exception rather than using a bare `except:` keeps
        # KeyboardInterrupt/SystemExit working, so this long loop can be stopped,
        # and printing the error shows why a movie was skipped.
        print("An error occurred for movie ID {}: {!r}".format(movie['Wikipedia_movie_ID'], error))

# Store embeddings (serialize)
# NOTE: only load this pickle back from a trusted location; unpickling runs arbitrary code.
with open('all_embeddings.pickle', 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████████████████████████████████████████████████████████████████████| 20788/20788 [1:47:49<00:00,  3.21it/s]
