### Important: check in your computer's "Power options" that it will not go into sleep mode while this notebook is running (the run can last for hours)

In [2]:
import pandas as pd
import json
import numpy as np

### US movies dataset

In [3]:
# Relative paths to the raw input files; all of them are expected under ./data
CHARACTER_DATA_PATH = './data/character.metadata.tsv'
MOVIE_DATA_PATH = './data/movie.metadata.tsv'
SUMMARIES_DATA_PATH = './data/plot_summaries.txt'
NAME_DATA_PATH = './data/name.clusters.txt'
TYPE_DATA_PATH = './data/tvtropes.clusters.txt'

In [4]:
# Column labels for the movie metadata file, which is read without a header row
MOVIE_HEADER = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

# Tab-separated file; the column labels are supplied by MOVIE_HEADER
movies = pd.read_table(
    MOVIE_DATA_PATH,
    header=None,
    names=MOVIE_HEADER,
)

In [5]:
def format_dict(x):
    """Flatten a dictionary into a comma-separated string of its values.

    Parameters
    ----------
    x : dict
        Dictionary whose values are to be joined (keys are ignored).

    Returns
    -------
    str or float
        The values converted to ``str`` and joined with ``','`` in the
        dictionary's iteration order, or ``np.nan`` when ``x`` is empty.
    """
    # Guard clause: an empty dictionary carries no information -> missing value
    if len(x) == 0:
        return np.nan
    return ','.join(map(str, x.values()))
    

# Each of these columns holds one JSON-encoded dictionary per movie. Replace it with the
# comma-separated dictionary values (NaN when the dictionary is empty, see format_dict).
try:
    for column in ('Movie_genres', 'Movie_countries', 'Movie_languages'):
        # The new column is fully built before being assigned, so a failure leaves
        # the original column untouched.
        movies[column] = movies[column].apply(json.loads).apply(format_dict)
except (json.decoder.JSONDecodeError, TypeError):
    # The cell is being re-run on already-formatted data:
    #  - a comma-separated string is not valid JSON      -> JSONDecodeError
    #  - a NaN left by format_dict is a float, not a str -> TypeError
    print('Data has already been parsed and modified.')

# Keep only American movies
# astype(str) turns NaN into the string 'nan', so the match never sees a missing value;
# regex=False because the country name is a literal string, not a pattern.
us_movies = movies[
    movies['Movie_countries'].astype(str).str.contains('United States of America', regex=False)
]
print("Number of US movies : {}".format(len(us_movies)))

Number of US movies : 34408


### Summaries dataset

In [6]:
# Plot summaries: tab-separated file without a header row, two columns per line
summaries = pd.read_table(
    SUMMARIES_DATA_PATH,
    header=None,
    names=['Wikipedia_movie_ID', 'Summary'],
)
# Show a few random rows as a quick sanity check
summaries.sample(3)

Unnamed: 0,Wikipedia_movie_ID,Summary
33324,26825116,Maria is a white farmer who runs a failing co...
40711,19587744,A 13-year-old boy named Billy runs from home a...
3544,29287983,Junior detective Gray discovers that the ecce...


### Merging the two datasets

In [7]:
# Inner join on the Wikipedia ID: only US movies that also have a summary are kept
us_summaries = pd.merge(us_movies, summaries, on='Wikipedia_movie_ID', how='inner')
print("Number of US movies with summary : {}".format(us_summaries.shape[0]))

Number of US movies with summary : 20788


### Splitting the dataframe (a different part of the dataframe for each person)

In [8]:
# Sort by movie ID so that everybody iterates over the movies in the same order
# before the rows are split between people
us_summaries = us_summaries.sort_values('Wikipedia_movie_ID')

# Row ranges [start, stop) assigned to each person (note : len(us_summaries) = 20788).
# Only the range of the person running this notebook is left uncommented.
#range_for_ben = [0,4000]
#range_for_romain = [4000,8000]
range_for_augustin = [8000,12000]
#range_for_erwann = [12000,16000]
#range_for_lucas = [16000,20789]

In [9]:
# Positional row slice [start, stop) of the sorted dataframe for this person's share
split_us_summaries = us_summaries.iloc[slice(*range_for_augustin)]

### Vectorization of summaries

In [None]:
# Packages to be installed (other installations might be needed, see below)
!pip install sentence_transformers
!pip install textract

In [10]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import textract
from tqdm import tqdm
import nltk
import pickle

In [11]:
# Fetch the NLTK 'punkt' data package
# (presumably the tokenizer data that sent_tokenize relies on — confirm)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\beynes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Sentence-embedding model used by text2vec to encode each sentence.
# NOTE(review): the model name is resolved by sentence_transformers and may trigger
# a download on first use — confirm.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [13]:
def text2vec(text):
    """Embed a text sentence by sentence and average the sentence embeddings.

    Parameters
    ----------
    text : str
        The whole text as one string.

    Returns
    -------
    tuple
        ``(mean_vector, df_text_embeddings)`` where ``df_text_embeddings`` is a
        pandas Series holding one embedding vector per sentence (in sentence
        order) and ``mean_vector`` is the element-wise mean of those vectors.

    Raises
    ------
    ValueError
        If no sentence can be extracted from ``text`` (e.g. an empty summary).
        The mean of zero vectors is undefined; without this check the function
        would divide by zero or silently return a scalar NaN.
    """
    # Remove special characters: non-breaking space -> regular space, and drop '\xc2'
    # (presumably a leftover of mis-decoded non-breaking spaces — confirm)
    text = text.replace(u'\xa0', u' ')
    text = text.replace(u'\xc2', '')

    # Split the whole text into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        raise ValueError("Cannot embed a text that contains no sentence")

    # Embedding using BERT model (can convert a sentence into a vector of dimension 768).
    # All sentences are encoded in a single batched call rather than one model call
    # per sentence.
    # NOTE(review): expected to match per-sentence encoding up to floating-point
    # rounding — confirm.
    sentence_vectors = model.encode(sentences)

    # One vector per sentence, same container type as before (Series of 1-D arrays)
    df_text_embeddings = pd.Series(list(sentence_vectors))

    # Collapse all sentence vectors into one single vector by taking the mean
    mean_vector = np.mean(np.asarray(sentence_vectors), axis=0)

    return (mean_vector, df_text_embeddings)

In [25]:
# Maps Wikipedia movie ID -> value returned by text2vec for that movie's summary
embeddings = {}

for index, movie in tqdm(split_us_summaries.iterrows(), total=split_us_summaries.shape[0]):
    try:
        embeddings[movie['Wikipedia_movie_ID']] = text2vec(movie['Summary'])
    except Exception as error:
        # Best effort: a failing summary is reported and skipped so the long run goes on.
        # Catching Exception (not a bare `except:`) keeps KeyboardInterrupt/SystemExit
        # working, so the loop can still be interrupted; the cause is printed so
        # failures can be diagnosed afterwards.
        print("An error occurred for movie ID {}: {!r}".format(movie['Wikipedia_movie_ID'], error))

# Store embeddings (serialize)
with open('embeddings_augustin.pickle', 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.56s/it]


In [26]:
# To load the embeddings
# (same file name as the one written by the serialization step above)
# WARNING: pickle.load can execute arbitrary code — only load pickle files that
# come from a trusted source.
with open('embeddings_augustin.pickle', 'rb') as handle:
    b = pickle.load(handle)