In [2]:
import pandas as pd
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Movies dataset

In [3]:
# Single place to change when the data files live somewhere else; the
# resulting path values are the same as the previously hard-coded ones.
DATA_DIR = './data'

CHARACTER_DATA_PATH = DATA_DIR + '/character.metadata.tsv'
MOVIE_DATA_PATH = DATA_DIR + '/movie.metadata.tsv'
SUMMARIES_DATA_PATH = DATA_DIR + '/plot_summaries.txt'
NAME_DATA_PATH = DATA_DIR + '/name.clusters.txt'
TYPE_DATA_PATH = DATA_DIR + '/tvtropes.clusters.txt'

In [4]:
# Column names applied to the movie metadata file, in column order.
MOVIE_HEADER = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

# The file is read as tab-separated with no header row, so the names are
# supplied explicitly.
movie = pd.read_csv(MOVIE_DATA_PATH, sep='\t', header=None, names=MOVIE_HEADER)

In [10]:
def format_dict(x):
    """Flatten the values of a dictionary into one comma-separated string.

    Parameters
    ----------
    x : dict
        Mapping whose values are joined (each value is converted with ``str``).

    Returns
    -------
    str or float
        The values joined with ``','`` in iteration order, or ``np.nan`` when
        ``x`` is empty.
    """
    # Guard clause: an empty mapping carries no information.
    if not len(x):
        return np.nan
    return ','.join(map(str, x.values()))
    

try:
    # Convert every JSON-encoded column before assigning any of them, so a
    # failure cannot leave `movie` with only some of its columns converted.
    parsed_columns = {
        column: movie[column].apply(json.loads).apply(format_dict)
        for column in ('Movie_genres', 'Movie_countries', 'Movie_languages')
    }
except (json.decoder.JSONDecodeError, TypeError):
    # Re-running the cell: already-converted values are either plain strings
    # (json.loads raises JSONDecodeError) or NaN floats produced by format_dict
    # for empty dictionaries (json.loads raises TypeError on non-string input).
    print('Data has already been parsed and modified.')
else:
    for column, values in parsed_columns.items():
        movie[column] = values

# Keep only American movies
# astype(str) turns missing countries (NaN) into text so the match returns a
# clean boolean mask; regex=False makes the country name a literal match.
is_us_movie = movie['Movie_countries'].astype(str).str.contains(
    'United States of America', regex=False)
us_movies = movie[is_us_movie]
print("Number of US movies : {}".format(len(us_movies)))

Data has already been parsed and modified.
Number of US movies : 34408


# Summaries dataset

In [9]:
# Plot summaries: read as tab-separated with no header row, one
# (Wikipedia_movie_ID, Summary) pair per line.
summaries = pd.read_csv(
    SUMMARIES_DATA_PATH,
    sep='\t',
    header=None,
    names=['Wikipedia_movie_ID', 'Summary'],
)
# Peek at a few random rows.
summaries.sample(5)

Unnamed: 0,Wikipedia_movie_ID,Summary
21354,23437432,Following the multiple-Aema theme started in M...
9162,3151857,Leonard Helperman is a 4th grader whose mother...
20874,3657826,Sam Dalmas is an American writer currently li...
15311,4894283,A single eventful night in the lives of a crew...
33876,6832750,Dr. Alice Dodgson gets her medical license re...


In [20]:
# Missing summaries are real NaN values (pandas converts blank/'NaN' fields to
# NaN while reading), so they have to be detected with isna(); comparing the
# column against the string 'NaN' never matches them.
summaries[summaries["Summary"].isna()]

Unnamed: 0,Wikipedia_movie_ID,Summary


# Merging

In [12]:
# Inner join on the Wikipedia ID: only US movies that have a plot summary remain.
us_summaries = pd.merge(us_movies, summaries, on=['Wikipedia_movie_ID'], how='inner')
print("Number of US movies with summary : {}".format(us_summaries.shape[0]))

Number of US movies with summary : 20788


# Vectorization of summaries

In [13]:
!pip install sentence_transformers
!pip install textract

Collecting sentence_transformers
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.6.0 from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata
  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
     ---------------------------------------- 0.0/123.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/123.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/123.1 kB ? eta -:--:--
     ------------ ------------------------ 41.0/123.1 kB 279.3 kB/s eta 0:00:01
     ------------------------------------ 123.1/123.1 kB 718.6 kB/s eta 0:00:00
Collecting tqdm (from sentence_transformers)
  Obtaining dependenc

Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl (23 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Using cached argcomplete-1.10.3-py2.py3-none-any.whl (36 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Using cached beautifulsoup4-4.8.2-py3-none-any.whl (106 kB)
Collecting chardet==3.* (from textract)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ---------------------------------------- 0.0/133.4 kB ? eta -:--:--
     --- ------------------------------------ 10.2/133.4 kB ? eta -:--:--
     --- ------------------------------------ 10.2/133.4 kB ? eta -:--:--
     ----------- ------------------------- 41.0/133.4 kB 245.8 kB/s eta 0:00:01
     -----------------------------------  133.1/133.4 kB 787.7 kB/s eta 0:00:01
     ------------------------------------ 133.4/133.4 kB 658.4 kB/s eta 0:00:00
Collecting docx2txt~=0.8 (from textract)
  Using cached docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadat

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [14]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import textract
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import nltk
# sent_tokenize needs the 'punkt' tokenizer data. Keeping the download active
# lets the notebook run on a fresh environment; when the package is already
# present nltk only reports that it is up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\beynes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Name of the pretrained sentence-embedding model handed to SentenceTransformer.
MODEL_NAME = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

In [17]:
def text2vec(text):
    """Embed a text sentence by sentence and average the sentence vectors.

    Parameters
    ----------
    text : str
        The whole text as one string.

    Returns
    -------
    tuple
        ``(mean_vector, df_text_embeddings)``. ``df_text_embeddings`` is a
        ``pd.Series`` holding one embedding vector per sentence, in sentence
        order; ``mean_vector`` is the element-wise mean of those vectors.
        When the text contains no sentence, ``mean_vector`` is filled with NaN
        and the Series is empty.
    """
    # Remove special characters
    text = text.replace(u'\xa0', u' ')
    text = text.replace(u'\xc2', '')
    # Split the whole text into sentences
    sentences = sent_tokenize(text)

    if not sentences:
        # Nothing to embed: return a NaN vector of the model's output size
        # instead of failing on a 0/0 division.
        empty_mean = np.full(model.get_sentence_embedding_dimension(), np.nan,
                             dtype=np.float32)
        return (empty_mean, pd.Series([], dtype=object))

    # Encode all sentences in a single call so the model can batch them,
    # instead of running one forward pass per sentence. The result is a 2-D
    # array with one row per sentence.
    embeddings = model.encode(sentences)
    df_text_embeddings = pd.Series(list(embeddings))
    # Collapse the per-sentence vectors into one vector by taking the mean
    mean_vector = embeddings.mean(axis=0)

    return (mean_vector, df_text_embeddings)

In [18]:
# Embed every US-movie summary, with a tqdm progress bar over the summaries.
# NOTE(review): each stored entry is the full (mean_vector, per-sentence
# embeddings) tuple returned by text2vec, not only the mean vector - confirm
# that this is what the 'Embedding' column is meant to hold.
sentences = us_summaries['Summary'].tolist()
sentence_embeddings = [text2vec(summary_text) for summary_text in tqdm(sentences)]
us_summaries['Embedding'] = sentence_embeddings

  0%|                                                                            | 16/20788 [00:40<14:43:43,  2.55s/it]


KeyboardInterrupt: 