In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import spacy

In [2]:
def process_text(documents:dict, nlp_pipeline, n_process:int=8, batch_size:int=100):
    '''
    Using Spacy's NLP pipeline, lemmatize and tokenize text documents.
    Adapted from ADA 2022 text analysis exercise.

    Parameters
    ----------
    documents : dict
        Mapping of document ID -> raw text of the document.
    nlp_pipeline
        Object exposing a spaCy-style ``pipe(texts, n_process=..., batch_size=...)``
        method that yields one processed document per input text, in input order.
    n_process : int
        Forwarded to ``nlp_pipeline.pipe`` (number of worker processes).
    batch_size : int
        Forwarded to ``nlp_pipeline.pipe`` (number of texts per batch).

    Returns
    -------
    list of dict
        One ``{'ID': doc_id, 'Text': tokens}`` record per input document, where
        ``tokens`` is the list of kept lemmas followed by the multi-token named
        entities of that document.

    Notes
    -----
    Relies on the module-level ``STOPWORDS`` collection and on ``tqdm`` being
    defined before the function is called.
    '''

    # Freeze the ID order once; values() iterates in the same order as keys().
    doc_ids = list(documents.keys())
    doc_vals = documents.values()
    n_docs = len(documents)

    # Use the pipeline passed in by the caller (not a global one), so the
    # function works with whichever pipeline the caller provides.
    parsed_docs = nlp_pipeline.pipe(doc_vals, n_process=n_process, batch_size=batch_size)

    processed_docs = list()
    for doc_id, doc in tqdm(zip(doc_ids, parsed_docs), total=n_docs):

        # Grab the named entities before `doc` is reduced to plain strings.
        ents = doc.ents

        # Keep only alphabetic tokens (no numbers, no punctuation) that spaCy
        # does not flag as stop words, and replace each by its lemma.
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

        # The lemma itself may be a stop word even if the surface form was not:
        # filter again on the lemma, and keep only lemmas of length 3 or more.
        tokens = [lemma for lemma in tokens if lemma not in STOPWORDS and len(lemma) > 2]

        # Add named entities, but only if they span more than one token.
        tokens.extend(str(entity) for entity in ents if len(entity) > 1)

        processed_docs.append({'ID': doc_id, 'Text': tokens})

    return processed_docs

In [3]:
# Root data directory, relative to the notebook's working directory.
data_path = Path('..') / 'data'

### Read movie ID conversion table

For consistency, we index everything by Wikidata IDs, rather than Freebase or IMDb IDs.

In [4]:
# Conversion table between Freebase, Wikidata and IMDb identifiers.
movie_ids_file = data_path / 'processed' / 'wikipedia_ids.csv'
movie_ids = pd.read_csv(movie_ids_file)
movie_ids.head()

Unnamed: 0,Freebase ID,Wikidata URI,Wikidata ID,IMDb ID
0,/m/03vyhn,http://www.wikidata.org/entity/Q261700,Q261700,tt0228333
1,/m/08yl5d,http://www.wikidata.org/entity/Q16250726,Q16250726,tt0245916
2,/m/0crgdbh,http://www.wikidata.org/entity/Q4978832,Q4978832,tt0094806
3,/m/0285_cd,http://www.wikidata.org/entity/Q7995657,Q7995657,tt0094320
4,/m/01mrr1,http://www.wikidata.org/entity/Q869644,Q869644,tt0083949


### Initialize an NLP pipeline

In [2]:
# Set the computing device. Transformer pipelines benefit from a GPU, but the
# 'en_core_web_lg' pipeline loaded below is optimized for CPU.
# This must run before the pipeline is loaded.
spacy.require_cpu()

# Initialize an NLP pipeline
nlp = spacy.load('en_core_web_lg')

# Take the stop-word set from the loaded pipeline's language defaults. This is
# the same set as spacy.lang.en.stop_words.STOP_WORDS, but does not depend on
# the `spacy.lang.en` submodule having been imported as a side effect of
# spacy.load().
STOPWORDS = nlp.Defaults.stop_words

## Tokenization & lemmatization of subtitles

In [5]:
subtitles_path = data_path.joinpath('raw/subtitles/')

# Use IMDb IDs to get Wikidata IDs
movie_ids_imdb = movie_ids.set_index('IMDb ID').copy()

subtitles = {}
# Each subtitle file is named after the IMDb ID of its movie.
# Sort the paths so the resulting document order does not depend on the file system.
for subtitle in tqdm(sorted(subtitles_path.glob('*.txt'))):
    imdb_id = subtitle.stem

    wikidata_id = movie_ids_imdb.loc[imdb_id, 'Wikidata ID']

    # Some IMDb IDs appear more than once in the conversion table (e.g.
    # 'tt0250469': the film and the TV series named 'Killjoy' seem to share the
    # same IMDb ID). The lookup then returns a Series; keep the first match,
    # selected explicitly by position.
    if isinstance(wikidata_id, pd.Series):
        wikidata_id = wikidata_id.iloc[0]

    # Read a subtitle and record it with its Wikidata ID
    with open(subtitle, 'r', encoding='utf-8') as file:
        subtitles[wikidata_id] = file.read().replace('\n', ' ')

  0%|          | 0/27329 [00:00<?, ?it/s]

In [7]:
# Tokenize and lemmatize every subtitle with the spaCy pipeline.
subtitles_tokinezed = process_text(
    documents=subtitles,
    nlp_pipeline=nlp,
    n_process=8,
    batch_size=16,
)

  0%|          | 0/27329 [00:00<?, ?it/s]

In [13]:
# Turn the list of {'ID', 'Text'} records into a DataFrame with descriptive column names.
subtitles_tokinezed = pd.DataFrame(subtitles_tokinezed).rename(
    columns={'ID': 'Wikidata ID', 'Text': 'Subtitle'}
)
subtitles_tokinezed.head()

Unnamed: 0,Wikidata ID,Subtitle
0,Q1755660,"['Divine', 'Comedy', 'Dante', 'Alighieri', 'HE..."
1,Q2576383,"['MABEL', 'WHEEL', 'Parte', 'Farce', 'Comedy',..."
2,Q2294948,"['fireman', 'let', 'house', 'insuranceand', 'd..."
3,Q1923676,"['cure', 'health', 'spring', 'throw', 'liquor'..."
4,Q1070484,"['dog', 'life', 'Dawn', 'scrap', 'thoroughbred..."


In [12]:
# The file name advertises gzip, but to_parquet defaults to snappy compression:
# request gzip explicitly so the content matches the '.gzip' suffix.
subtitles_tokinezed.to_parquet(
    data_path.joinpath('processed/subtitles_tokenized.parquet.gzip'),
    compression='gzip',
)

## Tokenization & lemmatization of plot summaries

In [14]:
# The plot summaries file is tab-separated and has no header row,
# so the column names are supplied explicitly.
col_names = ['Wikipedia movie ID', 'Summary']
plot_summaries = pd.read_csv(
    data_path / 'raw' / 'MovieSummaries' / 'plot_summaries.txt',
    sep='\t',
    names=col_names,
)
plot_summaries.head()

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [15]:
# Column names for the header-less, tab-separated movie metadata file.
col_names = [
    'Wikipedia movie ID', 'Freebase movie ID',
    'Movie name', 'Movie release date',
    'Movie box office revenue', 'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]
df_movies = pd.read_csv(
    data_path / 'raw' / 'MovieSummaries' / 'movie.metadata.tsv',
    sep='\t',
    names=col_names,
)
df_movies.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [16]:
# Match Wikipedia IDs across CMU dataset and mask accordingly
mask = plot_summaries['Wikipedia movie ID'].isin(df_movies['Wikipedia movie ID'])
df_summaries = plot_summaries[mask].copy().reset_index(drop=True)

freebase_ids = df_movies.set_index('Wikipedia movie ID').loc[df_summaries['Wikipedia movie ID']]['Freebase movie ID']
df_summaries['Freebase ID'] = freebase_ids.reset_index(drop=True)
df_summaries = df_summaries.set_index('Freebase ID').drop('Wikipedia movie ID', axis=1)

df_summaries.head()

Unnamed: 0_level_0,Summary
Freebase ID,Unnamed: 1_level_1
/m/076w2lb,"Shlykov, a hard-working taxi driver and Lyosha..."
/m/0gkz15s,The nation of Panem consists of a wealthy Capi...
/m/051zjwb,Poovalli Induchoodan is sentenced for six yea...
/m/06xtz3,"The Lemon Drop Kid , a New York City swindler,..."
/m/02tqm5,Seventh-day Adventist Church pastor Michael Ch...


In [17]:
# Build a {Freebase ID: summary text} dict. Select the column explicitly:
# squeeze() would collapse a single-row frame to a bare string, which has no to_dict().
summaries = df_summaries['Summary'].to_dict()

In [18]:
# Tokenize and lemmatize every plot summary with the spaCy pipeline.
summaries_tokinezed = process_text(
    documents=summaries,
    nlp_pipeline=nlp,
    n_process=8,
    batch_size=160,
)

  0%|          | 0/42204 [00:00<?, ?it/s]

In [22]:
# Turn the list of {'ID', 'Text'} records into a DataFrame with descriptive column names.
summaries_tokinezed = pd.DataFrame(summaries_tokinezed).rename(
    columns={'ID': 'Freebase ID', 'Text': 'Summary'}
)
summaries_tokinezed.head()

Unnamed: 0,Freebase ID,Subtitle
0,/m/076w2lb,"[Shlykov, hard, work, taxi, driver, Lyosha, sa..."
1,/m/0gkz15s,"[nation, Panem, consist, wealthy, Capitol, poo..."
2,/m/051zjwb,"[Poovalli, Induchoodan, sentence, year, prison..."
3,/m/06xtz3,"[Lemon, Drop, Kid, New, York, City, swindler, ..."
4,/m/02tqm5,"[seventh, day, Adventist, Church, pastor, Mich..."


In [24]:
# Match Freebase IDs with Wikidata IDs
mask = summaries_tokinezed['Freebase ID'].isin(movie_ids['Freebase ID'])
summaries_tokinezed['Wikidata ID'] = None

wikidata_ids = movie_ids.set_index('Freebase ID').loc[summaries_tokinezed.loc[mask, 'Freebase ID']]['Wikidata ID']
summaries_tokinezed.loc[mask, 'Wikidata ID'] = wikidata_ids.values

summaries_tokinezed.head()

Unnamed: 0,Freebase ID,Subtitle,Wikidata ID
0,/m/076w2lb,"[Shlykov, hard, work, taxi, driver, Lyosha, sa...",Q2552456
1,/m/0gkz15s,"[nation, Panem, consist, wealthy, Capitol, poo...",Q212965
2,/m/051zjwb,"[Poovalli, Induchoodan, sentence, year, prison...",Q6965425
3,/m/06xtz3,"[Lemon, Drop, Kid, New, York, City, swindler, ...",Q7746905
4,/m/02tqm5,"[seventh, day, Adventist, Church, pastor, Mich...",Q1249239


In [25]:
# The file name advertises gzip, but to_parquet defaults to snappy compression:
# request gzip explicitly so the content matches the '.gzip' suffix.
summaries_tokinezed.to_parquet(
    data_path.joinpath('processed/summaries_tokenized.parquet.gzip'),
    compression='gzip',
)