In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [20]:
from utils.character_attributes_extraction import character_names_from_text, character_attributes_from_text, character_active_verbs_from_text, character_patient_verbs_from_text

In [3]:
# Read the tab-separated plot summaries. No header row is taken from the file:
# `names` supplies the two column labels.
plots = pd.read_csv(
    'data/MovieSummaries/plot_summaries.txt',
    names=['wiki_id', 'plot'],
    sep='\t',
)

# Normalise whitespace: every run of whitespace becomes a single space and
# leading/trailing whitespace is dropped.
plots['plot'] = plots['plot'].map(lambda text: ' '.join(text.split()))

plots.head(5)

Unnamed: 0,wiki_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six year...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


# Extract character names

## Spacy

[spaCy](https://spacy.io/usage/linguistic-features) is a really nice library that makes many NLP tasks easy.
We will mostly use named entity recognition and dependency parsing. We will probably also use its built-in word2vec-style word vectors.

In [4]:
# NOTE(review): this import lives outside the notebook's top import cell;
# consider moving it there so all dependencies are visible in one place.
import spacy

# Load the "en_core_web_md" pipeline once; `nlp` is the callable used below to
# turn raw text into a parsed document.
nlp = spacy.load("en_core_web_md")

In [5]:
# Run the pipeline on a single example sentence so its parse can be inspected.
doc = nlp("All happy families are alike; each unhappy family is unhappy in its own way.")

In [6]:
# NOTE(review): this import lives outside the notebook's top import cell;
# consider moving it there.
from spacy import displacy
# Draw the dependency parse (style='dep') of the example document.
displacy.render(doc, style='dep')

In [13]:
# Only a small sample of the plots is parsed for now, because the extraction
# method still needs refining. Raise SAMPLE_SIZE to process more of the dataset.
SAMPLE_SIZE = 300

# Declaring the columns up front keeps the resulting frame's schema stable even
# when no character is found in the whole sample (an empty record list would
# otherwise produce a frame with no columns at all).
CHARACTER_COLUMNS = ['wiki_id', 'character', 'adj', 'active', 'patient']

# Slice first, then iterate: only the sampled rows are materialised instead of
# building a row object for every plot in the dataset and discarding most.
plots_sample = plots.head(SAMPLE_SIZE)

character_list = []
for movie in tqdm(plots_sample.itertuples(index=False), total=len(plots_sample)):
    plot = movie.plot

    # Each helper is called on the full plot text; the last three return
    # mappings that are looked up by character name below.
    character_names = character_names_from_text(plot)
    character_attributes = character_attributes_from_text(plot)
    character_active_verbs = character_active_verbs_from_text(plot)
    character_patient_verbs = character_patient_verbs_from_text(plot)

    # One record per (movie, character); characters missing from a mapping
    # get an empty list for that field.
    for name in character_names:
        character_list.append(
            {
                'wiki_id': movie.wiki_id,
                'character': name,
                'adj': character_attributes.get(name, []),
                'active': character_active_verbs.get(name, []),
                'patient': character_patient_verbs.get(name, []),
            }
        )

character_df = pd.DataFrame(character_list, columns=CHARACTER_COLUMNS)
character_df.head()
    

100%|██████████| 300/300 [03:14<00:00,  1.54it/s]


Unnamed: 0,wiki_id,character,adj,active,patient
0,31186339,Haymitch Abernathy,[],[warns],[]
1,31186339,Snow,[],"[summons, considers]",[]
2,31186339,Katniss,[],"[taken, survives, drops, warned, runs, shoots,...","[gave, find, tormenting, spares, force, tells,..."
3,31186339,Peeta Mellark,[son],"[reveals, meant, forms, begs, tells]",[]
4,31186339,Primrose Everdeen,[old],[chosen],[]


In [14]:
# Persist the character table to CSV.
# NOTE(review): with default arguments `to_csv` also writes the row index as an
# unnamed first column, and list-valued cells are written in their string form,
# so a reader of this file has to parse them back into lists — confirm that
# downstream code handles both.
character_df.to_csv('data/character_attributes_sample_300.csv')

In [19]:
# Keep only the characters whose 'active' entry holds more than five items.
has_many_active_verbs = character_df['active'].map(len) > 5
character_df[has_many_active_verbs]


Unnamed: 0,wiki_id,character,adj,active,patient
2,31186339,Katniss,[],"[taken, survives, drops, warned, runs, shoots,...","[gave, find, tormenting, spares, force, tells,..."
22,20663735,Maranchery Karunakara Menon,[],"[returns, refuses, regrets, accused, judged, s...",[]
24,2231378,Kid,[swindler],"[touting, comes, convinces, provide, make, dec...",[sentenced]
29,2231378,Charley,"[Oxford, Oxford]","[decides, kidnap, reveals, moving, sent, demands]","[visits, pay, overpowers]"
34,595909,Lindy,[wife],"[returns, saw, seems, charged, found, insisted]",[]
...,...,...,...,...,...
1571,8204853,Josie McBroom,"[When, proud, stand]","[convinces, comes, finds, patches, discover, t...",[call]
1579,4179195,Troy Duffy,[bartender],"[riding, taken, direct, works, enjoys, insults...","[hire, singling]"
1580,204774,Douzi,[Bean],"[escape, walks, screams, attaches, perform, sa...","[urges, gave]"
1587,204774,Master Guan,[director],"[refuses, beating, begins, urges, trains, shames]",[]
