In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
from utils.character_attributes_extraction import character_names_from_text, character_attributes_from_text, character_active_verbs_from_text, character_patient_verbs_from_text

In [None]:
# Read the tab-separated plot summaries; the file has no header row, so the
# two column names are supplied explicitly.
plots = pd.read_csv(
    'data/MovieSummaries/plot_summaries.txt',
    sep='\t',
    names=['wiki_id', 'plot'],
)

# Collapse every run of whitespace in each summary into a single space
# (this also trims leading and trailing whitespace).
plots['plot'] = plots['plot'].map(lambda text: ' '.join(text.split()))

plots.head(5)

# Extract character names

## Spacy

[spaCy](https://spacy.io/usage/linguistic-features) is a really nice library that makes many NLP tasks easy.
We will mostly use its named entity recognition and dependency parsing. We will probably also use its built-in word vectors (word2vec-style embeddings).

In [None]:
import spacy

# Load the `en_core_web_md` pipeline once and keep it as `nlp` for the cells below.
nlp = spacy.load("en_core_web_md")

In [None]:
# Run the pipeline on a single sample sentence so its parse can be inspected.
doc = nlp("As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into an enormous insect.")

In [None]:
from spacy import displacy
# Render the sample sentence with displaCy's dependency ('dep') style.
displacy.render(doc, style='dep')

In [None]:
# Use the summary stored under row label 51 as the running example and print it.
plot = plots.loc[51, 'plot']
print(plot)

In [None]:
# Show what the attribute extractor returns for the example plot.
# The extraction loop later in this notebook looks this result up by character name.
character_attributes_from_text(plot)

In [None]:
# Show what the active-verb extractor returns for the same example plot.
# The extraction loop later in this notebook looks this result up by character name.
character_active_verbs_from_text(plot)

Extracting features for characters in each movie.

In [None]:
%%script false --no-raise-error
# it takes 5 hours 22 minutes to run 

# Build one record per (movie, character) pair: for every plot summary, run the
# four extractors and attach each character's adjectives, active verbs and
# patient verbs (an empty list when an extractor has nothing for that name).
character_list = []

# Stream the rows straight into tqdm instead of first copying every row into a
# list; passing `total` keeps the progress bar and ETA working for the generator.
for _, row in tqdm(plots.iterrows(), total=len(plots)):
    plot = row['plot']
    character_names = character_names_from_text(plot)
    character_attributes = character_attributes_from_text(plot)
    character_active_verbs = character_active_verbs_from_text(plot)
    character_patient_verbs = character_patient_verbs_from_text(plot)
    for name in character_names:
        character_list.append(
            {
                'wiki_id': row['wiki_id'],
                'character': name,
                'adj': character_attributes.get(name, []),
                'active': character_active_verbs.get(name, []),
                'patient': character_patient_verbs.get(name, []),
            }
        )

# Assemble the frame once from the collected records.
character_df = pd.DataFrame(character_list)
character_df.head()

In [None]:
%%script false --no-raise-error

# Persist the extracted features to CSV (the index is written as the first column).
# The 'adj', 'active' and 'patient' columns hold Python lists, so they need a
# converter when the file is read back.
character_df.to_csv('data/character_attributes.csv')

In [None]:
%%script false --no-raise-error

# Keep only the characters whose 'active' list has more than five entries.
character_df.loc[character_df['active'].apply(len) > 5]

## Usage example

In [None]:
from utils.character_attributes_extraction import attributes2vec

In [None]:
def _parse_list_cell(cell):
    """Turn a list that was written to CSV as text back into a list of strings.

    Strips the surrounding square brackets, removes every single quote and
    splits on ", ".
    NOTE(review): an empty list stored as "[]" comes back as [''] (one empty
    string), and apostrophes inside words are removed as well - confirm that
    downstream code expects this before changing the parser.
    """
    return cell.strip("[]").replace("'", "").split(", ")


# The three list-valued columns all need the same converter when read from CSV.
characters = pd.read_csv(
    'data/character_attributes.csv',
    index_col=0,
    converters={column: _parse_list_cell for column in ("adj", "active", "patient")},
)
characters.head()

In [None]:
# For each of the first rows, print the length of every item that
# attributes2vec returns for that character, separated by spaces.
for _, character in characters.head().iterrows():
    lengths = [str(len(item)) for item in attributes2vec(character)]
    print(" ".join(lengths))