In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
from utils.character_attributes_extraction import character_names_from_text, character_attributes_from_text, character_active_verbs_from_text, character_patient_verbs_from_text

2023-12-22 23:55:19.044645: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-22 23:55:19.099102: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the raw plot summaries. The file is read as tab-separated text and the
# two column names are supplied explicitly rather than taken from the file.
plots = pd.read_csv(
    'data/MovieSummaries/plot_summaries.txt',
    names=['wiki_id', 'plot'],
    sep='\t',
)

# Normalise whitespace: str.split() with no argument splits on any run of
# whitespace, so re-joining with single spaces collapses runs and trims both ends.
plots['plot'] = plots['plot'].map(lambda summary: ' '.join(summary.split()))

plots.head(5)

Unnamed: 0,wiki_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six year...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


# Extract character names

## Spacy

For attribute extraction we will use [spaCy](https://spacy.io/usage/linguistic-features), a library that makes many NLP tasks easy.
We will primarily employ named entity recognition and dependency parsing. Additionally, for embeddings, we will utilize the built-in word2vec model.

In [5]:
import spacy

# Load the `en_core_web_md` spaCy pipeline; the resulting `nlp` object is called on raw text to produce a parsed document.
nlp = spacy.load("en_core_web_md")

In [6]:
# Parse a single example sentence so the pipeline's output can be inspected in the next cell.
doc = nlp("As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into an enormous insect.")

In [7]:
from spacy import displacy
# Draw the dependency parse (style='dep') of the example sentence held in `doc`.
displacy.render(doc, style='dep')

In [8]:
# Pick one plot summary (row label 51) as a small worked example for the
# extraction helpers. A single .loc[row, column] lookup returns the cell
# directly instead of building an intermediate row Series first.
plot = plots.loc[51, 'plot']
print(plot)

A farmer is busy with hoeing while his son Porky is ploughing the fields with his horse Dobbin. Hank Horsefly speeds up the process. The farmer and Porky are about to take a turn for the worst as Mr. Viper The Snake comes with a Mortgage form ready to evict them unless a sum of rent money is paid. Porky applies for a job as horse driving milkman with a strict condition not to break a single bottle. Porky is doing well until Hank having followed their trail, sends Dobbin going at full speed and crashing, causing all the milk bottles to break. As Porky despairs, Dobbin accidentally enters a horse race. When the race starts, Dobbin isn't getting far, until Hank kick starts Dobbin to overtake every racer and wins a $40,000 prize. Porky makes it to the farm in the nick of time, riding in a roofless limo. Porky pays the owed money to Mr. Viper and Hank gives him a kick.


In [9]:
# Attributes found for characters in the sample plot; the displayed result is a dict keyed by character name.
character_attributes_from_text(plot)

{'Dobbin': ['his']}

In [10]:
# Verbs each character performs in the sample plot; the displayed result is a dict keyed by character name.
character_active_verbs_from_text(plot)

{'Hank Horsefly': ['speed', 'follow'], 'Dobbin': ['enter', 'get']}

Extracting features for characters in each movie.

In [None]:
%%script false --no-raise-error
# This cell is deliberately skipped by the `%%script false` magic above:
# a full run took 5 hours 22 minutes.

# One record per (movie, character); the DataFrame is built once at the end
# rather than being grown row by row.
character_list = []

# Feed the row iterator straight to tqdm instead of first copying every row
# into a list; `total` lets the progress bar still show percentage and ETA.
for _index, row in tqdm(plots.iterrows(), total=len(plots)):
    plot = row['plot']
    character_names = character_names_from_text(plot)
    character_attributes = character_attributes_from_text(plot)
    character_active_verbs = character_active_verbs_from_text(plot)
    character_patient_verbs = character_patient_verbs_from_text(plot)
    for name in character_names:
        # A character missing from one of the extraction results gets an empty list.
        character_list.append(
            {
                'wiki_id': row['wiki_id'],
                'character': name,
                'adj': character_attributes.get(name, []),
                'active': character_active_verbs.get(name, []),
                'patient': character_patient_verbs.get(name, []),
            }
        )

character_df = pd.DataFrame(character_list)
character_df.head()

In [None]:
%%script false --no-raise-error

# Persist the extracted features so the long-running extraction cell does not have to be re-run.
character_df.to_csv('data/character_attributes.csv')

In [None]:
%%script false --no-raise-error

# Show only the characters that have more than five active verbs.
character_df.loc[character_df['active'].apply(len).gt(5)]

## Usage example

In [11]:
from utils.character_attributes_extraction import attributes2vec

In [12]:
def _parse_list_cell(cell):
    """Convert a list column cell stored as text (e.g. "['a', 'b']") back into a list.

    The cell is parsed as a Python literal, so an empty list "[]" becomes []
    (plain splitting would yield ['']), and items containing apostrophes or
    commas survive intact.

    Parameters
    ----------
    cell : str
        Raw cell text handed over by ``pd.read_csv``.

    Returns
    -------
    list
        The parsed items; an empty list for a blank cell or "[]".
    """
    text = cell.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to plain splitting, dropping empty items.
        return [item for item in text.strip("[]").replace("'", "").split(", ") if item]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A bare scalar is treated as a one-item list.
    return [parsed]


characters = pd.read_csv(
    'data/character_attributes.csv',
    index_col=0,
    # List columns are stored as text in the CSV and must be parsed back into lists.
    converters={column: _parse_list_cell for column in ("adj", "active", "patient")},
)
characters.head()

Unnamed: 0,wiki_id,character,adj,active,patient
0,31186339,Primrose Everdeen,[old],[choose],[]
1,31186339,Caesar Flickerman,[],[],[]
2,31186339,Peeta Mellark,[son],"[take, reveal, mean, form, present, beg, tell]",[force]
3,31186339,Cato,[],[kill],"[encounter, wound, shoot]"
4,31186339,Snow,[],"[summon, consider]",[]


In [13]:
# For the first few characters, print how many items attributes2vec returns in
# each of its result groups, space-separated on one line per character.
for _row_label, character_row in characters.head().iterrows():
    group_sizes = [str(len(group)) for group in attributes2vec(character_row)]
    print(" ".join(group_sizes))

1 1 0
0 0 0
1 7 1
0 1 3
0 2 0
