# Analyse Harry Potter 4 subtitles

In [1]:
%load_ext autoreload
%autoreload 2

## Import packages

In [2]:
%%time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bechdelai.processing.process_srt import create_txt_blocks, load_srt
from bechdelai.processing.analyse_srt import extract_keyword, word_is_synonym_of_person

2022-06-14 23:07:57,063 loading file C:\Users\natha\.flair\models\ner-english\4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-06-14 23:08:00,345 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
CPU times: total: 17.8 s
Wall time: 29.5 s


In [3]:
# Raise the pandas display limit so that up to 100 rows are shown.
pd.set_option('display.max_rows', 100)

## Load data

In [4]:
# Path (relative to this notebook) of the subtitle file to analyse.
fpath = "../../../data/srt/harry_potter_4.srt"

In [5]:
# Load the subtitle entries; each one is later read through its
# `.text`, `.start`, `.end` and `.duration` attributes.
srt_list = load_srt(fpath)

## Process SRT into text blocks

In [6]:
# Group the subtitle entries into blocks of text.
# NOTE(review): `second_gap` presumably is the gap (in seconds) tolerated
# between two subtitles of the same block -- TODO confirm in process_srt.
srt_blocks = create_txt_blocks(srt_list, second_gap=0)

In [7]:
# Preview the first three text blocks.
srt_blocks[:3]

["Bloody kids. How fastidious you've become, Wormtail.",
 'As I recall, you once called the nearest gutter pipe home.',
 'Could it be that the task of nursing me has become wearisome for you?']

## Analyse: detect entities and their gender

In [8]:
import json

In [59]:
# Path (relative to this notebook) of the JSON dictionary of person synonyms.
dict_path = "../../../data/dictionary/syn_person.json"

In [60]:
# Load the dictionary of person synonyms.
# The encoding is given explicitly: without it, `open` falls back to the
# platform default (e.g. cp1252 on Windows), which can fail on or corrupt
# non-ASCII characters in the JSON file.
with open(dict_path, "r", encoding="utf-8") as f:
    person_syn_dict = json.load(f)

In [12]:
from bechdelai.scripts.scrap_dictionary import person_syn

In [63]:
import re

def def_with_person_syn(definition):
    """Return whether a definition starts by describing a person.

    The definition matches when it begins with "a" or "an", optionally
    followed by a single word made of ASCII letters, and then one of the
    entries of the ``person_syn`` collection imported in this notebook.

    Parameters
    ----------
    definition : str
        Dictionary definition to test.

    Returns
    -------
    bool
        True if the beginning of ``definition`` matches the pattern.
    """
    # Escape every synonym so it is always matched literally, even if it
    # contains characters that are special in a regular expression.
    words_syn = "|".join(re.escape(word) for word in person_syn)
    # Raw f-string: in a normal string literal "\s" is an invalid escape
    # sequence, which triggers a warning on recent Python versions.
    pattern = rf"^(a|an)((\s)|(\s[a-zA-Z]*\s))({words_syn})"

    return bool(re.match(pattern, definition))

def find_word_gender(definition):
    """Guess the gender described by a dictionary definition.

    Parameters
    ----------
    definition : str
        Dictionary definition to inspect.

    Returns
    -------
    str
        "woman" if the definition starts with "a girl" or "a woman",
        "man" if it starts with "a boy" or "a man", "unknown" otherwise.
    """
    # The trailing "\b" requires the gendered word to be complete, so that
    # "a manager" or "a boycott" are not classified as "man".
    woman_pat = r"^a (girl|woman)\b"
    man_pat = r"^a (boy|man)\b"

    if re.match(woman_pat, definition):
        return "woman"
    if re.match(man_pat, definition):
        return "man"
    return "unknown"

def process_syn_dict(person_syn_dict):
    """Keep the entries that describe a person and tag them with a gender.

    For every entry, only the first definition of the first value of its
    "def" mapping is considered. The entry is kept when
    ``def_with_person_syn`` accepts that definition, and a "gender" key
    computed by ``find_word_gender`` is added to it.

    The input dictionary and its entries are left untouched: each kept
    entry is a shallow copy of the original one.

    Parameters
    ----------
    person_syn_dict : dict
        Mapping from a word to an entry whose "def" key is a mapping of
        sequences of definitions.

    Returns
    -------
    dict
        New mapping restricted to the kept words, each entry having an
        additional "gender" key.
    """
    processed = {}

    for word, entry in person_syn_dict.items():
        definitions = list(entry["def"].values())
        # Skip entries without any definition instead of failing on them.
        if not definitions or not definitions[0]:
            continue

        first_def = definitions[0][0]
        if not def_with_person_syn(first_def):
            continue

        # Copy the entry so the caller's dictionary is not modified.
        processed[word] = {**entry, "gender": find_word_gender(first_def)}

    return processed

# Dictionary restricted to words whose first definition describes a person,
# each entry carrying a "gender" key ("woman", "man" or "unknown").
person_syn_dict_ = process_syn_dict(person_syn_dict)

In [82]:
def time_to_sec(t):
    """Convert a time object into a number of seconds.

    Only the ``hour``, ``minute`` and ``second`` attributes of ``t`` are
    used; any finer resolution is ignored.
    """
    return t.hour * 3600 + t.minute * 60 + t.second

In [86]:
%%time
# For every subtitle mentioning at least one entity, store its timing (in
# seconds) together with the detected entities and their guessed gender.
results = []

for srt in srt_list:
    entities = []

    for keyword in extract_keyword(srt.text):
        # Index 0 holds the keyword text, index 3 the tag tested below.
        text, tag = keyword[0], keyword[3]

        if tag == "NOUN":
            # A noun is kept only when it is recognised as a synonym of
            # person; the returned value is the key in the dictionary.
            matched = word_is_synonym_of_person(text, person_syn_dict_)
            if not matched:
                continue
            gender = person_syn_dict_[matched]["gender"]
        elif tag == "PRON":
            pronoun = text.lower()
            if pronoun in ('he', 'him', 'his', 'himself'):
                gender = "man"
            elif pronoun in ('she', 'her', 'hers', 'herself'):
                gender = "woman"
            else:
                gender = "unknown"
        else:
            # Any other keyword is kept without a gender guess.
            gender = "unknown"

        entities.append({"entity": text, "gender": gender})

    # Subtitles without any entity are not reported.
    if not entities:
        continue

    results.append({
        "start_sec": time_to_sec(srt.start.to_time()),
        "end_sec": time_to_sec(srt.end.to_time()),
        "duration": time_to_sec(srt.duration.to_time()),
        "entities": entities
    })

CPU times: total: 1h 4min 18s
Wall time: 8min 6s


In [96]:
# Print, for each subtitle kept in `results`, its time span and the
# entities found in it with their detected gender.
for res in results:
    header = f"===== from {res['start_sec']}s to {res['end_sec']}s ====="
    print(header)
    print("Find following entities:")

    for found in res["entities"]:
        # Entities may span several subtitle lines: show them on one line.
        label = found['entity'].replace('\n', ' ')
        gender_tag = found['gender'].upper()
        print(f"\t- {label} (detected gender `{gender_tag}`)")

    print()

===== from 155s to 158s =====
Find following entities:
	- Wormtail (detected gender `UNKNOWN`)
	- Wormtail (detected gender `UNKNOWN`)

===== from 163s to 168s =====
Find following entities:
	- it (detected gender `UNKNOWN`)

===== from 168s to 171s =====
Find following entities:
	- my Lord Voldemort (detected gender `UNKNOWN`)

===== from 172s to 175s =====
Find following entities:
	- we (detected gender `UNKNOWN`)
	- it  (detected gender `UNKNOWN`)
	- the boy (detected gender `UNKNOWN`)

===== from 175s to 178s =====
Find following entities:
	- The boy (detected gender `UNKNOWN`)
	- everything (detected gender `UNKNOWN`)

===== from 178s to 182s =====
Find following entities:
	- It (detected gender `UNKNOWN`)
	- him (detected gender `MAN`)
	- it (detected gender `UNKNOWN`)

===== from 185s to 189s =====
Find following entities:
	- Good (detected gender `UNKNOWN`)

===== from 192s to 194s =====
Find following entities:
	- them (detected gender `UNKNOWN`)

===== from 206s to 209s =====