# Analyse HP 4 subtitles

In [1]:
%load_ext autoreload
%autoreload 2

## Import packages

In [4]:
%%time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from bechdelai.processing.process_srt import create_txt_blocks, load_srt
from bechdelai.processing.dictionary import extract_person_entities_from_srt
from bechdelai.data.tmdb import get_movie_cast_from_id

CPU times: total: 0 ns
Wall time: 38.9 ms


In [3]:
pd.set_option('display.max_rows', 100)

## Load data

### srt

In [5]:
fpath = "../../../data/srt/harry_potter_4.srt"

In [6]:
srt_list = load_srt(fpath)

### TMDB

In [7]:
tmdb_id = "674"

In [23]:
cast = get_movie_cast_from_id(tmdb_id)["cast"]

## Process srt to blocks

In [6]:
srt_blocks = create_txt_blocks(srt_list, second_gap=0)

In [7]:
srt_blocks[:3]

["Bloody kids. How fastidious you've become, Wormtail.",
 'As I recall, you once called the nearest gutter pipe home.',
 'Could it be that the task of nursing me has become wearisome for you?']

## Analyse

### Load person synonyms dictionary

In [10]:
dict_path = "../../../data/dictionary/syn_person.json"

In [11]:
with open(dict_path, "r") as f:
    person_syn_dict = json.load(f)

In [12]:
# some examples
print(person_syn_dict["boy"])
print()
print(person_syn_dict["murderess"])

{'url': 'https://www.dictionary.com/browse/boy', 'def': {'noun': ['a young man who lacks maturity, judgment, etc.', "a son: Sam's oldest boy is helping him in the business.", 'Disparaging and Offensive. a term used to refer to or address a man considered by the speaker to be inferior in ethnicity, nationality, or occupational status.']}, 'gender': 'unknown'}

{'url': 'https://www.dictionary.com/browse/murderess', 'def': {'noun': ['a woman who commits murder.']}, 'gender': 'woman'}


### Extract entities corresponding to person

In [18]:
from bechdelai.processing.analyse_srt import extract_keyword


In [19]:
%%time
results = []

for srt in srt_list[:20]:
    res = extract_person_entities_from_srt(srt, person_syn_dict)
        
    if res:
        res = extract_keyword(srt.text)
        results.append(res)

CPU times: total: 1min 27s
Wall time: 11 s


In [52]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
# stopwords.words('english')

In [53]:
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

text = "Nick likes to play football, however he is not too fond of tennis."
text_tokens = word_tokenize(text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\natha/nltk_data'
    - 'C:\\Users\\natha\\apps\\miniforge3\\envs\\neuralcoref\\nltk_data'
    - 'C:\\Users\\natha\\apps\\miniforge3\\envs\\neuralcoref\\share\\nltk_data'
    - 'C:\\Users\\natha\\apps\\miniforge3\\envs\\neuralcoref\\lib\\nltk_data'
    - 'C:\\Users\\natha\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [47]:
from rapidfuzz.distance import Levenshtein
from itertools import permutations
import itertools

MIN_DIST_MATCH_CHARACTER = 2

def compute_dist_on_each_words(s1, s2):
    s1 = s1.split()
    s2 = s2.split()
    
    res = []
    for w1, w2 in itertools.product(s1, s2):
        res.append(Levenshtein.distance(w1, w2))
        
    return min(res)
    

def match_entity_with_cast(txt, cast):
    # remove stopwords from cast
    # TODO
    
    for c in cast:
        res = compute_dist_on_each_words(txt, c["character"])
        
        if res <= MIN_DIST_MATCH_CHARACTER:
            return c
        
    return None
        
    
match_entity_with_cast(results[2][0][0], cast)
    

{'adult': False,
 'gender': 2,
 'id': 5469,
 'known_for_department': 'Acting',
 'name': 'Ralph Fiennes',
 'original_name': 'Ralph Fiennes',
 'popularity': 36.721,
 'profile_path': '/tJr9GcmGNHhLVVEH3i7QYbj6hBi.jpg',
 'cast_id': 4,
 'character': 'Lord Voldemort',
 'credit_id': '52fe4268c3a36847f801c229',
 'order': 4}

In [48]:
results[2]

[('my Lord Voldemort', 8, 11, 'PROPN')]

In [16]:
ent_

'Harry !'

In [14]:
for res in results:
    print(f"===== from {res['start_sec']}s to {res['end_sec']}s =====")
    print("Find following entities:")
    for ent in res["entities"]:
        ent_ = ent['entity'].replace('\n', ' ')
        print(f"\t- {ent_} (detected gender `{ent['gender'].upper()}`)")
    print()

===== from 155s to 158s =====
Find following entities:
	- Wormtail (detected gender `UNKNOWN`)
	- Wormtail (detected gender `UNKNOWN`)

===== from 163s to 168s =====
Find following entities:
	- it (detected gender `UNKNOWN`)

===== from 168s to 171s =====
Find following entities:
	- my Lord Voldemort (detected gender `UNKNOWN`)

===== from 172s to 175s =====
Find following entities:
	- we (detected gender `UNKNOWN`)
	- it  (detected gender `UNKNOWN`)
	- the boy (detected gender `UNKNOWN`)

===== from 175s to 178s =====
Find following entities:
	- The boy (detected gender `UNKNOWN`)
	- everything (detected gender `UNKNOWN`)

===== from 178s to 182s =====
Find following entities:
	- It (detected gender `UNKNOWN`)
	- him (detected gender `MAN`)
	- it (detected gender `UNKNOWN`)

===== from 185s to 189s =====
Find following entities:
	- Good (detected gender `UNKNOWN`)

===== from 189s to 192s =====
Find following entities:
	- our old comrades (detected gender `UNKNOWN`)

===== from 192s t

The end.