# NLP-based Speech Analysis of DraCor Data

In [None]:
from collections import Counter
from typing import Any

import requests
import pandas as pd
import spacy
from wordcloud import WordCloud

In [None]:
#! pip install requests pandas spacy wordcloud  

# 1. Retrieve data
* Choose a corpus and a play


### 1.1 Set URL to API

In [None]:
base_url = "https://dracor.org/api/v1/"

### 1.2 Get corpus overview 

In [None]:
corpus_overview_url = base_url + "corpora/"
corpus_list = requests.get(corpus_overview_url).json()

# Abbreviations of all corpora; used later to validate the user's choice.
corpus_abbreviations = [corpus_description["name"] for corpus_description in corpus_list]

# Show one "<abbreviation>: <title>" line per corpus.
for name, corpus_description in zip(corpus_abbreviations, corpus_list):
    print(f'{name}: {corpus_description["title"]}')

### 1.3 Select corpus

In [None]:
# Ask for a corpus abbreviation, giving the user up to ten attempts.
for _ in range(10):
    corpusname = input("Please choose a corpusname from the list above. Enter the abbreviation: ")
    if corpusname in corpus_abbreviations:
        print("Success!")
        break
    print("The abbreviation you selected is not in the list. Please enter the abbreviation again.")
else:
    # No valid answer within ten attempts: fall back to the "ger" corpus.
    corpusname = "ger"

### 1.4 Get the metadata of the corpus

In [None]:
corpus_metadata_url = corpus_overview_url + corpusname + "/metadata"

# Request the metadata as CSV. With stream=True the body is not downloaded
# up front, so pandas can read it from the raw stream below.
metadata_file = requests.get(corpus_metadata_url, headers={"accept": "text/csv"}, stream=True)
# Fail here on an HTTP error instead of handing an error page to read_csv.
metadata_file.raise_for_status()
# Ask the raw stream to undo any content encoding (e.g. gzip) while reading.
metadata_file.raw.decode_content = True

# read metadata to DataFrame
metadata_df = pd.read_csv(metadata_file.raw, sep=",", encoding="utf-8")

metadata_df.head()

In [None]:
metadata_df.name

### 1.5 Select Play 

In [None]:
# Ask for a play name from the metadata, giving the user up to ten attempts.
known_play_names = metadata_df.name.unique()
for _ in range(10):
    playname = input("Please choose a playname. Enter the abbreviation: ")
    if playname in known_play_names:
        print("Success!")
        break
    print("The abbreviation you selected is not in the list. Please enter the abbreviation again.")
else:
    # No valid answer within ten attempts: fall back to a fixed play.
    playname = "goethe-faust-in-urspruenglicher-gestalt"

### 1.6 Get spoken text of the selected play in the selected corpus

In [None]:
play_url = corpus_overview_url + corpusname + "/plays/" + playname + "/spoken-text-by-character"
try:
    play_response = requests.get(play_url)
    # Treat HTTP error statuses (e.g. 404) as failures, even if the body is JSON.
    play_response.raise_for_status()
    play = play_response.json()
except (requests.exceptions.RequestException, ValueError):
    # RequestException: connection/HTTP errors; ValueError: body is not valid JSON.
    # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
    print("No spoken text could be found for this play: ", play_url)
    # Reset so later cells cannot silently reuse a play from an earlier run.
    play = []

What does the data look like?

In [None]:
type(play)

In [None]:
play[0]

In [None]:
play[0].keys()

## 2. Annotation
* Dependent on the language of the corpus, a spacy model needs to be chosen: https://spacy.io/usage/models
* No models are available for Alsatian and Bashkir
* Set spacy model name and download the model
* Load spacy model
* With spacy we can:
  * Sentencise
  * Tokenise
  * Lemmatise
  * PoS-Tag
  * Dependency Parse

### 2.0 Load spacy model 

In [None]:
spacy_model_name = "de_core_news_sm"

In [None]:
! python -m spacy download {spacy_model_name}

In [None]:
nlp = spacy.load(spacy_model_name) 

### 2.1 Annotate character speech 

In [None]:
# Join each character's speeches with newlines, run the spaCy pipeline on the
# result and store the annotation on the character under "text_annotated".
for character in play:
    character["text_annotated"] = nlp("\n".join(character["text"]))


### 2.2 Navigate the annotations

In [None]:
play[0].keys()

In [None]:
text_annotated = play[0]["text_annotated"]

In [None]:
type(text_annotated)

In [None]:
for token in text_annotated[:50]:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.head)

## 3. Analyse
* Compare the speech of characters of different sexes

1. Compare content words: adjectives, nouns, verbs
  * unique words for female, male, unknown
  * frequency comparison

2. Compare noun-adjective pairs
  * extract with dependency parses 


### 3.0 Comparison: Number of Words

In [None]:
def calculate_num_words_by_sex(play, sex) -> pd.DataFrame:
    """Sum the lengths of the annotated texts per sex value.

    Args:
        play: Iterable of character dicts. Each needs a "sex" key and a
            "text_annotated" value that supports ``len()``.
        sex: Iterable of sex values; each becomes one column of the result.
            A character whose "sex" is not among them raises ``KeyError``.

    Returns:
        A one-row DataFrame (index 0) holding the summed length per sex value.
    """
    totals = dict.fromkeys(sex, 0)
    for figure in play:
        totals[figure["sex"]] += len(figure["text_annotated"])
    return pd.DataFrame(totals, index=[0])

In [None]:
# potentially fix wrong metadata
possible_male_values = ["MALE ", "MAE"]
for character in play:
    if character["sex"] in possible_male_values:
        character["sex"] = "MALE"

In [None]:
# Distinct "sex" values that occur among the characters of the play.
unique_sex_values = {entry["sex"] for entry in play}
unique_sex_values

In [None]:
num_words_by_sex = calculate_num_words_by_sex(play, unique_sex_values)

In [None]:
num_words_by_sex

### 3.1 Collect female and male content words and compute frequencies 
1. Identify PoS tags for content words: nouns, verbs, adjectives (adverbs)
2. Filter characters' speech by content words
3. Count words

__Result__


| Sex | Male | Female | Unknown |
|--------|--------|--------|--------|
| Word 1   | frq1   | frq1   | frq 1  |
| Word 2 | frq2   | frq2   | frq 2  |
| Word 3 | frq3   | frq3   | frq 3  |

In [None]:
def get_lemmata_by_pos(annotated_text: spacy.tokens.doc.Doc, 
                       pos_tags:list[str]=['NOUN', 'VERB', 'ADJ', 'ADV']) -> list[str]:
    """Return the lemmata of all tokens whose PoS tag is in ``pos_tags``.

    Args:
        annotated_text: Annotated text; iterated token by token, reading each
            token's ``pos_`` and ``lemma_``.
        pos_tags: PoS tags to keep. The default list is only read, never
            mutated.

    Returns:
        The matching lemmata in order of occurrence, duplicates included.
    """
    return [token.lemma_ for token in annotated_text if token.pos_ in pos_tags]

In [None]:
def calculate_lemma_frq_by_sex(play:list[dict[str, Any]],
                               sex: set, pos_tags:list[str]=['NOUN', 'VERB', 'ADJ', 'ADV']) -> pd.DataFrame:
    """Count lemma frequencies of the selected PoS tags per sex value.

    Args:
        play: Character dicts with a "sex" key and a "text_annotated" value
            accepted by ``get_lemmata_by_pos``.
        sex: Sex values to report; each becomes one column of the result.
        pos_tags: PoS tags whose lemmata are counted. The default list is
            only read, never mutated.

    Returns:
        A DataFrame indexed by lemma with one column per sex value. Lemmata
        that a sex value never uses have frequency 0.

    Raises:
        KeyError: If a character with at least one matching lemma has a
            "sex" value that is not in ``sex``.
    """
    frequencies = {entry: Counter() for entry in sex}
    for character in play:
        lemmata = get_lemmata_by_pos(character["text_annotated"], pos_tags)
        if lemmata:
            # Counter.update adds the occurrences to the running totals.
            frequencies[character["sex"]].update(lemmata)
    # Convert to plain dicts so the DataFrame is built from a dict of dicts.
    return pd.DataFrame({entry: dict(counts) for entry, counts in frequencies.items()}).fillna(0)

In [None]:
frq_df = calculate_lemma_frq_by_sex(play, unique_sex_values)
len(frq_df)

In [None]:
frq_df.head()

### Calculate relative frequencies 

In [None]:
# Relative frequency = absolute lemma count / total count for that sex value.
frq_rel_df = pd.DataFrame()
for entry in unique_sex_values:
    total_for_entry = num_words_by_sex[entry].item()
    frq_rel_df[entry] = frq_df[entry] / total_for_entry

In [None]:
frq_rel_df.head()

### 3.2 Compare Male and Female unique words
* Which words are mentioned only by male / female characters? 

In [None]:
# Select rows of the relative frequencies using masks built from the absolute
# counts. Words used by UNKNOWN characters are not excluded.
male_counts = frq_df["MALE"]
female_counts = frq_df["FEMALE"]
men_only = frq_rel_df[(male_counts > 0) & (female_counts == 0)]
women_only = frq_rel_df[(female_counts > 0) & (male_counts == 0)]

In [None]:
women_only.FEMALE.sort_values(ascending=False)[:50]

In [None]:
wc = WordCloud().generate_from_frequencies(women_only.FEMALE)

In [None]:
wc.to_image()

In [None]:
men_only.MALE.sort_values(ascending=False)[:50]

In [None]:
wc = WordCloud().generate_from_frequencies(men_only.MALE)
wc.to_image()

### 3.3 Greatest difference 
* What are the words with the biggest difference in frequency?

In [None]:
frq_rel_df["MALE_FEMALE"] = frq_rel_df.MALE - frq_rel_df.FEMALE

In [None]:
frq_rel_df["FEMALE_MALE"] = frq_rel_df["MALE_FEMALE"] * -1

In [None]:
frq_rel_df.MALE_FEMALE.sort_values()

In [None]:
frq_rel_df.MALE_FEMALE.nlargest(10)

In [None]:
wc = WordCloud().generate_from_frequencies(frq_rel_df.MALE_FEMALE)

In [None]:
wc.to_image()

In [None]:
wc = WordCloud().generate_from_frequencies(frq_rel_df.FEMALE_MALE)
wc.to_image()

### 3.4 Noun-adjective pairs 
* Extract adjective-noun pairs by dependency parses 

In [None]:
def get_adj_noun_pairs(play: list[dict[str, Any]], unique_sex_values: set[str]={'FEMALE', 'MALE', 'UNKNOWN'}) -> dict[str, list[tuple]]:
    """Collect (adjective lemma, noun lemma) pairs per sex value.

    A pair is recorded for every token tagged ``ADJ`` whose dependency head
    is tagged ``NOUN``.

    Args:
        play: Character dicts with a "sex" key and a "text_annotated" value
            that yields tokens exposing ``pos_``, ``lemma_`` and ``head``.
        unique_sex_values: Sex values that always get an entry in the result,
            even if no pair was found for them. The default set is only
            read, never mutated.

    Returns:
        Mapping from sex value to its pairs in order of occurrence. A sex
        value outside ``unique_sex_values`` gets its own entry once a pair is
        found for it, instead of raising ``KeyError``.
    """
    adj_noun_pairs_by_sex = {key: [] for key in unique_sex_values}
    for character in play:
        pairs = [
            (token.lemma_, token.head.lemma_)
            for token in character["text_annotated"]
            if token.pos_ == "ADJ" and token.head.pos_ == "NOUN"
        ]
        if pairs:
            # setdefault tolerates sex values missing from unique_sex_values
            # (e.g. unnormalised metadata) rather than failing on them.
            adj_noun_pairs_by_sex.setdefault(character["sex"], []).extend(pairs)
    return adj_noun_pairs_by_sex

In [None]:
adj_noun_pairs_by_sex = get_adj_noun_pairs(play)

In [None]:
Counter(adj_noun_pairs_by_sex["MALE"]).most_common(30)

In [None]:
Counter(adj_noun_pairs_by_sex["FEMALE"]).most_common(30)

## 4. Extend to corpus
1. Retrieve spoken text for each play
2. Annotate plays
3. Perform analyses

In [None]:
play[0].keys()

### 4.1 Get spoken text of all plays in the selected corpus

In [None]:
plays = []
for name in metadata_df.name:
    play_url = corpus_overview_url + corpusname + "/plays/" + name + "/spoken-text-by-character"
    try:
        play_response = requests.get(play_url)
        # Treat HTTP error statuses as failures, even if the body is JSON.
        play_response.raise_for_status()
        play_info = play_response.json()
    except (requests.exceptions.RequestException, ValueError):
        # RequestException: connection/HTTP errors; ValueError: invalid JSON.
        # KeyboardInterrupt is not caught, so the long download can be aborted.
        print("No spoken text could be found for this play: ", play_url)
        continue
    # Record on every character which play it belongs to.
    for character in play_info:
        character["play_name"] = name
    plays.append(play_info)

### 4.2 Annotate texts with spacy 

In [None]:
# for faster processing with spacy
disable_components = ['ner', 'sentencizer']

In [None]:
# One newline-joined text per character, in play order then character order.
all_texts = [
    "\n".join(character["text"])
    for play_info in plays
    for character in play_info
]

In [None]:
len(all_texts)

In [None]:
# NOTE: only the first five texts are annotated (``all_texts[:5]``), so at
# most five annotations are produced — presumably a limit to keep the demo
# fast; drop the slice to annotate every character of the corpus.
annotated_texts = list(nlp.pipe(all_texts[:5], disable=disable_components))

In [None]:
# unpack characters 
character_list = [character for play in plays for character in play]

annotated_characters = []
for character_info, annotation in zip(character_list, annotated_texts):
    annotated_play = {
        **character_info,
        'text_annotated': annotation
    }
    annotated_characters.append(annotated_play)

### 4.3 Analysis – it's up to you!