# Speech Analysis of DraCor Data

In [None]:
import requests
import pandas as pd
import spacy
from tqdm import tqdm
from collections import Counter
from wordcloud import WordCloud

# 1. Retrieve data
* Choose a corpus and a play


### 1.1 Set URL to API

In [None]:
base_url = "https://dracor.org/api/v1/"

### 1.2 Get corpus overview 

In [None]:
# Fetch the list of corpora from the API, print one "abbreviation: title" line
# per corpus and collect the abbreviations in `corpus_abbreviations`.
corpus_overview_url = f"{base_url}corpora/"
corpus_list = requests.get(corpus_overview_url).json()

corpus_abbreviations = []

for corpus_description in corpus_list:
    name, title = corpus_description["name"], corpus_description["title"]
    print(f"{name}: {title}")
    corpus_abbreviations.append(name)

### 1.3 Select corpus

In [None]:
# Ask for a corpus abbreviation, allowing up to 10 attempts. The `else` branch of
# the `for` loop only runs when no attempt succeeded (the loop never hit `break`).
for attempt in range(10):
    # Strip surrounding whitespace so an accidental space does not invalidate
    # an otherwise correct abbreviation.
    corpusname = input("Please choose a corpusname from the list above. Enter the abbreviation: ").strip()
    if corpusname in corpus_abbreviations:
        print("Success!")
        break
    print("The abbreviation you selected is not in the list. Please enter the abbreviation again.")
else:
    # Fall back to the default corpus and say so, instead of silently continuing
    # with a corpus the user never chose.
    corpusname = "ger"
    print(f'No valid abbreviation entered; falling back to "{corpusname}".')

### 1.4 Get the metadata of the corpus

In [None]:
corpus_metadata_url = corpus_overview_url + corpusname + "/metadata"

# Request the metadata as CSV. The body is streamed, so the response is used as a
# context manager to make sure the connection is released once the CSV is parsed.
with requests.get(corpus_metadata_url, headers={"accept": "text/csv"}, stream=True) as metadata_file:
    # Fail on an HTTP error status instead of parsing an error body as CSV.
    metadata_file.raise_for_status()
    # Have the raw stream decode any compressed transfer encoding while pandas reads it.
    metadata_file.raw.decode_content = True

    # read metadata to DataFrame
    metadata_df = pd.read_csv(metadata_file.raw, sep=",", encoding="utf-8")

metadata_df.head()

### 1.5 Select Play 

In [None]:
# Names of all plays listed in the corpus metadata; computed once rather than on
# every attempt.
available_playnames = set(metadata_df["name"].unique())

# Ask for a play name, allowing up to 10 attempts. The `else` branch of the `for`
# loop only runs when no attempt succeeded (the loop never hit `break`).
for attempt in range(10):
    # Strip surrounding whitespace so an accidental space does not invalidate
    # an otherwise correct name.
    playname = input("Please choose a playname. Enter the abbreviation: ").strip()
    if playname in available_playnames:
        print("Success!")
        break
    print("The abbreviation you selected is not in the list. Please enter the abbreviation again.")
else:
    # Fall back to the default play and say so, instead of silently continuing
    # with a play the user never chose.
    playname = "goethe-faust-in-urspruenglicher-gestalt"
    print(f'No valid play name entered; falling back to "{playname}".')

### 1.6 Get the spoken text of the selected play, grouped by character

In [None]:
# Download the spoken text of the selected play, grouped by character.
play_url = corpus_overview_url + corpusname + "/plays/" + playname + "/spoken-text-by-character"
try:
    response = requests.get(play_url)
    # Treat HTTP error statuses as failures instead of trying to parse an error body.
    response.raise_for_status()
    play = response.json()
except (requests.RequestException, ValueError):
    # Network/HTTP problems raise RequestException; a body that is not valid JSON
    # raises a ValueError subclass. Re-raise after the hint so later cells do not
    # continue with an undefined (or stale) `play`.
    print("No spoken text could be found for this play: ", play_url)
    raise

What does the data look like?

In [None]:
play[0]

In [None]:
play[0].keys()

## 2. Annotation
* Depending on the language of the corpus, a suitable spaCy model needs to be chosen
* No models are available for Alsatian and Bashkir
* Set spacy model name and download the model
* Load spacy model
* With spacy we can:
  * Tokenise
  * Lemmatise
  * PoS-Tag
  * Dependency Parse

In [None]:
# Name of the spaCy pipeline used for the annotation below. This is a German
# model; change it when a corpus in another language was selected.
spacy_model_name = "de_core_news_sm"

In [None]:
# Download through spaCy's Python API so the model is installed for the interpreter
# running this kernel; a shell `python` may resolve to a different environment.
import spacy.cli

spacy.cli.download(spacy_model_name)

In [None]:
nlp = spacy.load(spacy_model_name) 

In [None]:
# Run the spaCy pipeline over each character's speech: the character's text
# segments are joined with newlines and the resulting document is stored on the
# character entry under "text_annotated".
for character in play:
    character["text_annotated"] = nlp("\n".join(character["text"]))


### Navigate the annotations

In [None]:
text_annotated = play[0]["text_annotated"]

In [None]:
type(text_annotated)

In [None]:
for token in text_annotated[:20]:
    print(token.text, token.lemma_, token.pos_, token.head)

## 3. Analyse
* Compare the speech of characters of different genders

1. Compare content words: adjectives, nouns, verbs
  * unique words for female, male, unknown
  * frequency comparison

2. Compare noun-adjective pairs
  * extract with dependency parses 


### 3.0 Compare the number of words per gender

In [None]:
def calculate_num_words_by_gender(play, gender) -> pd.DataFrame:
    """Sum the length of the annotated text per gender.

    Args:
        play: Iterable of character entries; each provides a ``"gender"`` value
            and a ``"text_annotated"`` object that supports ``len()``.
        gender: Iterable of gender labels; each becomes a column starting at 0.
            A character whose gender is not among them raises ``KeyError``.

    Returns:
        A single-row DataFrame (index ``0``) with one column per gender label
        holding the summed lengths.
    """
    totals = dict.fromkeys(gender, 0)
    for speaker in play:
        label = speaker["gender"]
        totals[label] = totals[label] + len(speaker["text_annotated"])
    return pd.DataFrame(totals, index=[0])

In [None]:
# Distinct gender labels that occur among the characters of the play.
unique_gender_values = {entry["gender"] for entry in play}
unique_gender_values

In [None]:
num_words_by_gender = calculate_num_words_by_gender(play, unique_gender_values)

In [None]:
num_words_by_gender

### 3.1 Collect female and male content words and frequencies 

| Gender | Male | Female | Unknown |
|--------|--------|--------|--------|
| Word 1   | frq1   | frq1   | frq 1  |
| Word 2 | frq2   | frq2   | frq 2  |
| Word 3 | frq3   | frq3   | frq 3  |

In [None]:
def get_lemmata_by_pos(annotated_text: "spacy.tokens.doc.Doc",
                       pos_tags=('NOUN', 'VERB', 'ADJ')) -> list[str]:
    """Return the lemmata of all tokens whose part-of-speech tag is selected.

    Args:
        annotated_text: Iterable of tokens exposing ``pos_`` and ``lemma_``.
        pos_tags: Part-of-speech tags to keep. Any container supporting ``in``
            works; the default is an immutable tuple so it cannot be modified
            between calls.

    Returns:
        The lemmata of the matching tokens, in document order and with
        repetitions preserved.
    """
    return [token.lemma_ for token in annotated_text if token.pos_ in pos_tags]
    

In [None]:
def calculate_lemma_frq_by_gender(play: list[dict], gender: set) -> pd.DataFrame:
    """Count content-word lemmata per gender.

    Args:
        play: Character entries; each provides a ``"gender"`` value and a
            ``"text_annotated"`` document that is passed to
            ``get_lemmata_by_pos`` with its default part-of-speech tags.
        gender: Gender labels to create a column for. A character whose gender
            is not among them raises ``KeyError``.

    Returns:
        A DataFrame indexed by lemma with one column per gender label holding
        absolute frequencies; lemmata a gender never uses are filled with 0.
    """
    frequencies = {entry: Counter() for entry in gender}
    for character in play:
        # Counter.update adds each lemma's occurrences to the running totals of
        # the character's gender.
        frequencies[character["gender"]].update(
            get_lemmata_by_pos(character["text_annotated"])
        )
    per_gender_counts = {label: dict(counts) for label, counts in frequencies.items()}
    return pd.DataFrame(per_gender_counts).fillna(0)
                

In [None]:
frq_df = calculate_lemma_frq_by_gender(play, unique_gender_values)
len(frq_df)

In [None]:
frq_df.head()

In [None]:
frq_df.MALE.div(num_words_by_gender.MALE.item())

In [None]:
num_words_by_gender.MALE.item()

### Relative frequencies 

In [None]:
# Relative frequencies: divide each gender's absolute lemma counts by the value
# stored for that gender in `num_words_by_gender`.
frq_rel_df = pd.DataFrame(
    {
        entry: frq_df[entry].div(num_words_by_gender[entry].item())
        for entry in unique_gender_values
    }
)

In [None]:
frq_rel_df

### 3.2 Compare Male and Female unique words
* Which words are mentioned only by men / women? 

In [None]:
pip install wordcloud

In [None]:
# A lemma is exclusive to one gender when that gender uses it and the counts in all
# other gender columns add up to zero. Summing the remaining columns avoids
# hard-coding `UNKNOWN`, which is missing from `frq_df` when the play has no
# character with that gender.
used_by_non_male = frq_df.drop(columns="MALE").sum(axis=1) > 0
used_by_non_female = frq_df.drop(columns="FEMALE").sum(axis=1) > 0
men_only = frq_rel_df[(frq_df.MALE > 0) & ~used_by_non_male]
women_only = frq_rel_df[(frq_df.FEMALE > 0) & ~used_by_non_female]

In [None]:
women_only.FEMALE

In [None]:
wc = WordCloud().generate_from_frequencies(women_only.FEMALE)

In [None]:
wc.to_image()

In [None]:
men_only

In [None]:
wc = WordCloud().generate_from_frequencies(men_only.MALE)
wc.to_image()

### 3.3 Greatest difference 
* What are the words with the biggest difference in frequency?

In [None]:
frq_rel_df["MALE_FEMALE"] = frq_rel_df.MALE - frq_rel_df.FEMALE

In [None]:
frq_rel_df["FEMALE_MALE"] = frq_rel_df["MALE_FEMALE"] * -1

In [None]:
frq_rel_df.MALE_FEMALE.sort_values()

In [None]:
# Absolute frequencies of the lemma "groß" per gender (the key was mis-encoded).
frq_df.loc["groß"]

In [None]:
frq_rel_df.MALE_FEMALE.nlargest(10)

In [None]:
# The difference column lives in `frq_rel_df`, not in `frq_df`. Only lemmata that
# are relatively more frequent in male speech (positive difference) are plotted,
# because a word cloud scales words by non-negative frequencies.
male_leaning = frq_rel_df.MALE_FEMALE[frq_rel_df.MALE_FEMALE > 0]
wc = WordCloud().generate_from_frequencies(male_leaning)

In [None]:
wc.to_image()

In [None]:
# The difference column lives in `frq_rel_df`, not in `frq_df`. Only lemmata that
# are relatively more frequent in female speech (positive difference) are plotted,
# because a word cloud scales words by non-negative frequencies.
female_leaning = frq_rel_df.FEMALE_MALE[frq_rel_df.FEMALE_MALE > 0]
wc = WordCloud().generate_from_frequencies(female_leaning)
wc.to_image()

### 3.4 Noun-adjective pairs 
* Extract adjective-noun pairs by dependency parses 

In [None]:
def get_adj_noun_pairs(play, unique_gender_values):
    """Collect (adjective lemma, noun lemma) pairs per gender.

    A pair is recorded for every token tagged ``ADJ`` whose syntactic head is
    tagged ``NOUN``.

    Args:
        play: Iterable of character entries; each provides a ``"gender"`` value
            and an iterable ``"text_annotated"`` of tokens exposing ``pos_``,
            ``lemma_`` and ``head``.
        unique_gender_values: Gender labels; each gets a (possibly empty) list
            in the result.

    Returns:
        A dict mapping each gender label to its list of
        ``(adjective_lemma, noun_lemma)`` tuples, in order of occurrence.
    """
    pairs_by_gender = {label: [] for label in unique_gender_values}

    for speaker in play:
        found = [
            (tok.lemma_, tok.head.lemma_)
            for tok in speaker["text_annotated"]
            if tok.pos_ == "ADJ" and tok.head.pos_ == "NOUN"
        ]
        # Only touch the gender entry when this speaker contributed a pair.
        if found:
            pairs_by_gender[speaker["gender"]].extend(found)
    return pairs_by_gender

In [None]:
adj_noun_pairs_by_gender = get_adj_noun_pairs(play, unique_gender_values)

In [None]:
Counter(adj_noun_pairs_by_gender["MALE"]).most_common()

In [None]:
Counter(adj_noun_pairs_by_gender["FEMALE"]).most_common()