<a href="https://colab.research.google.com/github/datamaunz/sophoclesApp/blob/main/bootstrap_lemma_distributions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re, os
from collections import Counter

## Functions

### Functions (lemmatization)

In [None]:
def return_lemma(word):
  """Return the lemma of a single word.

  The word is passed through ``cltk_normalize`` and handed to
  ``lemmatizer.lemmatize`` as a one-element list; the last item of the
  first result entry is returned.

  NOTE(review): ``lemmatizer`` and ``cltk_normalize`` are not defined in the
  visible part of this notebook; they must already exist in the global
  namespace when this function is called.
  """
  lemmatize = lemmatizer.lemmatize
  results = lemmatize([cltk_normalize(word)])
  first_entry = results[0]
  return first_entry[-1]

def remove_punctuation(text, exceptions=None):
    """
    Return a string with punctuation removed.

    Every character that is neither a regex word character nor regex
    whitespace is deleted, unless it is listed in ``exceptions``.

    Parameters:
        text (str): The text to remove punctuation from.
        exceptions (list): List of symbols to keep in the given text.
            Each symbol is treated literally, so characters such as
            '-', ']' or '^' are safe to pass.

    Return:
        str: The input text without the punctuation.
    """

    keep = [
        r'\w',
        r'\s'
    ]

    if exceptions is not None:
        # Escape every symbol: unescaped, a '-' between two entries is read
        # as a range, ']' closes the character class early and '^' or a
        # backslash change the meaning of the pattern.
        keep.extend(re.escape(symbol) for symbol in exceptions)

    pattern = '[^{}]'.format(''.join(keep))

    return re.sub(pattern, '', text)

def add_lemmata_column(df):
  """Add a ``lemmata`` column with the space-joined lemmata of each verse.

  For every value of the ``Speech`` column the punctuation is stripped with
  ``remove_punctuation``, the verse is split on single spaces, each token is
  lemmatized with ``return_lemma`` and a fixed two-character sequence is
  removed from every lemma before the lemmata are joined with spaces.

  Parameters:
    df (pandas.DataFrame): Frame with a ``Speech`` column. It is modified
      in place.

  Return:
    pandas.DataFrame: The same frame, now carrying the ``lemmata`` column.
  """
  lemmata = []
  for speech in df["Speech"]:
    verse = remove_punctuation(speech)
    lemmata.append(" ".join(return_lemma(word).replace(" ̓", "") for word in verse.split(" ")))

  # Assign the column in one step and by position: a label-based
  # ``df.loc[index, ...]`` write per row would overwrite several rows at once
  # whenever the index contains duplicate labels.
  df["lemmata"] = lemmata
  return df

def create_lemmata_counts_df(title, read_path, write_path):
  """Count the lemmata of one text and store the counts as a CSV file.

  Reads ``{read_path}/{title}.csv``, splits every entry of its ``lemmata``
  column on single spaces, counts the non-empty tokens and writes the counts,
  sorted by descending ``COUNT``, to ``{write_path}/{title}.csv`` with the
  lemma as index column.

  Parameters:
    title (str): File name (without ``.csv``) of the text to process.
    read_path (str): Folder containing the input CSV.
    write_path (str): Folder receiving the counts CSV.

  Return:
    pandas.DataFrame: Columns ``lemma`` and ``COUNT``, sorted by ``COUNT``
      in descending order.
  """
  df = pd.read_csv(f'{read_path}/{title}.csv')

  # Rows without lemmata are read back as NaN; skip them, otherwise
  # str(NaN) would be counted as a lemma called "nan".
  tokens = [
      token
      for verse in df.lemmata.dropna()
      for token in str(verse).split(" ")
      if token != ""
  ]

  lemma_df = (
      pd.DataFrame(Counter(tokens), index=["COUNT"])
      .T.reset_index()
      .rename(columns={"index": "lemma"})
      .sort_values("COUNT", ascending=False)
  )
  lemma_df.set_index(lemma_df.columns[0]).to_csv(f'{write_path}/{title}.csv')
  return lemma_df

def save_file(df, path, title, first_col_as_index=True):
  """Write ``df`` to ``{path}/{title}.csv``.

  When ``first_col_as_index`` equals ``True`` the first column is turned into
  the index before writing, so it becomes the leading CSV column. Otherwise
  the frame is written unchanged, together with its current index.
  """
  frame_to_write = df.set_index(df.columns[0]) if first_col_as_index == True else df
  destination = f'{path}/{title}.csv'
  frame_to_write.to_csv(destination)

### Functions (bootstrapping)

In [None]:


def sample_distributions(df, number_of_iterations):
  """Build the observed lemma counts plus bootstrap resamples of them.

  All entries of ``df.lemmata`` are split on single spaces into one flat list
  of non-empty lemmata. The first returned frame holds the observed counts
  (column ``COUNT``); each further frame holds the counts of one resample,
  drawn with replacement and of the same size as the original list, in a
  column named after the iteration number.

  Parameters:
    df (pandas.DataFrame): Frame with a ``lemmata`` column.
    number_of_iterations (int): Number of bootstrap resamples to draw.

  Return:
    tuple: ``(frames, all_lemmata)`` - the list of count frames and the flat
      list of lemmata they were drawn from.
  """
  # Rows without lemmata are NaN after reading the CSV; skip them, otherwise
  # str(NaN) would enter the pool as a lemma called "nan".
  all_lemmata = [
      lemma
      for verse in df.lemmata.dropna()
      for lemma in str(verse).split(" ")
      if lemma != ""
  ]
  sample_size = len(all_lemmata)

  frames = [pd.DataFrame(Counter(all_lemmata), index=["COUNT"]).T]
  for iteration in range(number_of_iterations):
    # A dedicated generator seeded with the iteration number draws the same
    # resample as np.random.seed(iteration) followed by np.random.choice,
    # but leaves the global NumPy random state of the caller untouched.
    rng = np.random.RandomState(iteration)
    resample = rng.choice(all_lemmata, sample_size, replace=True).tolist()
    frames.append(pd.DataFrame(Counter(resample), index=[iteration]).T)

  return frames, all_lemmata

def bootstrap_summary_table_from_sampled_distributions(frames, all_lemmata, z_score, title=None):
  """Summarise observed and bootstrapped lemma counts in one table.

  The frames are concatenated column-wise (lemmata missing from a sample
  count as 0) and per-lemma statistics are computed across all columns,
  i.e. including the observed counts in ``frames[0]``.

  Parameters:
    frames (list): Count frames as returned by ``sample_distributions``;
      ``frames[0]`` must hold the observed counts in a ``COUNT`` column.
    all_lemmata (list): Flat list of lemmata; only its length is used (``ALL``).
    z_score (float): Multiplier of the standard deviation for ``LOW``/``HIGH``.
    title (str, optional): Value written to the ``title`` column. When
      omitted, the module-level variable ``title`` is used if it exists,
      which is what earlier three-argument callers relied on.

  Return:
    pandas.DataFrame: One row per lemma with the columns ``lemma``, ``COUNT``,
      ``MEAN`` (rounded), ``MAX``, ``MIN``, ``STD``, ``LOW`` (clipped at 0 and
      rounded), ``HIGH`` (rounded), ``ALL``, the ``*_perc`` shares of ``ALL``,
      ``title`` and ``lemma_min``.
  """
  if title is None:
    # Backward compatibility: the function used to read a global ``title``.
    title = globals().get("title")

  counts_per_sample = pd.concat(frames, axis=1).fillna(0)

  mean_counts = counts_per_sample.mean(axis=1).round().rename("MEAN")
  summary = pd.merge(frames[0], mean_counts.to_frame(), left_index=True, right_index=True)
  summary["MAX"] = counts_per_sample.max(axis=1)
  summary["MIN"] = counts_per_sample.min(axis=1)
  summary["STD"] = counts_per_sample.std(axis=1)

  # The interval is built around the already rounded mean; counts cannot be
  # negative, so the lower bound is clipped at 0 before rounding.
  spread = z_score * summary["STD"]
  summary["LOW"] = (summary["MEAN"] - spread).clip(lower=0).round()
  summary["HIGH"] = (summary["MEAN"] + spread).round()
  summary["ALL"] = len(all_lemmata)

  # Express each count-like column as a share of all lemmata.
  for column in ("MIN", "LOW", "COUNT", "HIGH", "MAX"):
    summary[f"{column}_perc"] = summary[column] / summary["ALL"]

  summary["title"] = title
  summary = summary.reset_index().rename(columns={"index": "lemma"})
  summary["lemma_min"] = summary.lemma.apply(lambda x: x.lower().replace(" ̓", ""))

  return summary



## Execution

In [None]:
# Author whose texts are processed; selects the Drive sub-folders below.
author = "sophocles"
# Folder with the text CSV files (one per title) of that author.
path_texts = f"/content/drive/MyDrive/Colab Notebooks/workWithBen/greekTexts/{author}"
# Folder that receives the lemma summary tables of that author.
path_lemmata = f"/content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/{author}"
# Titles are derived from the file names in path_texts that contain ".csv"
# (".csv" removed, surrounding whitespace stripped).
titles = [x.replace(".csv", "").strip() for x in os.listdir(path_texts) if ".csv" in x]
# Show the resulting list as the cell output.
titles

['Antigone',
 'Oedipus Colonus',
 'Ajax',
 'Electra',
 'Oedipus Tyrannus',
 'Philoctetes',
 'Trachiniae']

### Add lemmata columns to the CSV files containing the original text

In [None]:
# Preview the text CSV of the first title in the list.
pd.read_csv(f'{path_texts}/{titles[0]}.csv')

Unnamed: 0,Name,Speech,Lemmas,verse_number,inferred_verse_number
0,Ἀντιγόνη,"ὦ κοινὸν αὐτάδελφον Ἰσμήνης κάρα,",εἰμί κοινός αὐτάδελφος κάρ,1.0,1.0
1,Ἀντιγόνη,ἆρ᾽ οἶσθ᾽ ὅ τι Ζεὺς τῶν ἀπ᾽ Οἰδίπου κακῶν,ἄρα ὀιστός ὅς τίς Ζεύς ὁ Ἆπις Οἰδίπους κάκη,,2.0
2,Ἀντιγόνη,ὁποῖον οὐχὶ νῷν ἔτι ζώσαιν τελεῖ;,ὁποῖος οὐ ἐγώ ἔτι ζάω τέλλω,,3.0
3,Ἀντιγόνη,οὐδὲν γὰρ οὔτ᾽ ἀλγεινὸν οὔτ᾽ ἄτης ἄτερ,οὐδείς γάρ οὔτε ἀλγεινός οὔτε ἄτη ἄτερ,,4.0
4,Ἀντιγόνη,"οὔτ᾽ αἰσχρὸν οὔτ᾽ ἄτιμόν ἐσθ᾽, ὁποῖον οὐ",οὔτε αἰσχρός οὔτε ἄτιμος ἔσθω ὁποῖος οὐ,5.0,5.0
...,...,...,...,...,...
1228,Χορός,πρῶτον ὑπάρχει. χρὴ δὲ τά γ᾽ εἰς θεοὺς,πρότερος ὑπάρχω χράω δέ ὁ γε εἶμι θεός,,1348.0
1229,Χορός,μηδὲν ἀσεπτεῖν. μεγάλοι δὲ λόγοι,μηδείς ἀσεπτέω μέγας δέ λόγος,1350.0,1350.0
1230,Χορός,μεγάλας πληγὰς τῶν ὑπεραύχων,μέγας πλήξ ὁ ὑπέραυχος,,1351.0
1231,Χορός,ἀποτίσαντες,ἀποτίνω,,1352.0


In [None]:
# For every title: load its text CSV, tag all rows with the title, add the
# lemmata column and write the result back to the same file in path_texts.
for title in titles:
  df = pd.read_csv(f'{path_texts}/{title}.csv')
  df["title"] = title
  df = add_lemmata_column(df)
  # save_file uses the first column as index by default.
  save_file(df, path_texts, title)

### Bootstrap distribution summaries

In [None]:
# Number of bootstrap resamples drawn per title.
number_of_iterations = 1000
# Multiplier of the bootstrap standard deviation used for the LOW/HIGH bounds.
z_score = 1.96

# For every title: resample its lemmata, summarise the resampled counts and
# store the summary table in path_lemmata.
# NOTE: `title` is not passed to the summary function, which relies on the
# global variable of that name, so the loop variable must keep this name.
for title in titles:
  df = pd.read_csv(f'{path_texts}/{title}.csv')
  frames, all_lemmata = sample_distributions(df, number_of_iterations)
  bootstrap_summary_df = bootstrap_summary_table_from_sampled_distributions(frames, all_lemmata, z_score)
  save_file(bootstrap_summary_df, path_lemmata, title)

### Zip files

In [None]:
# Shell command: recursively zip the sophocles lemma-table folder on Drive
# into /content/sophocles_lemmata_tables.zip.
!zip -r /content/sophocles_lemmata_tables.zip "/content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles"

  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/ (stored 0%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Antigone.csv (deflated 84%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Oedipus Colonus.csv (deflated 85%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Ajax.csv (deflated 84%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Electra.csv (deflated 84%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Oedipus Tyrannus.csv (deflated 85%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Philoctetes.csv (deflated 84%)
  adding: content/drive/MyDrive/Colab Notebooks/workWithBen/lemmataTables/sophocles/Trachiniae.csv (deflated 84%)


In [None]:
# Display the summary table currently bound to bootstrap_summary_df; after
# the bootstrap loop this is the table of the last processed title.
bootstrap_summary_df

Unnamed: 0,lemma,COUNT,MEAN,MAX,MIN,STD,LOW,HIGH,ALL,MIN_perc,LOW_perc,COUNT_perc,HIGH_perc,MAX_perc,title,lemma_min
0,λόγος,27,27.0,49.0,13.0,5.428900,16.0,38.0,6506,0.001998,0.002459,0.004150,0.005841,0.007532,Trachiniae,λόγος
1,μέν,59,59.0,85.0,38.0,7.757176,44.0,74.0,6506,0.005841,0.006763,0.009069,0.011374,0.013065,Trachiniae,μέν
2,εἰμί,91,91.0,119.0,53.0,9.212906,73.0,109.0,6506,0.008146,0.011220,0.013987,0.016754,0.018291,Trachiniae,εἰμί
3,ἀρχαῖος,2,2.0,6.0,0.0,1.416559,0.0,5.0,6506,0.000000,0.000000,0.000307,0.000769,0.000922,Trachiniae,ἀρχαῖος
4,ἄνθρωπος,6,6.0,16.0,0.0,2.413443,1.0,11.0,6506,0.000000,0.000154,0.000922,0.001691,0.002459,Trachiniae,ἄνθρωπος
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1885,ἑστῶτ,1,1.0,7.0,0.0,0.994969,0.0,3.0,6506,0.000000,0.000000,0.000154,0.000461,0.001076,Trachiniae,ἑστῶτ
1886,χαλεπός,1,1.0,5.0,0.0,1.005885,0.0,3.0,6506,0.000000,0.000000,0.000154,0.000461,0.000769,Trachiniae,χαλεπός
1887,ὑπέχω,1,1.0,6.0,0.0,1.027270,0.0,3.0,6506,0.000000,0.000000,0.000154,0.000461,0.000922,Trachiniae,ὑπέχω
1888,παρατίθημι,1,1.0,6.0,0.0,0.959683,0.0,3.0,6506,0.000000,0.000000,0.000154,0.000461,0.000922,Trachiniae,παρατίθημι
