In [2]:
import pandas as pd
import numpy as np
import re, os
from collections import Counter

In [41]:
def sample_distributions(df, number_of_iterations):
    """Count lemma frequencies and draw bootstrap resamples of the lemma list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Lemmas`` column. Every non-missing cell is converted
        with ``str`` and split on single spaces; empty tokens are discarded.
    number_of_iterations : int
        How many resamples (with replacement, same size as the lemma list)
        to draw.

    Returns
    -------
    frames : list of pandas.DataFrame
        ``frames[0]`` holds the observed counts in a single column named
        ``"COUNT"`` (lemmas form the index). Each following frame holds the
        counts of one resample in a single column named after its iteration
        number (``0 .. number_of_iterations - 1``).
    all_lemmas : list of str
        The flat list of lemma tokens the counts were computed from.
    """
    # Missing cells are dropped first: str(NaN) would otherwise produce the
    # token "nan" and it would be counted as if it were a lemma.
    all_lemmas = [
        lemma
        for verse in df["Lemmas"].dropna()
        for lemma in str(verse).split(" ")
        if lemma != ""
    ]

    frames = [pd.DataFrame(Counter(all_lemmas), index=["COUNT"]).T]

    # Convert the token list to an array once instead of on every draw.
    lemma_pool = np.array(all_lemmas)
    for iteration in range(number_of_iterations):
        # A local legacy generator seeded with the iteration number produces
        # the same draws as np.random.seed(iteration) + np.random.choice, but
        # leaves the global NumPy random state of the caller untouched.
        rng = np.random.RandomState(iteration)
        resample = rng.choice(lemma_pool, len(all_lemmas), replace=True)
        frames.append(pd.DataFrame(Counter(resample), index=[iteration]).T)

    return frames, all_lemmas

def bootstrap_summary_table_from_sampled_distributions(frames, all_lemmata, z_score, title=None):
    """Summarise observed and resampled lemma counts in one table.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Single-column count frames indexed by lemma. ``frames[0]`` must carry
        the observed counts in a column named ``"COUNT"``.
    all_lemmata : sequence
        The lemma tokens; only its length is used (column ``ALL``).
    z_score : float
        Multiplier applied to the standard deviation for ``LOW`` / ``HIGH``.
    title : optional
        Value written to the ``title`` column. When omitted, the module-level
        name ``title`` is used, which is how callers that pass only three
        arguments have always been served.

    Returns
    -------
    pandas.DataFrame
        One row per lemma with the columns ``lemma``, ``COUNT``, ``MEAN``,
        ``MAX``, ``MIN``, ``STD``, ``LOW``, ``HIGH``, ``ALL``, the five
        ``*_perc`` ratios, ``title`` and ``lemma_lower``.

    Raises
    ------
    NameError
        If ``title`` is omitted and no module-level ``title`` exists.
    """
    if title is None:
        # Backward compatibility: the function used to read the global
        # `title` implicitly. Keep that working, but fail with a clear
        # message instead of an unexplained NameError deep in the body.
        try:
            title = globals()["title"]
        except KeyError:
            raise NameError(
                "no title given: pass title=... or define a module-level `title`"
            ) from None

    # Lemmas absent from a resample get a count of 0 for that resample.
    # NOTE(review): the observed COUNT column (frames[0]) is part of this
    # table, so it also enters MEAN/MAX/MIN/STD - confirm this is intended.
    counts = pd.concat(frames, axis=1).fillna(0)

    # MEAN is rounded to a whole number before LOW/HIGH are derived from it.
    mean_df = counts.mean(axis=1).round().to_frame("MEAN")
    summary = pd.merge(frames[0], mean_df, left_index=True, right_index=True)
    summary["MAX"] = counts.max(axis=1)
    summary["MIN"] = counts.min(axis=1)
    summary["STD"] = counts.std(axis=1)

    spread = z_score * summary["STD"]
    # A count cannot be negative, so the lower bound is clamped at 0.
    summary["LOW"] = (summary["MEAN"] - spread).clip(lower=0).round()
    summary["HIGH"] = (summary["MEAN"] + spread).round()
    summary["ALL"] = len(all_lemmata)

    # Express each count-like column as a share of the total token count.
    for column in ("MIN", "LOW", "COUNT", "HIGH", "MAX"):
        summary[f"{column}_perc"] = summary[column] / summary["ALL"]

    summary["title"] = title
    summary = summary.reset_index().rename(columns={"index": "lemma"})
    summary["lemma_lower"] = summary["lemma"].apply(
        lambda lemma: lemma.lower().replace(" ̓", "")
    )

    return summary

def save_file(df, path, title, first_col_as_index=True):
    """Write ``df`` to ``<path>/<title>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    path : str
        Output directory; it is created (including parents) when missing.
    title : str
        File name without the ``.csv`` extension.
    first_col_as_index : bool, default True
        When true, the first column of ``df`` is used as the index of the
        written table, so no separate row-number column appears in the file.
        Otherwise ``df`` is written with its existing index.
    """
    target = os.path.join(path, f"{title}.csv")

    # to_csv does not create directories, so make sure the target folder
    # exists before writing.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    table = df.set_index(df.columns[0]) if first_col_as_index else df
    table.to_csv(target)



In [6]:
# Author whose texts are processed; both data folders are keyed by this name.
author = "sophocles"
# Folder the per-title input CSV files are listed and read from.
path_texts = f"dataFiles/texts_with_lemmas/{author}"
# Folder the per-title summary tables are written to.
path_lemmata = f"dataFiles/lemma_tables/{author}"
# A title is a file name minus its ".csv" suffix. Only names that END in
# ".csv" qualify, and only that suffix is removed: a substring test plus
# replace() would also pick up names such as "x.csv.bak" and would strip
# ".csv" from the middle of a name.
titles = [
    file_name[: -len(".csv")].strip()
    for file_name in os.listdir(path_texts)
    if file_name.endswith(".csv")
]
titles

['Oedipus Tyrannus',
 'Electra',
 'Antigone',
 'Oedipus Colonus',
 'Trachiniae',
 'Ajax',
 'Philoctetes']

In [8]:
# Read the CSV of the first entry in `titles` into `df`.
# Note: the processing loop further down rebinds `df` for every title.
df = pd.read_csv(f'{path_texts}/{titles[0]}.csv')

In [20]:
#pd.DataFrame(Counter([item for sublist in [line.split() for line in df.Lemmas.to_list()] for item in sublist]), index=[0]).T.sort_values(0, ascending=False).head(50)

In [45]:
# Bootstrap settings: number of resamples drawn per title, and the multiplier
# applied to the standard deviation when the LOW/HIGH bounds are computed.
number_of_iterations = 1000
z_score = 1.96

# For every title: read its CSV, resample its lemmas, build the summary table
# and write it to <path_lemmata>/<title>.csv.
for title in titles:
  df = pd.read_csv(f'{path_texts}/{title}.csv')
  frames, all_lemmas = sample_distributions(df, number_of_iterations)
  # `title` is not passed here; the summary function resolves the module-level
  # name `title`, so this loop variable must keep exactly this name.
  bootstrap_summary_df = bootstrap_summary_table_from_sampled_distributions(frames, all_lemmas, z_score)
  save_file(bootstrap_summary_df, path_lemmata, title)

In [48]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel run shows all dependencies up front.
import plotly.express as px
# IPython help lookup (`?` suffix) that prints the signature of px.bar;
# interactive exploration only, not part of the analysis.
px.bar?

[0;31mSignature:[0m
[0mpx[0m[0;34m.[0m[0mbar[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata_frame[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mx[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolor[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpattern_shape[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet_row[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet_col[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet_col_wrap[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet_row_spacing[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfacet_col_spacing[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhover_name[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;