This notebook provides functions for creating a CSV file containing all counts from a single corpus, as well as functions that calculate the percentage per sound, the standard deviation of those percentages over the corpus, and how many standard deviations each sound deviates from the corpus total.

In [1]:
import sys
import os
import pandas as pd
import statistics

In [2]:
def obtain_counts(inputdir, totals=False):
    """Collect the per-play count files of one corpus into a single DataFrame.

    Every file in ``inputdir`` whose name ends in ``_counts.csv`` is read with
    ``pandas.read_csv`` and contributes one row, labelled with the file name
    minus that suffix. A final row labelled ``'Total'`` holds the column sums.

    Each file is expected to hold exactly one data row, because its index is
    replaced by a single label (the play name).

    Parameters
    ----------
    inputdir : str
        Directory containing the ``*_counts.csv`` files.
    totals : bool, default False
        If True, also add a ``'Total'`` column holding the sum of each row
        (computed after the ``'Total'`` row was added, so that row is summed
        as well).

    Returns
    -------
    pandas.DataFrame
        One row per count file, ordered by file name, followed by the
        ``'Total'`` row.

    Raises
    ------
    ValueError
        If ``inputdir`` contains no file ending in ``_counts.csv``.
    """
    suffix = "_counts.csv"

    counts_collection = []
    # os.listdir returns entries in arbitrary order; sort so the row order is
    # the same on every run and every machine.
    for filename in sorted(os.listdir(inputdir)):
        if not filename.endswith(suffix):
            continue
        play_counts = pd.read_csv(os.path.join(inputdir, filename))
        # Slice the suffix off. str.rstrip(suffix) would instead strip any
        # trailing characters from the set "_counts.csv" and eat into the
        # play name itself (e.g. "hamlet" -> "hamle").
        play_name = filename[:-len(suffix)]
        play_counts.index = [play_name]
        counts_collection.append(play_counts)

    if not counts_collection:
        raise ValueError(
            "No files ending in '" + suffix + "' found in " + repr(inputdir)
        )

    corpus_counts = pd.concat(counts_collection)

    # Total sum per column:
    corpus_counts.loc['Total', :] = corpus_counts.sum(axis=0)

    # Total sum per row:
    if totals:
        corpus_counts.loc[:, 'Total'] = corpus_counts.sum(axis=1)

    return corpus_counts

In [3]:
def obtain_percentages(corpus_counts):
    """Convert raw counts into per-row proportions.

    Every value is divided by the sum of its own row, so each row with a
    non-zero sum adds up to 1 in the result (values are fractions; they are
    not multiplied by 100). The input frame is left untouched.

    Note that *all* columns take part in the row sum. A frame that already
    carries a row-total column would have that column counted as well, so
    pass the counts without such a column.

    Parameters
    ----------
    corpus_counts : pandas.DataFrame
        Numeric counts, one row per item.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``corpus_counts``, float dtype.
    """
    row_totals = corpus_counts.sum(axis=1)
    # Vectorised row-wise division returns a new float frame directly. This
    # avoids writing floats back into (possibly integer) columns via iloc.
    return corpus_counts.div(row_totals, axis=0).astype(float)

In [4]:

def obtain_stdevs(corpus_stats):
    """Return the per-column standard deviation over all rows but the last.

    The final row is treated as the totals row and is removed (by its index
    label) before the deviation is computed, since totals should not count as
    an observation. The input frame is not modified.

    Parameters
    ----------
    corpus_stats : pandas.DataFrame
        Table whose last row holds the totals.

    Returns
    -------
    pandas.Series
        Result of ``DataFrame.std()`` (pandas defaults) on the remaining rows,
        indexed by column name.
    """
    # Label of the final row; an empty selection when the frame has no rows.
    totals_label = corpus_stats.index[-1:]
    per_item_stats = corpus_stats.drop(index=totals_label)
    return per_item_stats.std()

In [5]:

def create_outfile_name(inputdir):
    """Derive the corpus-counts file name from the input directory's name.

    The last component of ``inputdir`` is used as the corpus name and
    ``"_corpus_counts.csv"`` is appended to it.

    Parameters
    ----------
    inputdir : str
        Path of the directory holding the corpus count files, with or without
        trailing path separators.

    Returns
    -------
    str
        File name (no directory part), e.g. ``"plays_corpus_counts.csv"`` for
        ``"data/plays/"``.
    """
    # normpath removes trailing and duplicated separators, so "plays",
    # "plays/" and "plays//" all yield the same corpus name; basename then
    # picks the last path component using the platform's separator rules.
    corpus_name = os.path.basename(os.path.normpath(inputdir))
    return corpus_name + "_corpus_counts.csv"

In [6]:

def get_all_corpus_statistics(inputdir, outputdir):
    '''Obtain and write out all corpus statistics in one go.

    Runs the chain counts -> percentages -> standard deviations for the count
    files in ``inputdir`` and writes three CSV files into ``outputdir``. Their
    names are derived from the name returned by ``create_outfile_name``:
    the counts file itself, plus the same name with ``_counts.csv`` replaced
    by ``_stats.csv`` and by ``_stdev.csv``.

    Parameters
    ----------
    inputdir : str
        Directory containing the per-play count files.
    outputdir : str
        Directory to write to; it is created (including missing parent
        directories) when it does not exist yet.

    Returns
    -------
    None
    '''
    corpus_counts = obtain_counts(inputdir)
    corpus_stats = obtain_percentages(corpus_counts)
    corpus_std = obtain_stdevs(corpus_stats)

    # make sure the output directory exists; makedirs also creates missing
    # parents and does not fail when the directory is already there
    os.makedirs(outputdir, exist_ok=True)

    # write out corpus counts, percentages and standard deviations to file
    outfilename = create_outfile_name(inputdir)
    corpus_counts.to_csv(os.path.join(outputdir, outfilename), index=True)
    stats_out = outfilename.replace("_counts.csv", "_stats.csv")
    corpus_stats.to_csv(os.path.join(outputdir, stats_out))
    std_out = outfilename.replace("_counts.csv", "_stdev.csv")
    corpus_std.to_csv(os.path.join(outputdir, std_out))

In [7]:



def subtract_total_row(df):
    """Return every non-'Total' row of ``df`` minus the 'Total' row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a row whose index label is ``'Total'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the ``'Total'`` row, with the ``'Total'`` row's values
        subtracted column by column from each remaining row.

    Raises
    ------
    ValueError
        If ``df`` has no row labelled ``'Total'``.
    """
    if 'Total' in df.index:
        # Subtraction aligns the 'Total' row with the frame's columns.
        return df.drop(index='Total') - df.loc['Total']

    raise ValueError("DataFrame must have a row with index 'Total'")



def analyze_corpus(statsfile, stdev, outputfile):
    """Write how many standard deviations each value lies from the 'Total' row.

    Parameters
    ----------
    statsfile : str
        CSV file with a ``'Total'`` row; its first column is used as index.
    stdev : str
        CSV file with the standard deviations; its first column is used as
        index and its first value column supplies the deviations.
    outputfile : str
        Path of the CSV file to write the result to.

    Returns
    -------
    None
    """
    # The stdev file stores its values as a single column; transposing turns
    # that column into one row, which is then picked out as a Series keyed by
    # the file's row labels.
    stdev_per_sound = pd.read_csv(stdev, index_col=0).T.iloc[0]

    # difference between each row's value and the value in the 'Total' row
    offsets = subtract_total_row(pd.read_csv(statsfile, index_col=0))

    # express each difference in units of the matching standard deviation
    (offsets / stdev_per_sound).to_csv(outputfile)

In [8]:
#Step 1: call function to obtain counts spreadsheet

#define input directory
inputdir = '../original_study_out/'
corpus_counts = obtain_counts(inputdir)

#define output directory and create it if needed
#(makedirs also creates missing parent directories and is a no-op when the directory already exists)
outputdir = '../extra_nb_test/'
os.makedirs(outputdir, exist_ok=True)
#this can also be replaced by an outfilename of choice
outfilename = create_outfile_name(inputdir)

#write out counts output file
corpus_counts.to_csv(os.path.join(outputdir, outfilename), index=True)

In [11]:
#Step 2: obtain percentages and standard deviations.
#The current setup assumes the previous cell has been run right before, so that
#corpus_counts, outputdir and outfilename are defined.

corpus_stats = obtain_percentages(corpus_counts)
corpus_std = obtain_stdevs(corpus_stats)

#derive both output paths from the counts file name
stats_out = os.path.join(outputdir, outfilename.replace("_counts.csv", "_stats.csv"))
std_out = os.path.join(outputdir, outfilename.replace("_counts.csv", "_stdev.csv"))

#write out percentages and standard deviations
corpus_stats.to_csv(stats_out)
corpus_std.to_csv(std_out)

In [12]:
#Step 3: create an overview of how many standard deviations each sound deviates
#The current setup assumes the previous cells were run just before; if not,
#define outputdir, outfilename, stats_out and std_out first

#write the overview into outputdir, next to the other output files, and keep
#the "_" separator between the corpus name and the file-type suffix
stddev_overview_out = os.path.join(
    outputdir, outfilename.replace("_counts.csv", "_difference_in_stdev.csv")
)
analyze_corpus(stats_out, std_out, stddev_overview_out)