This notebook provides functions for group-level analysis of the sound deviations observed in individual plays. As input, it takes a file with the standard deviations per sound for each play. The functions report the percentage of plays with at least one deviation, the average number of deviations per play (both over the whole corpus and over only those plays that have at least one deviation), and the percentage of values where a sound occurs more frequently, or less frequently, than average. In addition, the number of times each individual sound deviates is reported, ordered from the most frequently to the least frequently deviating sound.

In [None]:
import pandas as pd

In [None]:
def provide_percentages_of_deviations_in_plays(df, abs_values_2stdev):
    """Print summary statistics on deviating sounds across a group of plays.

    Five figures are printed: the percentage of plays (rows) with at least
    one flagged deviation, the mean number of flagged deviations per play
    over all plays, the same mean restricted to plays with at least one
    flagged deviation, and the percentages of all cells in ``df`` that are
    ``>= 2`` and ``<= -2`` respectively.

    Args:
        df: DataFrame of numeric values, one row per play and one column per
            sound. The hard-coded thresholds of +2 / -2 assume the values
            are expressed in standard deviations from the mean.
        abs_values_2stdev: Boolean DataFrame aligned with ``df`` in which
            ``True`` marks a deviating cell.

    Returns:
        None. The results are written to standard output.
    """
    # Per play: does it contain any flagged cell, and how many?
    rows_with_at_least_one_deviation = abs_values_2stdev.any(axis=1)
    deviations_per_play = abs_values_2stdev.sum(axis=1)

    # Share of plays containing at least one deviation.
    percentage_containing_deviation = (rows_with_at_least_one_deviation.sum() / len(df)) * 100

    # Mean number of deviations per play, over all plays in the group.
    avg_n_deviations_all_rows = deviations_per_play.mean()

    # Mean number of deviations per play, over plays with at least one deviation.
    avg_n_deviations_deviating_rows = deviations_per_play[rows_with_at_least_one_deviation].mean()

    # Share of all cells at or above +2 (sound occurs more often than average).
    positive_2stdeviations = (df >= 2).sum().sum() / df.size * 100

    # Share of all cells at or below -2 (sound occurs less often than average).
    negative_2stdeviations = (df <= -2).sum().sum() / df.size * 100

    print(f"Percentage of plays with at least one deviating sound (over 2 stdev): {percentage_containing_deviation:.2f}%")

    # The two averages are counts per play, not percentages, so no "%" suffix.
    print(f"Average sound deviations in the group (all plays): {avg_n_deviations_all_rows:.2f}")
    print(f"Average sound deviations in the group (only plays with at least one deviation): {avg_n_deviations_deviating_rows:.2f}")
    print(f"Percentage of sounds occurring (at least 2 stddev) more than average (all plays in group): {positive_2stdeviations:.2f}%")
    print(f"Percentage of sounds occurring (at least 2 stddev) less than average (all plays in group): {negative_2stdeviations:.2f}%")

In [None]:
def provide_overview_of_most_commonly_deviating_sounds(df, abs_values2stdev):
    """Print how often each sound deviates, most frequent first.

    Args:
        df: Not used by this function; kept so that its signature parallels
            the per-play statistics function.
        abs_values2stdev: Boolean DataFrame with one column per sound, in
            which ``True`` marks a deviating cell.

    Returns:
        None. The sorted counts are written to standard output.
    """
    # Summing a boolean column counts its True cells, i.e. deviations per sound.
    deviation_counts = abs_values2stdev.sum()
    sorted_deviations = deviation_counts.sort_values(ascending=False)

    # To restrict the overview to the top entries, apply .head(n) before printing.
    print(sorted_deviations)

In [None]:
# Path to the input CSV file; adjust it to point at your own data.
my_input_csv_file = "../original_outdir/original_study_out_corpus_diffstddev.csv"

# Load the CSV; header=0 makes the first row supply the column names (the phenomena).
df = pd.read_csv(my_input_csv_file, header=0)

# Keep every column except the first one, which holds the filenames.
numeric_df = df.iloc[:, 1:]

# Flag the values whose magnitude is at least 2 standard deviations.
abs_values_2stdev = numeric_df.abs().ge(2)

# Statistics per play.
provide_percentages_of_deviations_in_plays(numeric_df, abs_values_2stdev)

# Statistics per sound.
provide_overview_of_most_commonly_deviating_sounds(numeric_df, abs_values_2stdev)
