In [2]:
import pandas as pd

In [3]:
def read_in_data(path='../data/genderedCorpus.csv'):
    """
    Loads the corpus CSV into a DataFrame with named columns.

    The column names are supplied explicitly, so every row of the file
    (including the first) is read as data.

    path: location of the comma-separated file to read. Defaults to the
          corpus file used by this notebook, so existing calls with no
          argument behave exactly as before.

    Returns a DataFrame with the columns call, conversation_topic,
    person_and_type, start and end.
    """
    header_names = ["call", "conversation_topic", "person_and_type", "start", "end"]
    return pd.read_csv(path, sep=',', names=header_names)

def get_conversation_topic_df(df, topic):
    """
    Selects the rows of df whose conversation_topic value equals topic.
    The original row labels are kept on the returned rows.
    """
    is_requested_topic = df['conversation_topic'] == topic
    return df.loc[is_requested_topic]

def remove_conversation_topic_df(df, topic):
    """
    Gets the rows whose conversation_topic is not one of the given topics.

    topic: either a string of topic names delimited by a comma (names are
           matched exactly as written, no whitespace is trimmed), or any
           iterable of topic names. The iterable form makes it possible to
           remove a topic whose name itself contains a comma.

    Returns the remaining rows with their original row labels.
    """
    if isinstance(topic, str):
        unwanted_topics = topic.split(",")
    else:
        unwanted_topics = list(topic)
    # One membership test replaces filtering the frame once per topic.
    keep_row = ~df['conversation_topic'].isin(unwanted_topics)
    return df.loc[keep_row]

def get_call_df(df, call_id):
    """
    Selects the rows of df whose call value equals call_id.
    """
    belongs_to_call = df['call'] == call_id
    return df.loc[belongs_to_call]

def get_all_call_ids(df):
    """
    Returns an array holding each distinct value of the call column once.
    """
    call_column = df['call']
    return call_column.unique()

def get_all_conversation_topics(df):
    """
    Returns an array holding each distinct value of the conversation_topic
    column once.
    """
    topic_column = df['conversation_topic']
    return topic_column.unique()
# Zeroed tally templates; the analysis cells pass copies of these to the
# per-event helpers rather than the templates themselves.
gender_types = dict.fromkeys(
    [
        "caller_F receiver_M",
        "caller_F receiver_F",
        "caller_M receiver_F",
        "caller_M receiver_M",
    ],
    0,
)
cue_types = dict.fromkeys(["laughter", "silence", "filler", "bc"], 0)

# Load the corpus once; every later cell works from this frame.
df = read_in_data()


In [4]:
def occurrence_of_each_event(df, types):
    """
    Tallies how many rows carry each name in types.

    A row counts towards a name when that name appears as a substring of
    the row's person_and_type label. Counts are added on top of whatever
    values types already holds, and the same dict is returned.
    """
    label_counts = df['person_and_type'].value_counts()
    for label in label_counts.index:
        # Work out which names this label contributes to before updating.
        matched_names = [name for name in types if name in label]
        for name in matched_names:
            types[name] += label_counts[label]
    return types

def total_time_of_each_event(df, types):
    """
    This calculates the total time of each cue in the dataset, returning a dictionary of each cue.

    df:    frame with person_and_type, start and end columns.
    types: dict whose keys are the cue names to total. Each value is
           overwritten with the summed (end - start) of the rows whose
           person_and_type label contains the key; the same dict is returned.

    Keys are matched as literal substrings (not regular expressions), and
    rows with a missing label never match, so neither special characters
    in a key nor gaps in the data break the row selection.
    """
    labels = df["person_and_type"]
    durations = df["end"] - df["start"]  # same for every cue, so computed once
    for cue in types:
        has_cue = labels.str.contains(cue, regex=False, na=False)
        types[cue] = durations[has_cue].sum()
    return types

def mean_time_of_each_event(df, types):
    """
    Gets the mean duration of each entry in types. Returns a dictionary.

    df:    frame with person_and_type, start and end columns.
    types: dict whose keys are the names to average. Each value is
           overwritten with the mean (end - start) of the rows whose
           person_and_type label contains the key; the same dict is returned.
           A key that matches no rows gets NaN.

    The keys passed by the caller are the ones used; they are no longer
    replaced by a fixed set of cue names. Keys are matched as literal
    substrings, and rows with a missing label never match.
    """
    labels = df["person_and_type"]
    durations = df["end"] - df["start"]  # same for every key, so computed once
    for cue in types:
        has_cue = labels.str.contains(cue, regex=False, na=False)
        types[cue] = durations[has_cue].mean()
    return types

In [5]:
# Per-cue statistics. Each helper gets its own copy of the zeroed template
# so the shared cue_types / gender_types dicts are never written to.
occ_count = occurrence_of_each_event(df, cue_types.copy())
time_total = total_time_of_each_event(df, cue_types.copy())
mean_time = mean_time_of_each_event(df, cue_types.copy())

# Statistics for caller/receiver overlap labels.
overlap_occurrences = occurrence_of_each_event(df, gender_types.copy())
# The original, misspelled name is kept as an alias in case other cells use it.
overlap_occurrencesrences = overlap_occurrences
overlap_total_time = total_time_of_each_event(df, gender_types.copy())
overlap_mean_time = mean_time_of_each_event(df, gender_types.copy())

# Displayed as the cell output: every distinct person_and_type label.
df.person_and_type.unique()

array(['silence', 'receiver_M', 'caller_F', 'laughter_cF',
       'caller_F laughter_rM', 'laughter_rM', 'caller_F filler_rM',
       'laughter_cF receiver_M', 'laughter_cF laughter_rM',
       'caller_F receiver_M', 'filler_cF', 'filler_cF receiver_M',
       'bc_rM caller_F', 'bc_rM filler_cF', 'bc_rM', 'bc_cF receiver_M',
       'filler_rM', 'bc_cF', 'filler_cF laughter_rM', 'bc_rM receiver_M',
       'end', 'caller_M', 'caller_M receiver_M', 'laughter_cM',
       'laughter_cM receiver_M', 'filler_cM', 'filler_cM receiver_M',
       'caller_M laughter_rM', 'bc_cM receiver_M', 'caller_M filler_rM',
       'bc_rM caller_M', 'laughter_cM laughter_rM', 'bc_cM filler_rM',
       'filler_rM receiver_M', 'filler_rM laughter_cM', 'bc_cF filler_rM',
       'filler_cM filler_rM', 'bc_rM filler_cM', 'bc_rM laughter_cF',
       'filler_cF filler_rM', 'laughter_rM receiver_M', 'receiver_F',
       'caller_F receiver_F', 'filler_rF', 'filler_cF filler_rF',
       'laughter_rF', 'filler_cF receive