# In [57]:
import pandas as pd
import os
import numpy as np
def read_file(f):
    '''Load a csv file into a pd DataFrame and parse its filename.

    The filename (not the full path) is split on "_": the first two pieces
    become vole1 and vole2, and the last piece, up to its first ".", becomes
    the day. The experiment is the text after the last "__" and before the
    first "_day" that follows it.

    Returns a (DataFrame, dict) tuple; the dict has the keys
    vole1 / vole2 / day / experiment.'''
    frame = pd.read_csv(f)
    fname = os.path.basename(f)

    # TODO: this filename parsing should be replaced with a header in each file
    tokens = fname.split('_')
    day_token = tokens[-1]
    tail = fname.split('__')[-1]
    info = {
        'vole1': tokens[0],
        'vole2': tokens[1],
        'day': day_token.split('.')[0],
        'experiment': tail.split('_day')[0],
    }
    return frame, info

def analyze_file(f):
    '''Read one csv file and analyze it.

    Returns a tuple of (the analyze_df result for the file's DataFrame,
    the filename info dict produced by read_file).'''
    frame, meta = read_file(f)
    summaries = analyze_df(frame)
    return summaries, meta
    
    
def analyze_df(df):
    '''Summarize a DataFrame of events.

    Returns a two-item list: [event counts dict, event latencies dict].'''
    return [get_event_counts(df), get_event_latencies(df)]

def get_event_counts(df):
    '''Count the rows carrying each unique value of the "event" column.

    Returns a dict mapping "<event>_count" to the number of matching rows,
    with keys in order of first appearance in the column.'''
    return {
        f'{name}_count': df[df.event == name].shape[0]
        for name in df.event.unique()
    }

def get_event_latencies(df):
    '''Get the mean and median latency (rounded to 5 places) for each event.

    Only events that have at least one non-missing value in the "latency"
    column are reported, so an event with no latencies in this DataFrame is
    left out even if it has latencies for other animals. (should fix this)

    Returns a dict with "<event>_latency_mean" and "<event>_latency_median"
    keys.'''
    has_latency = df.latency.notna()
    latencies = {}
    for name in df.loc[has_latency, 'event'].unique():
        # All rows for the event are selected; mean/median skip missing values.
        values = df.loc[df.event == name, 'latency']
        latencies[f'{name}_latency_mean'] = round(values.mean(), 5)
        latencies[f'{name}_latency_median'] = round(values.median(), 5)
    return latencies

def parse_file(file):
    '''Read a file and convert it to a long-format summary of counts, mean
    latencies, and median latencies.

    file --> path to one csv data file

    Returns a DataFrame with one row per metric. The columns are the keys of
    the filename info dict returned by analyze_file, followed by "metric",
    "value" and "file" (the file's name without its directory).
    A file that produces no metrics gives an empty DataFrame with those
    columns.'''
    fname = os.path.basename(file)
    dicts, info = analyze_file(file)

    info_cols = list(info)
    info_vals = [info[key] for key in info_cols]
    cols = info_cols + ['metric', 'value', 'file']

    rows = []
    for metrics in dicts:
        for metric, value in metrics.items():
            rows.append(info_vals + [metric, value, fname])

    if not rows:
        # An empty array is 1-D, which pandas rejects against several column
        # names; build the empty frame directly instead.
        return pd.DataFrame(columns=cols)

    # np.asarray turns the mixed text/number rows into one string array, so
    # every cell (including "value") ends up as text, as it always has here.
    return pd.DataFrame(data=np.asarray(rows), columns=cols)
    
def assemble_files(directory):
    '''Return a list of paths to the files to parse.

    directory --> directory to search, including all of its sub-directories

    A file is included when its name ends in ".csv" and does not contain
    "summary". Paths are built from `directory` as it was given, so a
    relative input gives relative paths. Directories are visited top-down in
    name order, and the files inside each one are sorted by name.

    Raises FileNotFoundError if `directory` does not exist and
    NotADirectoryError if it is not a directory.'''
    # The working directory is deliberately left alone: changing it made a
    # relative `directory` resolve against itself in the walk below.
    if not os.path.exists(directory):
        raise FileNotFoundError(f'no such directory: {directory!r}')
    if not os.path.isdir(directory):
        raise NotADirectoryError(f'not a directory: {directory!r}')

    out_names = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        out_names.extend(
            os.path.join(root, f)
            for f in sorted(files)
            if f.endswith('.csv') and 'summary' not in f
        )
    return out_names

def parse_directory(dir):
    '''Read all files in a dir, and combine them into one large output
    dataframe.

    dir --> directory to search (path); the file list comes from
            assemble_files

    Returns the parse_file result for every file, stacked in file order.
    Each file keeps its own row index, so index values repeat across files.

    Raises IndexError if there are no files to parse.'''
    files = assemble_files(dir)
    if not files:
        # Same exception type the old files[0] lookup gave, now with a message.
        raise IndexError(f'no .csv files to parse under {dir!r}')

    # DataFrame.append was removed in pandas 2.0; one concat over all files
    # also avoids re-copying the accumulated frame for every file.
    return pd.concat([parse_file(file) for file in files])

def summarize_directory(dir, output_file_name = None, output_file_dir = None):
    '''Use this to parse a directory and save the result to a csv file. Will
    search deeply within a directory (IE a directory of directories.)

    dir              --> input directory (path)
    output_file_name --> name for the summary file. None or an empty string
                         means the default, "summary.csv" (string)
    output_file_dir  --> where to save the summary. Default (None) is the
                         same dir as the input (path)

    Returns nothing; the combined DataFrame from parse_directory is written
    to output_file_dir/output_file_name, row index included.'''
    # Identity check: None is the only value that means "not given" here.
    if output_file_dir is None:
        output_file_dir = dir
    if not output_file_name:
        output_file_name = 'summary.csv'
    output_file_path = os.path.join(output_file_dir, output_file_name)

    out_df = parse_directory(dir)
    out_df.to_csv(output_file_path)
    

# In [59]:
# Summarize every data csv under this folder; with the default arguments the
# result is saved as "summary.csv" inside the same folder.
summarize_directory(dir='/Users/davidprotter/Downloads/6_06_operant_data/6_06_contingent_train_1side_day_4/')