In [None]:
#cleaning up mmia transcripts 
# 3/12/24 Emily Doherty ###
import pandas as pd
import os
from datetime import datetime, timedelta

#### NOTES
# Still need MMIA recordings for sessions 105 and 113
# Agent responses are present starting with session 104
# JIA agent timestamps are off for sessions 104-113
# JIA agent timestamps line up for sessions 114-117

In [None]:
def convert_to_time_format(timestamp_str, hours_offset=7):
    """Convert a server timestamp string into a 12-hour clock string.

    The timestamp is parsed, shifted back by ``hours_offset`` hours (the MMIA
    server runs 7 hours ahead), and rendered like ``'1:05 PM'`` with no
    leading zero on the hour.

    Parameters
    ----------
    timestamp_str : str
        Timestamp in one of the supported layouts:
        ``YYYY-MM-DDTHH:MM:SS.ffffffZ``, ``YYYY-MM-DDTHH:MM:SSZ``,
        ``YYYY-MM-DD HH:MM:SS.ffffff`` or ``YYYY-MM-DD HH:MM:SS``.
    hours_offset : int or float, optional
        Hours subtracted from the parsed timestamp. Defaults to 7.

    Returns
    -------
    str
        Clock time formatted as ``H:MM AM/PM``.

    Raises
    ------
    ValueError
        If ``timestamp_str`` matches none of the supported layouts.
    """
    supported_formats = (
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%d %H:%M:%S',
    )
    for fmt in supported_formats:
        try:
            timestamp_obj = datetime.strptime(timestamp_str, fmt)
            break
        except ValueError:
            continue
    else:
        raise ValueError("Timestamp format not supported")

    timestamp_obj -= timedelta(hours=hours_offset)  # Because mmia server is ahead 7 hours

    # '%-I' (hour without leading zero) is a platform-specific strftime
    # extension that fails on Windows, so the 12-hour value is computed by hand.
    hour_12 = timestamp_obj.hour % 12 or 12
    return f"{hour_12}:{timestamp_obj:%M} {timestamp_obj:%p}"

## Import timestamps of Knowledge Sharing and Brainstorming


In [None]:
# Load the recording-ID spreadsheet
mmiaID = pd.read_excel('/Users/emilydoherty/Desktop/JIA/JIA_recordingIDs.xlsx')

# Load the KS/BS start/stop tracker export and trim it down to the
# session id plus the four start/end columns.
file_path = '/Users/emilydoherty/Downloads/JIA_Transcripts_Tracker start_stop times - Tracker.csv'
tracker = pd.read_csv(file_path)
tracker = tracker.dropna(axis=1, how='all')            # columns with no values at all
tracker = tracker.drop(tracker.index[0])               # first data row
tracker = tracker.iloc[:, :-5]                         # last five columns
tracker = tracker.drop(columns=tracker.columns[1:5])   # second through fifth columns

# Name the first five remaining columns by position, one at a time.
for position, label in enumerate(['Session_ID', 'KS_START', 'KS_END', 'BS_START', 'BS_END']):
    tracker = tracker.rename(columns={tracker.columns[position]: label})
tracker['Session_ID'] = tracker['Session_ID'].astype('int64')

# Keep only sessions present in both tables
merged_df = mmiaID.merge(tracker, on='Session_ID', how='inner')


## Import RAW MMIA transcripts (first run pull_data_to_csv.py)

In [None]:
# Loop through the raw transcript CSVs, trim each one to its session's
# KS_START..BS_END window and write the result to the output directory.
csv_directory='/Users/emilydoherty/Desktop/JIA/raw_mmia'
output_csv_directory = '/Users/emilydoherty/Desktop/JIA/test'

columns_to_drop = [
    'chunk_link',
    'asr_mode',
    'sessionId',
    'class_id',
    'source',
    'recording_start_date',
    'chunk_start_date',
    'amr',
    'CoBi_Model']
#not sure what utterance_id is?


def _to_clock_time(clock_str):
    """Parse a clock string such as '9:05 AM' or '13:05' into a datetime.time.

    Raises ValueError when the text matches none of the accepted layouts.
    """
    text = str(clock_str).strip()
    for fmt in ('%I:%M %p', '%I:%M:%S %p', '%H:%M', '%H:%M:%S'):
        try:
            return datetime.strptime(text, fmt).time()
        except ValueError:
            continue
    raise ValueError(f"Unrecognised clock time: {clock_str!r}")


os.makedirs(output_csv_directory, exist_ok=True)
for filename in sorted(os.listdir(csv_directory)):
    if not filename.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(csv_directory, filename))
    df = df.drop(columns=columns_to_drop)
    # Rows without text are kept on purpose: jia agent rows have no text populated.
    df['new_timestamp'] = df['timestamp'].apply(convert_to_time_format)

    if df.empty:
        print(f"Skipping {filename}: no rows")
        continue
    recording_id = df['recordingId'].iloc[0]  # Get recording ID from the CSV file

    # Look up the KS/BS window for this recording; the merge is an inner join,
    # so a recording without a tracker row has no match here.
    session_rows = merged_df.loc[merged_df['Recording_ID'] == recording_id]
    if session_rows.empty:
        print(f"Skipping {filename}: recording {recording_id} not found in merged_df")
        continue

    # Compare real time-of-day values. Comparing the clock strings directly is
    # lexicographic, so e.g. '10:15 AM' would sort before '9:55 AM'.
    # NOTE(review): assumes the tracker stores times like '9:05 AM' (or 24-hour
    # 'HH:MM') in the same local time zone as new_timestamp - confirm.
    ks_start = _to_clock_time(session_rows['KS_START'].iloc[0])
    bs_end = _to_clock_time(session_rows['BS_END'].iloc[0])
    clock_times = df['new_timestamp'].apply(_to_clock_time)
    in_window = clock_times.map(lambda t: ks_start <= t <= bs_end)
    df = df[in_window]

    # Save the filtered DataFrame under the same file name in the output directory
    output_filename = os.path.join(output_csv_directory, filename)
    df.to_csv(output_filename, index=False)

In [None]:
#analyze transcripts 

#4 jia sessions with agent responses w/ correct timestamps
id_114 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e1169a7f935400089d241c.csv'
id_115 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e26aa2e898da000895f84d.csv'
id_116 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e37d88ca5a1700083fa08c.csv'
id_117= '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65ea5370a718610008c52ed2.csv'
ids = [id_114, id_115, id_116, id_117]

# Columns summarised with value counts / with a mean, respectively
count_columns = ['speaker', 'content_word', 'jia_agent_response', 'jia_dialogue_state']
average_columns = ['CoBi_Community', 'CoBi_Respect', 'CoBi_Thinking',
                   'cps_maintain', 'cps_neg', 'cps_comm']

# The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the kernel session.
for transcript_path in ids:
    transcript = pd.read_csv(transcript_path).sort_values(by=['timestamp'])  #sort by timestamp

    #counts of interest
    column_counts = {column: transcript[column].value_counts() for column in count_columns}
    print(f"Counts for {transcript_path}")
    print(column_counts)

    # column averages
    column_averages = {column: transcript[column].mean() for column in average_columns}
    print(f"Averages for {transcript_path}")
    print(column_averages)
    

In [53]:

ids = [
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e1169a7f935400089d241c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e26aa2e898da000895f84d.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e37d88ca5a1700083fa08c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65ea5370a718610008c52ed2.csv'
]

# Define the column of interest
column_of_interest = 'jia_agent_response'

# Build one table of value counts: one row per distinct response, one column
# per session. Sessions are labelled 114, 115, ... in the order of `ids`.
# `None` (rather than an `.empty` check) marks "nothing merged yet", so a
# session with zero responses still keeps its column instead of being replaced.
all_counts_df = None
for session_id, id_path in enumerate(ids, start=114):
    transcript = pd.read_csv(id_path).sort_values(by=['timestamp'])  # Sort by timestamp
    counts = (
        transcript[column_of_interest]
        .value_counts()
        .rename_axis(column_of_interest)
        .reset_index(name=session_id)
    )
    if all_counts_df is None:
        all_counts_df = counts
    else:
        all_counts_df = pd.merge(all_counts_df, counts, how='outer', on=column_of_interest)

if all_counts_df is None:
    # No files listed: show an empty table rather than failing below
    all_counts_df = pd.DataFrame(columns=[column_of_interest])

# A response missing from a session counts as 0; keep the counts as integers
# (the outer merge turns them into floats wherever it introduces NaN).
session_columns = [col for col in all_counts_df.columns if col != column_of_interest]
if session_columns:
    all_counts_df[session_columns] = all_counts_df[session_columns].fillna(0).astype('int64')
display(all_counts_df)


Unnamed: 0,jia_agent_response,114,115,116,117
0,Right now I only hear one person in the group ...,108,103,62,131
1,It seems like the conversation has stopped. T...,52,51,45,79
2,Your group seems to be having a content issue....,24,24,25,7
3,There's a continuous circular talk issue here....,4,4,2,4


In [51]:
columns_of_interest = ['CoBi_Community', 'CoBi_Respect', 'CoBi_Thinking', 'cps_maintain', 'cps_neg', 'cps_comm']
row_labels = [f'{column}_avg' for column in columns_of_interest]

# One row per metric, one column per session. Relies on `ids` (the list of
# filtered transcript paths) defined in an earlier cell; sessions are labelled
# 114, 115, ... in the order of `ids`.
# Each transcript is read once and all of its means are taken together.
session_averages = {}
for session_id, id_path in enumerate(ids, start=114):
    transcript = pd.read_csv(id_path)
    session_averages[session_id] = pd.Series(
        {f'{column}_avg': transcript[column].mean() for column in columns_of_interest}
    )

# Column labels come from the loop above, so the table stays valid if the
# number of entries in `ids` changes.
all_averages_df = pd.DataFrame(session_averages, index=row_labels)
all_averages_df = all_averages_df.fillna(0)
display(all_averages_df)


Unnamed: 0,114,115,116,117
CoBi_Community_avg,0.079333,0.100047,0.10002,0.094527
CoBi_Respect_avg,0.078645,0.056223,0.039591,0.071182
CoBi_Thinking_avg,0.049018,0.045245,0.060125,0.046092
cps_maintain_avg,38.465149,46.746881,54.974546,38.276985
cps_neg_avg,38.607507,43.012825,41.471481,39.404959
cps_comm_avg,38.673685,46.132833,51.349481,34.521913
