In [39]:
#cleaning up mmia transcripts 
# 3/12/24 Emily Doherty ###
import pandas as pd
import os
from datetime import datetime, timedelta

####NOTES
#Need MMIA recordings for 105 + 113
#agent responses starting with session 104
#timestamps of JIA agent are off for 104-113
#agent timestamps line up for 114-117

In [40]:
def convert_to_time_format(timestamp_str):
    try:
        timestamp_obj = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        try:
            timestamp_obj = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%SZ')
        except ValueError:
            try:
                timestamp_obj = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S.%f')
            except ValueError:
                try:
                    timestamp_obj = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    raise ValueError("Timestamp format not supported")
    timestamp_obj -= timedelta(hours=7) ###Because mmia server is ahead 7 hours
    # Format the datetime object to display only the time in HH:MM format
    time_format = timestamp_obj.strftime('%-I:%M %p')  # '%-I' removes leading zero for hour
    
    return time_format

## Import timestamps of Knowledge Sharing and Brainstorming


In [41]:
#import recordingID info
mmiaID=pd.read_excel('/Users/emilydoherty/Desktop/JIA/JIA_recordingIDs.xlsx')

#import KS/BS timestamps
file_path='/Users/emilydoherty/Downloads/JIA_Transcripts_Tracker start_stop times - Tracker.csv'
tracker=pd.read_csv(file_path)
tracker.dropna(axis=1, how='all', inplace=True)
tracker = tracker.drop(tracker.index[0])
tracker = tracker.iloc[:, :-5]
tracker.drop(tracker.columns[1:5], axis=1, inplace=True)
tracker = tracker.rename(columns={tracker.columns[0]: 'Session_ID'})
tracker = tracker.rename(columns={tracker.columns[1]: 'KS_START'})
tracker = tracker.rename(columns={tracker.columns[2]: 'KS_END'})
tracker = tracker.rename(columns={tracker.columns[3]: 'BS_START'})
tracker = tracker.rename(columns={tracker.columns[4]: 'BS_END'})
tracker['Session_ID']=tracker['Session_ID'].astype('int64')

#merge the two
merged_df = pd.merge(mmiaID, tracker, on='Session_ID', how='inner')


## Import RAW MMIA transcripts (first run pull_data_to_csv.py)

In [42]:
#loop thru transcript files 
csv_directory='/Users/emilydoherty/Desktop/JIA/raw_mmia_v2'
output_csv_directory = '/Users/emilydoherty/Desktop/JIA/test'

os.makedirs(output_csv_directory, exist_ok=True)
for filename in os.listdir(csv_directory):
    if filename.endswith(".csv"):
        df = pd.read_csv(os.path.join(csv_directory, filename))
        columns_to_drop = [
            'chunk_link',
            'asr_mode',
            'sessionId',
            'class_id',
            'source',
            'recording_start_date',
            'chunk_start_date',
            'amr',
            'CoBi_Model']
        #not sure what utterance_id is?
        df.drop(columns_to_drop, axis=1, inplace=True)
        # df.dropna(subset=['text'], inplace=True) #dont want to drop rows w/o text bc jia agent doesnt have text populated
        df['new_timestamp'] = df['timestamp'].apply(convert_to_time_format)
        first_value_index = df['recordingId'].first_valid_index()
        recording_id = df.loc[first_value_index, 'recordingId']
        # Filter the rows based on KS_Start and BS_End for the specific recording ID
        ks_start = merged_df.loc[merged_df['Recording_ID'] == recording_id, 'KS_START'].iloc[0]
        bs_end = merged_df.loc[merged_df['Recording_ID'] == recording_id, 'BS_END'].iloc[0]
        df = df[(df['new_timestamp'] >= ks_start) & (df['new_timestamp'] <= bs_end)]

        # Save the filtered DataFrame back to the CSV file
        output_filename = os.path.join(output_csv_directory, filename)
        df.to_csv(output_filename, index=False)

In [43]:
#analyze transcripts 
folder = '/Users/emilydoherty/Desktop/JIA/test'
for filename in os.listdir(folder):
    if filename.endswith(".csv"):
        file = pd.read_csv(os.path.join(folder, filename))
# #4 jia sessions with agent responses w/ correct timestamps
# id_114 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e1169a7f935400089d241c.csv'
# id_115 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e26aa2e898da000895f84d.csv'
# id_116 = '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e37d88ca5a1700083fa08c.csv'
# id_117= '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65ea5370a718610008c52ed2.csv'
# ids = [id_114, id_115, id_116, id_117]
# for id in ids:
    # file = pd.read_csv(id)
        # file=pd.read_csv{filename}
        file = file.sort_values(by=['timestamp']) #sort by timestamp
        #counts of interest
        column_counts = {}
        columns = ['speaker', 'content_word', 'jia_agent_response', 'jia_dialogue_state']
        for column in columns:
            column_counts[column] = file[column].value_counts()
        # print(f"Counts for {id}")
        print(f"Counts for {filename}")
        print(column_counts)
        # column averages
        column_averages = {}
        columns = ['CoBi_Community', 'CoBi_Respect', 'CoBi_Thinking',
            'cps_maintain', 'cps_neg', 'cps_comm']
        for column in columns:
            column_averages[column] = file[column].mean()
        # print(f"Averages for {id}")
        print(f"Averages for {filename}")
        print(column_averages) 
    

Counts for isat_data_mmia_jia_agent_makecode-65d5367d6b93ad0008b00d45.csv
{'speaker': speaker
220          61
221          43
undefined     1
Name: count, dtype: int64, 'content_word': content_word
environmental    4
sensors          4
sensor           4
blocks           2
block            2
Name: count, dtype: int64, 'jia_agent_response': jia_agent_response
It seems like the conversation has stopped.  Try throwing some ideas out there.           136
Right now I only hear one person in the group talking.  Let's bring in some new ideas.    126
Your group seems to be having a content issue.  Have you checked the related reading?      18
Name: count, dtype: int64, 'jia_dialogue_state': jia_dialogue_state
Circular_Talk_Collaboration_Issue_No_Speakers    136
Circular_Talk_Collaboration_Issue_One_Speaker    126
Flow                                              26
Circular_Talk_Content                             18
Name: count, dtype: int64}
Averages for isat_data_mmia_jia_agent_makecode-65d

In [44]:

ids = [
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e1169a7f935400089d241c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e26aa2e898da000895f84d.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e37d88ca5a1700083fa08c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65ea5370a718610008c52ed2.csv'
]


column_of_interest = 'jia_agent_response'

all_counts_df = pd.DataFrame()

for idx, id_path in enumerate(ids, start=114):
    file = pd.read_csv(id_path)
    file = file.sort_values(by=['timestamp'])  # Sort by timestamp
    counts = file[column_of_interest].value_counts().reset_index()
    counts.columns = [column_of_interest, idx]
    if all_counts_df.empty:
        all_counts_df = counts
    else:
        all_counts_df = pd.merge(all_counts_df, counts, how='outer', on=column_of_interest)
all_counts_df.fillna(0, inplace=True)
display(all_counts_df)


Unnamed: 0,jia_agent_response,114,115,116,117
0,Right now I only hear one person in the group ...,108,103,62,131
1,It seems like the conversation has stopped. T...,52,51,45,79
2,Your group seems to be having a content issue....,24,24,25,7
3,There's a continuous circular talk issue here....,4,4,2,4


In [45]:
import os
import pandas as pd

folder_path = '/Users/emilydoherty/Desktop/JIA/test'  
column_of_interest = 'jia_dialogue_state'#'content_word' 'jia_agent_response'

all_counts_df = pd.DataFrame()

for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        file = pd.read_csv(file_path)
        file = file.sort_values(by=['timestamp'])  # Sort by timestamp
        counts = file[column_of_interest].value_counts().reset_index()
        counts.columns = [column_of_interest, file_name.split('-')[-1][:-4]]
        if all_counts_df.empty:
            all_counts_df = counts
        else:
            all_counts_df = pd.merge(all_counts_df, counts, how='outer', on=column_of_interest) 

all_counts_df.fillna(0, inplace=True)

recordingID_to_sessionID = dict(zip(mmiaID['Recording_ID'], mmiaID['Session_ID']))
all_counts_df = all_counts_df.rename(columns=lambda x: recordingID_to_sessionID.get(x, x))
# all_counts_df= pd.options.display.float_format = '{:,.0f}'.format

display(all_counts_df)

Unnamed: 0,jia_dialogue_state,110,104,111,112,107,106,115,116,117,114,108,109
0,Circular_Talk_Collaboration_Issue_No_Speakers,136,0,112,34,0,0,51,45,79,52,0,0
1,Circular_Talk_Collaboration_Issue_One_Speaker,126,0,105,90,0,0,103,62,131,108,0,0
2,Flow,26,0,85,96,0,0,69,50,77,35,0,0
3,Circular_Talk_Content,18,0,20,21,0,0,24,25,7,24,0,0
4,STUCK,0,143,0,0,31,48,0,0,0,0,63,41
5,TALKING_NO_TEXT,0,86,0,0,236,118,0,0,0,0,132,120
6,Circular_Talk_Collaboration_Issue,0,0,7,2,0,0,4,2,4,4,0,0


In [49]:
del column_of_interest
ids = [
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e1169a7f935400089d241c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e26aa2e898da000895f84d.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65e37d88ca5a1700083fa08c.csv',
    '/Users/emilydoherty/Desktop/JIA/test/isat_data_mmia_jia_agent_makecode-65ea5370a718610008c52ed2.csv'
]

columns_of_interest = ['CoBi_Community', 'CoBi_Respect', 'CoBi_Thinking', 'cps_maintain', 'cps_neg', 'cps_comm']
all_averages_df = pd.DataFrame()
for column_of_interest in columns_of_interest:
    averages = []
    for idx, id_path in enumerate(ids, start=114):
        file = pd.read_csv(id_path)
        file = file.sort_values(by=['timestamp'])
        file = file.drop_duplicates(subset='chunk_num', keep='first') #because these annotations are on chunk basis, remove duplicates per chunk_num
        average = file[column_of_interest].mean()
        averages.append(average)
    average_df = pd.DataFrame({f'{column_of_interest}_avg': averages})
    if all_averages_df.empty:
        all_averages_df = average_df
    else:
        all_averages_df = pd.concat([all_averages_df, average_df], axis=1)

all_averages_df = all_averages_df.T
all_averages_df.columns = range(114, 118)
all_averages_df.fillna(0, inplace=True)
pd.options.display.float_format = '{:.3f}'.format

display(all_averages_df)


Unnamed: 0,114,115,116,117
CoBi_Community_avg,0.049,0.089,0.058,0.082
CoBi_Respect_avg,0.079,0.058,0.045,0.074
CoBi_Thinking_avg,0.045,0.037,0.049,0.035
cps_maintain_avg,32.835,37.159,40.033,28.677
cps_neg_avg,33.506,35.664,30.415,31.328
cps_comm_avg,31.699,37.424,37.05,25.022


In [50]:
import os
import pandas as pd

folder_path = '/Users/emilydoherty/Desktop/JIA/test'  # Folder containing CSV files
columns_of_interest = ['CoBi_Community', 'CoBi_Respect', 'CoBi_Thinking', 'cps_maintain', 'cps_neg', 'cps_comm']
all_averages_df = pd.DataFrame()

for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        averages = []
        # Iterate over columns of interest
        for column_of_interest in columns_of_interest:
            file = pd.read_csv(file_path)
            file = file.sort_values(by=['timestamp'])
            file = file.drop_duplicates(subset='chunk_num', keep='first') # Remove duplicates per chunk_num
            average = file[column_of_interest].mean()
            averages.append(average)
        # average_df = pd.DataFrame({file_name[:-4]: averages})
        average_df = pd.DataFrame({file_name.split('-')[-1][:-4]: averages})
        # average_df = pd.DataFrame({f'{column_of_interest}_avg': averages})
        if all_averages_df.empty:
            all_averages_df = average_df
        else:
            all_averages_df = pd.concat([all_averages_df, average_df], axis=1)

#Rename the rows
all_averages_df = all_averages_df.rename(index={i: col for i, col in enumerate(columns_of_interest)})
all_averages_df.fillna(0, inplace=True)

recordingID_to_sessionID = dict(zip(mmiaID['Recording_ID'], mmiaID['Session_ID']))
# Replace column titles with matching Session IDs
all_averages_df = all_averages_df.rename(columns=lambda x: recordingID_to_sessionID.get(x, x))
display(all_averages_df)


Unnamed: 0,110,104,111,112,107,106,115,116,117,114,108,109
CoBi_Community,0.052,0.055,0.086,0.106,0.125,0.083,0.089,0.058,0.082,0.049,0.092,0.092
CoBi_Respect,0.036,0.059,0.053,0.077,0.084,0.063,0.058,0.045,0.074,0.079,0.062,0.093
CoBi_Thinking,0.03,0.019,0.025,0.044,0.061,0.028,0.037,0.049,0.035,0.045,0.042,0.028
cps_maintain,19.704,18.369,25.083,42.729,42.467,33.583,37.159,40.033,28.677,32.835,28.517,36.405
cps_neg,19.853,19.701,23.554,37.845,45.056,35.822,35.664,30.415,31.328,33.506,26.999,34.233
cps_comm,19.59,19.16,25.148,42.375,41.124,32.614,37.424,37.05,25.022,31.699,30.162,33.489
