This notebook maps common ground annotations to utterance IDs retrieved from the oracle transcripts provided by CSU.
Edit the variables to match your paths and which groups you are running. 
utteranceDict is a dictionary of Groups.
    Each group is a dictionary of utteranceIDs.
        Each utterance is a dictionary of Start and End times and the associated CGA annotation labels.
Example: {'Group_01': {'Group_01_0': {'Start': 3.45, 'End': 7.14, 'CGA': ['S0']}}}

Note: an utterance ID can carry multiple CGA labels, and the same CGA label can appear under multiple utterance IDs. This happens because utterances can overlap in time and a CGA annotation's time span can overlap more than one utterance.

In [7]:
import numpy as np
import os
import csv
import math
import statistics
import pandas as pd

In [8]:
# Groups to process in this run.
groups = ["Group_01"]

# Placeholder group name embedded in the path templates below; it is swapped
# for the real group name with str.replace when a file is loaded.
dummy_group = "Group_00"

# Path templates for the oracle transcript CSV and the CGA annotation CSV.
data_root = r"C:\Users\bradf\OneDrive - Colostate\Research\Initial Observations for Fib Weights\Data\Weights Task Dataset"
oracle_csv_naming = fr"{data_root}\{dummy_group}\{dummy_group}_Oracle.csv"
cga_csv_naming = fr"{data_root}\{dummy_group}\{dummy_group}_CGA.csv"

# Utterance dictionary: one empty sub-dictionary per group, filled in later.
utteranceDict = {name: {} for name in groups}


In [9]:
def mapStatementtoUtterance(group, cga_csv=None, utterance_dict=None):
    """Attach CGA annotation labels to every utterance they overlap in time.

    Reads the group's CGA annotation CSV and, for each annotation row, appends
    the label (the text before the first ":" in the "Common Ground" column) to
    the "CGA" list of every utterance whose [Start, End] interval overlaps the
    annotation's interval. The overlap test is strict, so intervals that only
    touch at an endpoint do not count as overlapping.

    Parameters
    ----------
    group : str
        Key of the group in the utterance dictionary; also substituted for
        ``dummy_group`` in ``cga_csv_naming`` when ``cga_csv`` is not given.
    cga_csv : str or file-like, optional
        Path or buffer passed to ``pd.read_csv``. Defaults to the path built
        from ``cga_csv_naming``.
    utterance_dict : dict, optional
        Dictionary to update in place. Defaults to the notebook-level
        ``utteranceDict``.

    Raises
    ------
    KeyError
        If ``group`` is not in the utterance dictionary or an expected column
        is missing from the CSV.
    """
    if cga_csv is None:
        cga_csv = cga_csv_naming.replace(dummy_group, group)
    if utterance_dict is None:
        utterance_dict = utteranceDict

    label_col = "Common Ground"
    start_col = "Begin Time - ss.msec"
    end_col = "End Time - ss.msec"

    # Rows with a missing label or time cannot be mapped to an utterance, so
    # drop them instead of crashing on `.split` of a NaN label.
    cga_df = pd.read_csv(cga_csv).dropna(subset=[label_col, start_col, end_col])

    utterances = utterance_dict[group]

    # Give every utterance a "CGA" list up front so the schema is consistent
    # even when the annotation file has no rows. Existing lists are kept.
    for info in utterances.values():
        info.setdefault("CGA", [])

    # Iterate values positionally; this does not depend on the frame's index.
    for raw_label, raw_start, raw_end in zip(
        cga_df[label_col], cga_df[start_col], cga_df[end_col]
    ):
        label = str(raw_label).split(":")[0]
        start = float(raw_start)
        end = float(raw_end)
        for info in utterances.values():
            if info["Start"] < end and info["End"] > start:
                info["CGA"].append(label)
        

In [10]:
def getUtteranceDictionary(group, oracle_csv=None, utterance_dict=None):
    """Load a group's oracle transcript and record each utterance's time span.

    For every row of the oracle CSV, stores
    ``{"Start": <float>, "End": <float>}`` under the key
    ``f"{group}_{utterance_number}"`` in the group's sub-dictionary.

    Parameters
    ----------
    group : str
        Group name; used as the dictionary key, as the utterance-ID prefix,
        and substituted for ``dummy_group`` in ``oracle_csv_naming`` when
        ``oracle_csv`` is not given.
    oracle_csv : str or file-like, optional
        Path or buffer passed to ``pd.read_csv``. Defaults to the path built
        from ``oracle_csv_naming``.
    utterance_dict : dict, optional
        Dictionary to update in place. Defaults to the notebook-level
        ``utteranceDict``.

    Raises
    ------
    KeyError
        If the CSV lacks an "Utterance", "Start" or "End" column.
    """
    if oracle_csv is None:
        oracle_csv = oracle_csv_naming.replace(dummy_group, group)
    if utterance_dict is None:
        utterance_dict = utteranceDict

    oracle_df = pd.read_csv(oracle_csv)

    # Create the group's entry if the caller did not pre-seed it.
    group_utterances = utterance_dict.setdefault(group, {})

    # Pair each utterance number with the Start/End values from the SAME row.
    # Using the utterance number as a row index is only correct when the
    # numbers happen to be 0..n-1 in file order.
    for num, start, end in zip(
        oracle_df["Utterance"], oracle_df["Start"], oracle_df["End"]
    ):
        group_utterances[f"{group}_{num}"] = {"Start": float(start), "End": float(end)}


In [11]:
# Order matters: getUtteranceDictionary creates each group's utterance entries,
# and mapStatementtoUtterance then iterates those entries to attach CGA labels.
for group in groups:
    getUtteranceDictionary(group)
    mapStatementtoUtterance(group)
# Prints the complete nested dictionary for every group in `groups`.
print(utteranceDict)

{'Group_01': {'Group_01_0': {'Start': 3.45, 'End': 7.14, 'CGA': []}, 'Group_01_1': {'Start': 9.87, 'End': 12.45, 'CGA': []}, 'Group_01_2': {'Start': 12.63, 'End': 15.75, 'CGA': []}, 'Group_01_3': {'Start': 16.2, 'End': 21.36, 'CGA': []}, 'Group_01_4': {'Start': 22.17, 'End': 24.27, 'CGA': []}, 'Group_01_5': {'Start': 24.42, 'End': 27.72, 'CGA': ['S024']}, 'Group_01_6': {'Start': 29.08, 'End': 29.83, 'CGA': ['ACCEPT(S024)']}, 'Group_01_7': {'Start': 29.84, 'End': 31.32, 'CGA': []}, 'Group_01_8': {'Start': 31.44, 'End': 34.56, 'CGA': []}, 'Group_01_9': {'Start': 35.01, 'End': 36.99, 'CGA': []}, 'Group_01_10': {'Start': 37.83, 'End': 47.82, 'CGA': []}, 'Group_01_11': {'Start': 48.27, 'End': 52.25, 'CGA': []}, 'Group_01_12': {'Start': 52.58, 'End': 54.84, 'CGA': []}, 'Group_01_13': {'Start': 54.93, 'End': 58.29, 'CGA': []}, 'Group_01_14': {'Start': 58.33, 'End': 58.84, 'CGA': []}, 'Group_01_15': {'Start': 59.22, 'End': 60.92, 'CGA': []}, 'Group_01_16': {'Start': 60.66, 'End': 62.49, 'CGA':