# This script extracts words from the csv 
## to be used to calculate: 
- words per minute
- total words
- variance of words across match 
- variance of words across speakers
    

### *function to get cleaned words list per transcript and count the number of words*

In [None]:
def get_cleaned_words_list(transcript):
    """Return every word of one transcript csv, minus the ignored tokens.

    Parameters
    ----------
    transcript : str
        Path of the transcript csv. The sentence text is read from the
        fourth column (position 3).

    Returns
    -------
    list of str
        The whitespace-separated words of all sentences, in file order,
        without the tokens listed in ``ignored_tokens``.
    """
    import pandas as pd

    # import transcript of interest
    coded_transcripts_df = pd.read_csv(transcript)

    # Blank sentence cells are read as NaN. They are dropped before the cast
    # to str, otherwise each blank line would be counted as the word "nan".
    sentences = coded_transcripts_df.iloc[:, 3].dropna().astype(str)

    # tokens left behind by '(inaudible)' / '(inaduible cross talk)' markers,
    # plus the word 'like'; none of these are counted as words
    ignored_tokens = {"(inaudible)", "(inaduible", "cross", "talk)", "like"}

    cleaned_word_list = []

    # go through each sentence and keep every word that is not ignored
    for sentence in sentences:
        cleaned_word_list.extend(
            word for word in sentence.split() if word not in ignored_tokens
        )

    return cleaned_word_list

### *function to get team number*

In [None]:
def get_team_number_from_csv(transcript):
    """Return the team number contained in a transcript file name.

    The file name (last path component, extension removed) is split on "-"
    and the second piece is returned, e.g. ``...\\team-22.csv`` gives ``'22'``.

    Parameters
    ----------
    transcript : str
        Path of the transcript file. Any directory depth is accepted, with
        either backslash or forward-slash separators.

    Returns
    -------
    str
        The team number as text (callers convert it with ``int``).

    Raises
    ------
    IndexError
        If the file name contains no "-".
    """
    # ntpath treats both "\\" and "/" as separators on every operating system,
    # so the file name is found without assuming a fixed folder depth
    import ntpath

    file_name = ntpath.basename(transcript)
    team_number_file_name = ntpath.splitext(file_name)[0]

    name_parts = team_number_file_name.split("-")

    return name_parts[1]


### *import the team log csv*

In [None]:
import pandas as pd

# load the team log from a hard-coded local path; later cells merge it with
# the per-team word statistics on the 'team_number' column
log_of_team_data_df = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\log-of-team-data.csv')
# NOTE(review): head() is not the last expression of this cell, so its preview
# is presumably not shown in the notebook output - confirm
log_of_team_data_df.head()
# print the dtype of every column of the team log
print(log_of_team_data_df.dtypes)

### *run the functions over all the cleaned transcripts and merge output with team log*

In [None]:
import glob

# go through the files in the directory
# (every .csv in the hard-coded recoded-transcripts folder)
transcripts = glob.glob(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\*.csv')


#print('Files in folder:',transcripts)
#print("")
# two parallel lists: position i of 'team_number' and of 'total_words'
# belong to the same transcript file
team_num_and_total_words_dict = {'team_number': [], 'total_words': []}

# loop over each transcript
for transcript in transcripts: 
    #print(transcript)
    
    # get the cleaned words list
    cleaned_words = get_cleaned_words_list(transcript)
    
    # count the total number of cleaned words
    total_cleaned_words = len(cleaned_words)
    
    # add the total number of cleaned words to the dictionary
    team_num_and_total_words_dict['total_words'].append(total_cleaned_words)
    
    # then get the corresponding team number
    # (parsed from the file path, returned as text)
    team_number = get_team_number_from_csv(transcript)
    #print(team_number)
    
    # and add the corresponding team number to the dictionary
    # (stored as int; the later merge on 'team_number' presumably needs a
    # numeric key - confirm against the team log dtypes)
    team_num_and_total_words_dict['team_number'].append(int(team_number))


    # progress output: one count per transcript, followed by a blank line
    print('total words = ', total_cleaned_words)
    print("")

# show the filled dictionary as the cell output
team_num_and_total_words_dict


convert the dictionary with team number and total words to a dataframe

In [None]:
import pandas as pd

# convert the dictionary to dataframe
# (the keys 'team_number' and 'total_words' become the two columns)
team_num_and_total_words_df = pd.DataFrame.from_dict(team_num_and_total_words_dict)

# show the dataframe as the cell output
team_num_and_total_words_df
#print(team_num_and_total_words_df.dtypes)

In [None]:
# merge the dataframes on the team_number column 
# NOTE(review): no `how` argument is passed, so pandas' default join type is
# used; confirm that teams present in only one of the two frames may be dropped
merged_df = pd.merge(log_of_team_data_df, team_num_and_total_words_df, on = 'team_number')

# show the merged dataframe as the cell output
merged_df

## this block converts each timestamp (minutes:seconds:fraction) to a minute index

In [None]:
def get_minute_index(transcript):
    """Add 'time_object' and 'minute_index' columns to a transcript.

    The 'Timestamp' column is parsed with the format '%M:%S:%f'. The
    dataframe is changed in place and also returned.
    """
    # parse once; the date part of the resulting datetimes carries no meaning
    parsed_timestamps = pd.to_datetime(transcript['Timestamp'], format='%M:%S:%f')

    transcript['time_object'] = parsed_timestamps
    # the minute component is used as the index of the minute of the match
    transcript['minute_index'] = parsed_timestamps.dt.minute

    return transcript


In [None]:
from datetime import datetime 
import pandas as pd
# try get_minute_index on a single transcript (team 22, hard-coded path)
transcript = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-22.csv')
transcript.head()

# adds the 'time_object' and 'minute_index' columns to `transcript` in place;
# the returned dataframe is the last expression of the cell
get_minute_index(transcript)

## Function to extract single words for wpm calculation

In [None]:
def extract_cleaned_words(coded_transcripts_df):
    """Count the words of a transcript dataframe, minus the ignored tokens.

    Parameters
    ----------
    coded_transcripts_df : pandas.DataFrame
        Transcript rows. The sentence text is read from the fourth column
        (position 3). The frame may be empty.

    Returns
    -------
    int
        Number of whitespace-separated words over all sentences, not
        counting the tokens listed in ``ignored_tokens``.
    """
    # Blank sentence cells are NaN and have no .split(); drop them, then cast
    # to str so every remaining value can be split into words.
    sentences = coded_transcripts_df.iloc[:, 3].dropna().astype(str)

    # tokens left behind by '(inaudible)' / '(inaduible cross talk)' markers,
    # plus the word 'like'; none of these are counted as words
    ignored_tokens = {"(inaudible)", "(inaduible", "cross", "talk)", "like"}

    # go through each sentence and count every word that is not ignored
    return sum(
        1
        for sentence in sentences
        for word in sentence.split()
        if word not in ignored_tokens
    )

## Function to get word utterances each minute

In [None]:
def get_words_each_minute(transcript):
    """Return the cleaned word count of every minute of a transcript.

    The transcript must already have a 'minute_index' column. The result has
    one entry per minute from 0 up to the largest minute index; its position
    in the list is the minute it belongs to.
    """
    minute_column = transcript['minute_index']
    last_minute = max(pd.unique(minute_column))

    # the rows of each minute are handed to extract_cleaned_words in turn;
    # a minute without rows yields an empty selection
    return [
        extract_cleaned_words(transcript[minute_column == minute])
        for minute in range(last_minute + 1)
    ]

In [None]:
## category utterance 

In [None]:
from datetime import datetime 
import pandas as pd
# load a single transcript (team 22, hard-coded path) to try the code counts on
transcript = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-22.csv')

# add the 'time_object' and 'minute_index' columns
transcript = get_minute_index(transcript)

# NOTE(review): this call needs get_code_utterance_each_minute to have been
# defined by a cell that ran before this one - confirm before running
get_code_utterance_each_minute(transcript)

In [None]:
# navigate to the category column
# NOTE(review): position 8 is assumed to hold the category codes - confirm
# against the csv layout
code_column = transcript.iloc[:, 8]

# extract the content of each column into a list
all_codes = list(code_column.values) 

# the distinct minute indexes that occur in the transcript
unique_minutes =  pd.unique(transcript['minute_index'])

In [None]:
# collects every 'opinion or analysis' code, minute by minute
opinion_analysis = []

# walk through the minutes from 0 up to the largest minute index
last_minute = max(unique_minutes)
for minute in range(last_minute + 1):
    # rows of the transcript that belong to this minute
    codes_this_minute = transcript[transcript['minute_index'] == minute]

    opinion_analysis.extend(
        code
        for code in codes_this_minute['rater1_1']
        if code == 'opinion or analysis'
    )

print(opinion_analysis)

In [None]:
def get_code_utterance_each_minute(transcript):
    """Count how often each category code is used in every minute.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Transcript with a 'minute_index' column and a 'rater1_1' column
        that holds the category code of each row.

    Returns
    -------
    dict
        Maps each category label (e.g. 'command', 'opinion or analysis')
        to a list of counts. Position ``m`` of a list is the number of rows
        in minute ``m`` carrying that label; minutes run from 0 up to the
        largest minute index. All lists are empty when the transcript has
        no minute index at all.
    """
    # the category labels as they are written in the 'rater1_1' column
    category_labels = (
        'command',
        'suggestion',
        'opinion or analysis',
        'observation',
        'question or inquiry',
        'answer',
        'agree or acknowledge',
        'disagree',
        'sharing intention',
        'emotional expression',
        'encouragement',
        'non-task related',
        'frustration',
        'humour or taunting',
        'apologies',
        'thanks or welcome',
        'misc',
    )

    counts_each_minute = {label: [] for label in category_labels}

    minute_indexes = transcript['minute_index'].dropna()
    if minute_indexes.empty:
        return counts_each_minute

    # extract the category counts per minute
    for minute in range(int(minute_indexes.max()) + 1):
        codes_this_minute = transcript.loc[
            transcript['minute_index'] == minute, 'rater1_1'
        ]

        # one count per label and per minute: the count starts again at zero
        # for every minute and is appended once the whole minute is tallied
        for label in category_labels:
            counts_each_minute[label].append(int((codes_this_minute == label).sum()))

    return counts_each_minute

## Descriptive Analysis Scripts

this function gets the descriptive stats

Within team variables: 
 - mean wpm : rate of chat
 - variance wpm : distribution of chat across match duration

In [None]:
def get_descriptive_stats_for_words(count_of_words_each_minute):
    """Return (mean, variance, sum, median) of the per-minute counts.

    The four values are computed with numpy's mean, var, sum and median and
    returned as a tuple in that order.
    """
    # order matters: callers unpack the tuple as mean, variance, sum, median
    statistics_in_order = (np.mean, np.var, np.sum, np.median)

    return tuple(
        statistic(count_of_words_each_minute) for statistic in statistics_in_order
    )

In [None]:
## TRYING TO RUN IT

This section runs all the functions to get descriptive stats for word utterances
----------------

to do: 
- merge with the log of team data

In [None]:
from datetime import datetime 
import pandas as pd
import numpy as np
import glob

# populate the descriptives
# (parallel lists: position i of every list belongs to the same transcript)
team_number = []
mean_wpm = []
variance_wpm = []
sum_words = []
median_wpm = []

# go through the files in the directory
transcripts = glob.glob(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\*.csv')

#get_team_number_from_xls(transcripts)
for transcript in transcripts: 
    print(transcript)

    # get team number
    # (must happen while `transcript` is still the file path)
    team_number.append(int(get_team_number_from_csv(transcript)))
    
    # read the transcript as a dataframe
    # (from here on `transcript` is the dataframe, no longer the path)
    transcript = pd.read_csv(transcript)

    #transcript.head()

    # first convert to minute index
    # (the columns are added to `transcript` in place; the return value is not needed)
    get_minute_index(transcript)

    # then get the word utterance each minute
    words_each_minute = get_words_each_minute(transcript)

    # then get descriptives
    get_mean_wpm, get_variance_wpm, get_sum_wpm, get_median_wpm = get_descriptive_stats_for_words(words_each_minute)

    # store the four statistics of this transcript
    mean_wpm.append(get_mean_wpm)
    variance_wpm.append(get_variance_wpm)
    sum_words.append(get_sum_wpm)
    median_wpm.append(get_median_wpm)

# print the five result lists once all transcripts are processed
print(team_number, mean_wpm, variance_wpm, sum_words, median_wpm)

create a dataframe of descriptives

In [None]:
# one column per result list; row i describes the i-th processed transcript
descriptives = {'team_number': team_number, 'mean_wpm': mean_wpm, 'variance_wpm': variance_wpm, 'sum_words': sum_words, 'median_wpm': median_wpm}

descriptives_df = pd.DataFrame(descriptives)

# show the first rows as the cell output
descriptives_df.head()

In [None]:
# show the team log merged with the total word counts
merged_df

In [None]:
# merge the dataframes on the team_number column 
# NOTE(review): no `how` argument is passed, so pandas' default join type is
# used; confirm that teams present in only one of the two frames may be dropped
log_with_descriptives = pd.merge(log_of_team_data_df, descriptives_df, on = 'team_number')

# show the merged dataframe as the cell output
log_with_descriptives

# uncomment to save to csv
#log_with_descriptives.to_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\log_with_descriptives.csv', index = False)

## This section gets descriptive stats for code utterance

In [None]:
# empty result lists for the code-utterance descriptives
# each category gets four separate lists, in the order: mean, variance, sum, median

team_number = []

# opinions/analysis
opinion_mean_upm, opinion_variance_upm, opinion_sum, opinion_median_upm = [], [], [], []

# observations
obs_mean_upm, obs_variance_upm, obs_sum, obs_median_upm = [], [], [], []

# commands
command_mean_upm, command_variance_upm, command_sum, command_median_upm = [], [], [], []

# suggestions
suggest_mean_upm, suggest_variance_upm, suggest_sum, suggest_median_upm = [], [], [], []

# thanks
thanks_mean_upm, thanks_variance_upm, thanks_sum, thanks_median_upm = [], [], [], []

# sharing intentions
intentions_mean_upm, intentions_variance_upm, intentions_sum, intentions_median_upm = [], [], [], []

# questions/inquiries
questions_mean_upm, questions_variance_upm, questions_sum, questions_median_upm = [], [], [], []

# answers
answers_mean_upm, answers_variance_upm, answers_sum, answers_median_upm = [], [], [], []

# emotional expressions
emotionex_mean_upm, emotionex_variance_upm, emotionex_sum, emotionex_median_upm = [], [], [], []

# agrees/acknowledge
agree_mean_upm, agree_variance_upm, agree_sum, agree_median_upm = [], [], [], []

# humour/taunting
humour_mean_upm, humour_variance_upm, humour_sum, humour_median_upm = [], [], [], []

# apologies
apology_mean_upm, apology_variance_upm, apology_sum, apology_median_upm = [], [], [], []

# encouragements
encourage_mean_upm, encourage_variance_upm, encourage_sum, encourage_median_upm = [], [], [], []

# frustrations
frust_mean_upm, frust_variance_upm, frust_sum, frust_median_upm = [], [], [], []

# disagrees
disagree_mean_upm, disagree_variance_upm, disagree_sum, disagree_median_upm = [], [], [], []

# non-task
nontask_mean_upm, nontask_variance_upm, nontask_sum, nontask_median_upm = [], [], [], []

# misc
misc_mean_upm, misc_variance_upm, misc_sum, misc_median_upm = [], [], [], []




In [None]:
# opinions
# then get descriptives
# NOTE(review): the input here is words_each_minute (word counts), not a list
# of opinion counts per minute - confirm which input is intended
get_mean_wpm, get_variance_wpm, get_sum_wpm, get_median_wpm = get_descriptive_stats_for_words(words_each_minute)

# append to corresponding list
# NOTE(review): the four lines below rebind the names to new empty lists;
# nothing is appended and the statistics computed above are not stored
opinion_mean_upm = []
opinion_variance_upm = []
opinion_sum = []
opinion_median_upm = []




### to do: 
- get speaker distribution

get speaker distribution
- step 1: get total words per player per team
- step 2: calculate variance of words per player per team