# This script extracts words from the csv 
## to be used to calculate: 
- words per minute
- total words
- variance of words across match 
- variance of words across speakers
    

### *function to get cleaned words list per transcript and count the number of words*

In [1]:
def get_cleaned_words_list(transcript):
    """Return every spoken word of one transcript csv, minus ignored tokens.

    Parameters
    ----------
    transcript : str
        Path to a coded-transcript csv that has a 'Sentence' column.

    Returns
    -------
    list of str
        The whitespace-separated words of all sentences, in file order,
        with the "(inaudible ...)" marker tokens and the filler word
        "like" removed.
    """
    import pandas as pd

    # tokens left behind by "(inaudible)" / "(inaudible cross talk)" markers
    # (including the misspelt "(inaduible" variant) plus the filler "like"
    ignored_tokens = {"(inaudible)", "(inaduible", "cross", "talk)", "like"}

    # import transcript of interest
    coded_transcripts_df = pd.read_csv(transcript)

    # Read the sentences by column name only (the name was already required),
    # and drop empty cells *before* converting to str: otherwise a missing
    # sentence becomes the literal text "nan" and is counted as a word.
    sentences = coded_transcripts_df['Sentence'].dropna().astype(str)

    cleaned_word_list = []
    for sentence in sentences:
        cleaned_word_list.extend(
            word for word in sentence.split() if word not in ignored_tokens
        )

    return cleaned_word_list

### *function to get team number*

In [2]:
def get_team_number_from_csv(transcript):
    """Return the team number encoded in a transcript file name.

    Parameters
    ----------
    transcript : str
        Path to a file named like ``team-<number>.csv``. Both backslash
        and forward-slash separators are accepted, at any folder depth.

    Returns
    -------
    str
        The text after the first "-" in the file name (e.g. ``'22'`` for
        ``team-22.csv``). Callers convert it to int themselves.

    Raises
    ------
    IndexError
        If the file name contains no "-".
    """
    from pathlib import PureWindowsPath

    # Take the file name itself instead of a fixed position in the path, so
    # the result does not depend on how deep the data folder is.
    # PureWindowsPath understands "\" and "/" on every operating system.
    team_number_file_name = PureWindowsPath(transcript).stem

    # 'team-22' -> ['team', '22']
    team_number = team_number_file_name.split("-")[1]

    return team_number


### *import the team log csv*

In [3]:
import pandas as pd

# load the team log csv; later cells merge the per-team word statistics
# into this dataframe on its 'team_number' column
log_of_team_data_df = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\log-of-team-data.csv')
# NOTE: head() is not the last expression of the cell, so its result is not displayed
log_of_team_data_df.head()
# print the dtype of every column of the team log
print(log_of_team_data_df.dtypes)

team_number           int64
team_size             int64
match_id              int64
outcome              object
match_duration       object
kills_per_minute    float64
dtype: object


### *run the functions over all the cleaned transcripts and merge output with team log*

In [4]:
import glob

# collect the paths of every recoded transcript csv in the folder
transcripts = glob.glob(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\*.csv')

# two parallel lists: entry i of each list belongs to the same transcript
team_num_and_total_words_dict = {'team_number': [], 'total_words': []}

for transcript in transcripts:
    # filtered words of this transcript, and how many of them there are
    cleaned_words = get_cleaned_words_list(transcript)
    total_cleaned_words = len(cleaned_words)
    team_num_and_total_words_dict['total_words'].append(total_cleaned_words)

    # team number parsed from the file name, stored as an integer
    team_number = get_team_number_from_csv(transcript)
    team_num_and_total_words_dict['team_number'].append(int(team_number))

    # progress output: one total per transcript followed by an empty line
    print('total words = ', total_cleaned_words)
    print("")

# last expression of the cell: display the collected dictionary
team_num_and_total_words_dict


total words =  1862

total words =  3476

total words =  1220

total words =  3310

total words =  2678

total words =  2907

total words =  1754

total words =  3004

total words =  1530

total words =  942

total words =  1114

total words =  2527

total words =  1990

total words =  5619

total words =  2742

total words =  2741

total words =  1500

total words =  2232

total words =  1130

total words =  2818

total words =  2758

total words =  2857

total words =  3096

total words =  945

total words =  3897

total words =  3798

total words =  1973

total words =  1420

total words =  2148

total words =  3881

total words =  2567

total words =  2414

total words =  2558

total words =  2412

total words =  1632

total words =  3777

total words =  1570

total words =  2700

total words =  3245

total words =  4973

total words =  976

total words =  1919

total words =  831

total words =  2642

total words =  3239

total words =  1749

total words =  2579

total words =  34

{'team_number': [1,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  2,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  3,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  4,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  5,
  6,
  7,
  8,
  9],
 'total_words': [1862,
  3476,
  1220,
  3310,
  2678,
  2907,
  1754,
  3004,
  1530,
  942,
  1114,
  2527,
  1990,
  5619,
  2742,
  2741,
  1500,
  2232,
  1130,
  2818,
  2758,
  2857,
  3096,
  945,
  3897,
  3798,
  1973,
  1420,
  2148,
  3881,
  2567,
  2414,
  2558,
  2412,
  1632,
  3777,
  1570,
  2700,
  3245,
  4973,
  976,
  1919,
  831,
  2642,
  3239,
  1749,
  2579,
  3481]}

convert the dictionary with team number and total words to a dataframe

In [5]:
import pandas as pd

# build a dataframe whose columns are the dictionary keys
# ('team_number' and 'total_words'), one row per transcript
team_num_and_total_words_df = pd.DataFrame(team_num_and_total_words_dict)

# last expression of the cell: display the dataframe
team_num_and_total_words_df

Unnamed: 0,team_number,total_words
0,1,1862
1,10,3476
2,11,1220
3,12,3310
4,13,2678
5,14,2907
6,15,1754
7,16,3004
8,17,1530
9,18,942


In [6]:
# join the team log with the word totals; rows are matched on 'team_number'
merged_df = log_of_team_data_df.merge(team_num_and_total_words_df, on='team_number')

# last expression of the cell: display the merged dataframe
merged_df

Unnamed: 0,team_number,team_size,match_id,outcome,match_duration,kills_per_minute,total_words
0,1,3,5056494058,l,25:15:00,0.713,1862
1,2,5,5064844365,w,29:39:00,1.484,2527
2,3,3,5081077877,w,26:40:00,0.862,3096
3,4,5,5060243044,w,23:08:00,1.47,2412
4,5,5,5062654604,l,33:04:00,0.877,2642
5,6,3,5064350747,w,19:35:00,1.532,3239
6,7,3,5062628707,l,30:27:00,0.92,1749
7,8,3,5074481840,l,32:13:00,0.962,2579
8,9,3,5063912089,l,30:28:00,0.919,3481
9,10,3,5061949934,w,35:41:00,0.589,3476


## this block converts each timestamp (minutes:seconds) to a minute index

In [7]:
def get_minute_index(transcript):
    """Add 'time_object' and 'minute_index' columns to a transcript.

    The 'Timestamp' column is parsed with the format '%M:%S:%f'. The parsed
    datetimes are stored in 'time_object' (their date part carries no
    meaning) and their minute component in 'minute_index'.

    The dataframe passed in is modified in place and also returned.
    """
    # parse once and reuse the parsed series for both new columns
    parsed_times = pd.to_datetime(transcript['Timestamp'], format='%M:%S:%f')

    transcript['time_object'] = parsed_times
    transcript['minute_index'] = parsed_times.dt.minute

    return transcript


In [8]:
from datetime import datetime 
import pandas as pd
# try the minute-index conversion on a single transcript (team 22)
transcript = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-22.csv')
# NOTE: head() is not the last expression of the cell, so its result is not displayed
transcript.head()

# adds the 'time_object' and 'minute_index' columns in place; the returned
# dataframe is the cell's displayed output
get_minute_index(transcript)

Unnamed: 0,Player,Team,Timestamp,Sentence,code-1-eve,code-2-eve,code-1-aj,code-2-aj,rater1_1,rater1_2,rater2_1,rater2_2,time_object,minute_index
0,2,22,0:04:00,Who am I against again?,5,,5,,question or inquiry,,question or inquiry,,1900-01-01 00:00:04,0
1,3,22,0:07:00,probably against renekton,6,,6,,answer,,answer,,1900-01-01 00:00:07,0
2,2,22,0:08:00,"Trundle, nah im jungle",6,,6,,answer,,answer,,1900-01-01 00:00:08,0
3,2,22,0:19:00,Trundle is one of those matchups where like if...,17,,17,,opinion or analysis,,opinion or analysis,,1900-01-01 00:00:19,0
4,2,22,0:25:00,um bot lane is going to be the island for the ...,15,,15,,non-task related,,non-task related,,1900-01-01 00:00:25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,2,22,36:52:00,Jesus.,19,,19,,emotional expression,,emotional expression,,1900-01-01 00:36:52,36
432,2,22,36:58:00,Yeah i dont.,0,,3,,misc,,observation,,1900-01-01 00:36:58,36
433,3,22,37:28:00,Jesus,19,,19,,emotional expression,,emotional expression,,1900-01-01 00:37:28,37
434,2,22,37:35:00,That was good game,18,,18,,encouragement,,encouragement,,1900-01-01 00:37:35,37


## Function to extract single words for wpm calculation

In [9]:
def extract_cleaned_words(coded_transcripts_df):
    """Count the words in a transcript dataframe, ignoring marker tokens.

    Parameters
    ----------
    coded_transcripts_df : pandas.DataFrame
        Transcript rows; the sentences are read from the fourth column
        (position 3). The dataframe may be empty.

    Returns
    -------
    int
        Number of whitespace-separated words over all sentences, excluding
        the "(inaudible ...)" marker tokens and the filler word "like".
    """
    # tokens left behind by "(inaudible)" / "(inaudible cross talk)" markers
    # (including the misspelt "(inaduible" variant) plus the filler "like"
    ignored_tokens = {"(inaudible)", "(inaduible", "cross", "talk)", "like"}

    # locate the column for sentences
    all_sentences_column = coded_transcripts_df.iloc[:, 3]

    word_count = 0

    # Missing cells are skipped and everything else is coerced to str, so an
    # empty or purely numeric sentence cell no longer raises AttributeError
    # on .split().
    for sentence in all_sentences_column.dropna():
        word_count += sum(
            1 for word in str(sentence).split() if word not in ignored_tokens
        )

    return word_count

## Function to get word utterances each minute

In [10]:


def get_words_each_minute(transcript):
    """Return the number of words uttered in each minute of a transcript.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Transcript rows with a 'minute_index' column.

    Returns
    -------
    list of int
        One entry per minute from 0 up to the largest minute index; entry
        ``m`` is the word count (via ``extract_cleaned_words``) of the rows
        whose 'minute_index' equals ``m``. Minutes without rows are still
        passed to ``extract_cleaned_words`` as an empty selection. An empty
        transcript yields an empty list.
    """
    minute_index = transcript['minute_index']

    # a transcript without rows has no largest minute; return no minutes
    # instead of failing on the max of an empty sequence
    if minute_index.empty:
        return []

    last_minute = int(minute_index.max())

    words_each_minute = []

    # walk over every minute up to the last one, not only the minutes that
    # occur, so silent minutes are included in the list
    for minute in range(last_minute + 1):
        sentences_this_minute = transcript[minute_index == minute]
        words_each_minute.append(extract_cleaned_words(sentences_this_minute))

    # returns a list of total word utterance each minute
    return words_each_minute

# Function to get category utterance per minute

In [11]:
def get_code_utterance_each_minute(transcript):
    """Print, for every code category, how often it occurs in each minute.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Transcript rows with a 'minute_index' column and a 'rater1_1'
        column holding the code label of each row.

    Returns
    -------
    None
        The per-minute counts are only printed: one line per category,
        each showing a list with one count per minute from 0 up to the
        largest minute index. Labels in 'rater1_1' that are not one of the
        17 known categories are not reported.
    """
    from collections import Counter

    # (printed heading, label as it appears in 'rater1_1'), in print order
    report_order = (
        ('opinions each minute:', 'opinion or analysis'),
        ('commands each minute:', 'command'),
        ('suggestions each minute:', 'suggestion'),
        ('observations each minute:', 'observation'),
        ('questions each minute:', 'question or inquiry'),
        ('answers each minute:', 'answer'),
        ('agree/acknowldge each minute:', 'agree or acknowledge'),
        ('disagrees each minute:', 'disagree'),
        ('sharing intentions each minute: ', 'sharing intention'),
        ('emotional expressions each minute:', 'emotional expression'),
        ('encouragements each minute:', 'encouragement'),
        ('non tasks each minute:', 'non-task related'),
        ('frustrations each minute:', 'frustration'),
        ('humour each minute:', 'humour or taunting'),
        ('apologies each minute:', 'apologies'),
        ('thanks each minute:', 'thanks or welcome'),
        ('miscs each minute:', 'misc'),
    )

    minute_index = transcript['minute_index']

    # an empty transcript has no minutes at all: report empty lists
    last_minute = -1 if minute_index.empty else int(minute_index.max())

    # one list of per-minute counts for every known code label
    counts_each_minute = {code: [] for _, code in report_order}

    # walk over every minute up to the last one so that minutes without
    # any utterance show up as 0 in every list
    for minute in range(last_minute + 1):
        tally = Counter(transcript.loc[minute_index == minute, 'rater1_1'])
        for code, counts in counts_each_minute.items():
            counts.append(tally[code])

    # Same layout as a single print() call with alternating heading/list
    # arguments: every heading after the first starts on a new line.
    report = []
    for position, (heading, code) in enumerate(report_order):
        report.append(heading if position == 0 else '\n' + heading)
        report.append(counts_each_minute[code])

    print(*report)

    return

## Descriptive Analysis Scripts

this function gets the descriptive stats

Within team variables: 
 - mean wpm : rate of chat
 - variance wpm : distribution of chat across match duration

In [12]:
def get_descriptive_stats_for_words(count_of_words_each_minute):
    """Summarise a list of per-minute word counts.

    Returns a 4-tuple, in this order: mean, variance (numpy's default,
    i.e. population variance), sum and median of the counts.
    """
    counts = count_of_words_each_minute

    return (
        np.mean(counts),
        np.var(counts),
        np.sum(counts),
        np.median(counts),
    )

In [13]:
## TRYING TO RUN IT

This section runs all the functions to get descriptive stats for word utterances
----------------

to do: 
- merge with the log of team data

In [20]:
from datetime import datetime 
import pandas as pd
import numpy as np
import glob

# per-team accumulators; entry i of every list belongs to the same transcript
team_number = []
mean_wpm = []
variance_wpm = []
sum_wpm = []
median_wpm = []

# paths of every recoded transcript csv in the folder
transcripts = glob.glob(r'D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\*.csv')

for transcript in transcripts:
    print(transcript)

    # team number comes from the file name
    team_number.append(int(get_team_number_from_csv(transcript)))

    # from here on `transcript` is the loaded dataframe, not the path
    transcript = pd.read_csv(transcript)

    # adds the 'minute_index' column in place
    get_minute_index(transcript)

    # word count of every minute of the match
    words_each_minute = get_words_each_minute(transcript)

    # mean, variance, sum and median of the per-minute word counts
    get_mean_wpm, get_variance_wpm, get_sum_wpm, get_median_wpm = get_descriptive_stats_for_words(words_each_minute)

    mean_wpm.append(get_mean_wpm)
    variance_wpm.append(get_variance_wpm)
    sum_wpm.append(get_sum_wpm)
    median_wpm.append(get_median_wpm)

print(team_number, mean_wpm, variance_wpm, sum_wpm, median_wpm)

D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-1.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-10.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-11.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-12.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-13.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-14.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-15.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-16.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-17.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-18.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-transcripts\team-19.csv
D:\Projects\UG-league-project\data\coded-transcripts\recoded-trans

create a dataframe of descriptives

In [22]:
# one column per statistic, one row per team
descriptives = {
    'team_number': team_number,
    'mean_wpm': mean_wpm,
    'variance_wpm': variance_wpm,
    'sum_wpm': sum_wpm,
    'median_wpm': median_wpm,
}

descriptives_df = pd.DataFrame(data=descriptives)

# last expression of the cell: show the first rows
descriptives_df.head()

Unnamed: 0,team_number,mean_wpm,variance_wpm,sum_wpm,median_wpm
0,1,71.615385,1152.159763,1862,67.0
1,10,115.866667,511.648889,3476,120.5
2,11,48.8,740.64,1220,44.0
3,12,103.4375,682.808594,3310,106.0
4,13,111.583333,1073.993056,2678,121.5


In [23]:
# display the team log merged with the total word counts (built in an earlier cell)
merged_df

Unnamed: 0,team_number,team_size,match_id,outcome,match_duration,kills_per_minute,total_words
0,1,3,5056494058,l,25:15:00,0.713,1862
1,2,5,5064844365,w,29:39:00,1.484,2527
2,3,3,5081077877,w,26:40:00,0.862,3096
3,4,5,5060243044,w,23:08:00,1.47,2412
4,5,5,5062654604,l,33:04:00,0.877,2642
5,6,3,5064350747,w,19:35:00,1.532,3239
6,7,3,5062628707,l,30:27:00,0.92,1749
7,8,3,5074481840,l,32:13:00,0.962,2579
8,9,3,5063912089,l,30:28:00,0.919,3481
9,10,3,5061949934,w,35:41:00,0.589,3476


In [25]:
# join the team log with the per-team word descriptives on 'team_number'
log_with_descriptives = log_of_team_data_df.merge(descriptives_df, on='team_number')

# write the combined table next to the team log, without the index column
log_with_descriptives.to_csv(
    r'D:\Projects\UG-league-project\data\coded-transcripts\log_with_descriptives.csv',
    index=False,
)

In [None]:
import numpy as np

def get_variance(code_utterance_each_minute):
    """Return the variance of a sequence of per-minute counts.

    The original body was an unfinished stub (an assignment without a
    right-hand side), which made the cell a syntax error. It is completed
    here with numpy's default (population) variance, the same measure
    ``get_descriptive_stats_for_words`` uses for words per minute.

    NOTE(review): assumes the argument is the list of per-minute counts of
    a single code category -- confirm against the intended caller.
    """
    return np.var(code_utterance_each_minute)

### to do: 
- loop descriptive stats calculator over all transcripts
- get speaker distribution

get speaker distribution
- step 1: get total words per player per team
- step 2: calculate variance of words per player per team