# This script extracts words from the csv 
## to be used to calculate: 
- words per minute
- total words
- variance of words across match 
- variance of words across speakers
    

### *function to get cleaned words list per transcript and count the number of words*

In [1]:
def get_cleaned_words_list(transcript):
    """Return the words of one coded transcript with transcription markers removed.

    Parameters
    ----------
    transcript : str or path-like
        Path of the transcript CSV. Sentences are read from the fourth
        column (position 3).

    Returns
    -------
    list of str
        Every whitespace-separated token of every sentence, in file order,
        except the tokens listed in ``excluded_tokens``. Rows whose sentence
        cell is empty contribute no words.
    """
    import pandas as pd

    # import transcript of interest
    coded_transcripts_df = pd.read_csv(transcript)

    # locate the sentence column; an empty cell is read as NaN (a float, which
    # has no .split()), so drop those rows before splitting into words
    all_sentences = coded_transcripts_df.iloc[:, 3].dropna().astype(str)

    # tokens to discard, compared case-sensitively against whole tokens;
    # presumably the pieces of "(inaudible)" / "(inaudible cross talk)"
    # markers (one spelling is a typo variant) plus the word "like"
    excluded_tokens = {"(inaudible)", "(inaduible", "cross", "talk)", "like"}

    # go through each sentence, split it into words and keep the wanted ones
    cleaned_word_list = [
        word
        for sentence in all_sentences
        for word in sentence.split()
        if word not in excluded_tokens
    ]

    return cleaned_word_list

### *function to get team number*

In [2]:
# extract the team number
def get_team_number(transcript):
    """Return the team number embedded in a transcript file name.

    The number is the text between the first ``[`` and the next ``]`` of the
    file name, e.g. ``...\\team-[15].csv`` gives ``'15'``. Only the last path
    component is inspected, so the result does not depend on how deep the
    file sits in the directory tree or on which path separator is used.

    Parameters
    ----------
    transcript : str
        Path of the transcript file.

    Returns
    -------
    str
        The team number as text (callers convert it with ``int``).

    Raises
    ------
    ValueError
        If the file name contains no ``[...]`` part.
    """
    import ntpath
    import re

    # ntpath splits on both "\" and "/", so Windows-style paths are handled
    # on any platform
    file_name = ntpath.basename(transcript)

    bracketed = re.search(r"\[([^\]]*)\]", file_name)
    if bracketed is None:
        raise ValueError(
            "no '[team number]' found in file name: {!r}".format(file_name)
        )
    team_number = bracketed.group(1)

    print("team number = ", team_number)

    return team_number

    
    

### *import the team log csv*

In [3]:
import pandas as pd

# Load the per-team match log; its team_number column is the key the
# per-transcript word counts are merged on further down.
log_of_team_data_df = pd.read_csv(r'D:\Projects\UG-league-project\data\log-of-team-data.csv')
# The value of this expression is discarded: another statement follows it.
log_of_team_data_df.head()
print(log_of_team_data_df.dtypes)

team_number         float64
team_size           float64
match_id            float64
outcome              object
match_duration       object
kills_per_minute    float64
dtype: object


### *run the functions over all the cleaned transcripts and merge output with team log*

In [4]:
import glob

# go through the files in the directory
coded_transcripts_in_csv = glob.glob(r'D:\Projects\UG-league-project\data\coded-transcripts\*.csv')
print('Files in folder:', coded_transcripts_in_csv)
print("")

# two parallel lists: entry i of each list belongs to the same transcript
team_num_and_total_words_dict = {'team_number': [], 'total_words': []}

# for each file, count the cleaned words and record them with the team number
for file in coded_transcripts_in_csv:
    
    # use function to get total cleaned words 
    # (after the loop, cleaned_words still holds the last file's words;
    # the stop-word cell further down reads it)
    cleaned_words = get_cleaned_words_list(file)
    total_cleaned_words = len(cleaned_words)
    team_num_and_total_words_dict['total_words'].append(total_cleaned_words)

    # and corresponding team number
    # (stored as int; get_team_number returns it as text)
    team_number = get_team_number(file)
    team_num_and_total_words_dict['team_number'].append(int(team_number))


    print('total words = ', total_cleaned_words)
    print("")

team_num_and_total_words_dict

Files in folder: ['D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[15].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[16].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[19].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[32].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[34].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[46].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[5].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[6].csv', 'D:\\Projects\\UG-league-project\\data\\coded-transcripts\\team-[7].csv']

team number =  15
total words =  1754

team number =  16
total words =  3002

team number =  19
total words =  1114

team number =  32
total words =  3787

team number =  34
total words =  1420

team number =  46
total words =  968

team number =  5
total words =  2642

team number =  6
total words =  3239



{'team_number': [15, 16, 19, 32, 34, 46, 5, 6, 7],
 'total_words': [1754, 3002, 1114, 3787, 1420, 968, 2642, 3239, 1749]}

In [5]:
# convert the dictionary to dataframe
# (one row per transcript, columns team_number and total_words)
team_num_and_total_words_df = pd.DataFrame.from_dict(team_num_and_total_words_dict)
team_num_and_total_words_df
#print(team_num_and_total_words_df.dtypes)

Unnamed: 0,team_number,total_words
0,15,1754
1,16,3002
2,19,1114
3,32,3787
4,34,1420
5,46,968
6,5,2642
7,6,3239
8,7,1749


In [6]:
# merge the dataframes on the team_number column 
# pd.merge defaults to an inner join, so only teams present in both frames
# are kept. The key is float64 in the team log (see the dtypes printed above)
# and built from int() values in the word-count frame.
merged_df = pd.merge(log_of_team_data_df, team_num_and_total_words_df, on = 'team_number')

merged_df

Unnamed: 0,team_number,team_size,match_id,outcome,match_duration,kills_per_minute,total_words
0,5.0,5.0,5062655000.0,loss,33:04,0.877,2642
1,6.0,5.0,5064351000.0,won,19:35,1.532,3239
2,7.0,3.0,5062629000.0,loss,30:27,0.92,1749
3,15.0,5.0,5052427000.0,won,30:45,0.846,1754
4,16.0,3.0,5065998000.0,won,23:12,1.681,3002
5,19.0,3.0,3763756000.0,won,24:04,0.956,1114
6,32.0,5.0,5108557000.0,won,29:49,1.473,3787
7,34.0,5.0,5097518000.0,won,23:57,1.336,1420
8,46.0,3.0,5135379000.0,won,38:46,1.032,968


## this block converts minutes and seconds to seconds

In [7]:
# convert match duration from minutes and seconds to seconds
from datetime import timedelta

def seconder(x):
    """Convert a clock-style duration string to a number of seconds.

    Parameters
    ----------
    x : str
        Duration written as ``"MM:SS"`` (e.g. ``"33:04"``) or, for values of
        an hour or more, ``"H:MM:SS"``.

    Returns
    -------
    float
        The duration in seconds, e.g. ``1984.0`` for ``"33:04"``.

    Raises
    ------
    ValueError
        If a field is not numeric or there are not two or three fields.
    """
    fields = [float(field) for field in x.split(':')]

    # "MM:SS" has no hour field; pad it so both layouts unpack the same way
    if len(fields) == 2:
        fields.insert(0, 0.0)
    if len(fields) != 3:
        raise ValueError(
            "expected 'MM:SS' or 'H:MM:SS', got {!r}".format(x)
        )

    hours, mins, secs = fields
    td = timedelta(hours=hours, minutes=mins, seconds=secs)
    return td.total_seconds()

# add the duration in seconds as a new column by converting every
# match_duration string with seconder
merged_df['match_duration_in_seconds'] = merged_df['match_duration'].apply(seconder)

merged_df.head()

Unnamed: 0,team_number,team_size,match_id,outcome,match_duration,kills_per_minute,total_words,match_duration_in_seconds
0,5.0,5.0,5062655000.0,loss,33:04,0.877,2642,1984.0
1,6.0,5.0,5064351000.0,won,19:35,1.532,3239,1175.0
2,7.0,3.0,5062629000.0,loss,30:27,0.92,1749,1827.0
3,15.0,5.0,5052427000.0,won,30:45,0.846,1754,1845.0
4,16.0,3.0,5065998000.0,won,23:12,1.681,3002,1392.0


In [None]:
# combine all the words into one long string
# NOTE(review): in the visible cells cleaned_word_list is only assigned at
# notebook level by the "take out all the '(inaudible)' lines" cell further
# down (inside get_cleaned_words_list it is a local) - confirm that cell has
# been run before this one.
full_combined_transcript = " ".join(cleaned_word_list)
full_combined_transcript

## Descriptive Analysis Scripts

In [12]:
# get mean words per minute per team 
def mean_words_per_minute(merged_df): 
    """Add a ``mean_wpm`` (words per minute) column to ``merged_df``.

    The value is ``total_words / (match_duration_in_seconds / 60)``.

    The frame is modified in place and the same object is returned. As a
    side effect, pandas' global ``display.float_format`` option is set to
    three decimal places.
    """
    three_decimals = '{:.3f}'.format
    pd.set_option('display.float_format', three_decimals)

    # seconds -> minutes first, then words divided by minutes
    duration_in_minutes = merged_df['match_duration_in_seconds'] / 60
    merged_df['mean_wpm'] = merged_df['total_words'] / duration_in_minutes

    return merged_df

In [13]:
# adds the mean_wpm column to merged_df in place; the returned frame is shown
mean_words_per_minute(merged_df)

Unnamed: 0,team_number,team_size,match_id,outcome,match_duration,kills_per_minute,total_words,match_duration_in_seconds,mean_wpm
0,5.0,5.0,5062654604.0,loss,33:04,0.877,2642,1984.0,79.899
1,6.0,5.0,5064350747.0,won,19:35,1.532,3239,1175.0,165.396
2,7.0,3.0,5062628707.0,loss,30:27,0.92,1749,1827.0,57.438
3,15.0,5.0,5052427484.0,won,30:45,0.846,1754,1845.0,57.041
4,16.0,3.0,5065998241.0,won,23:12,1.681,3002,1392.0,129.397
5,19.0,3.0,3763755883.0,won,24:04,0.956,1114,1444.0,46.288
6,32.0,5.0,5108557177.0,won,29:49,1.473,3787,1789.0,127.01
7,34.0,5.0,5097517634.0,won,23:57,1.336,1420,1437.0,59.29
8,46.0,3.0,5135379230.0,won,38:46,1.032,968,2326.0,24.97


In [None]:
# get words per minute 
def get_words_per_minute(match):
    """Placeholder for a words-per-minute calculation for one match.

    The calculation has not been written yet. The per-team figure is
    currently produced by ``mean_words_per_minute``.

    Parameters
    ----------
    match
        Not used yet.

    Raises
    ------
    NotImplementedError
        Always, until the calculation is implemented.
    """
    raise NotImplementedError("get_words_per_minute is not implemented yet")


In [17]:
# import transcript (uncleaned)
raw_transcript_df = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\team-[19].csv')

# reuse seconder to turn each row's Timestamp string into seconds
# NOTE(review): the new column is called match_duration_in_seconds but holds
# a per-row timestamp - presumably time elapsed since the start of the
# recording; confirm and consider a clearer column name.
raw_transcript_df['match_duration_in_seconds'] = raw_transcript_df['Timestamp'].apply(seconder)

raw_transcript_df.head()


Unnamed: 0,Player,Team,Timestamp,Sentence,code-1-eve,code-1-aj,code-2-eve,code-2-aj,match_duration_in_seconds
0,1,19,00:00,it turns off,0,0,,,0.0
1,2,19,00:01,"Oh yeah, yeah.",16,16,,,1.0
2,1,19,00:03,"Yeah, they should Yeah.",16,16,,,3.0
3,1,19,00:10,Where's corrupting pot?,5,5,,,10.0
4,3,19,00:11,there we go,19,14,,,11.0


In [None]:
# go through each row, if the row is 

Within team variables: 
 - mean wpm : rate of chat
 - variance wpm : distribution of chat across match duration

In [None]:
# import transcript of interest
# (this binds coded_transcripts_df at notebook level; the variable of the
# same name inside get_cleaned_words_list is local to that function)
coded_transcripts_df = pd.read_csv(r'D:\Projects\UG-league-project\data\coded-transcripts\team-[46]-sentiment-mid-cohesion.csv')

In [None]:
# collect every whitespace-separated word of every sentence, in order
# NOTE(review): all_sentences is not assigned at notebook level in any
# visible cell (only inside get_cleaned_words_list) - confirm where it is
# expected to come from.
all_words = []

for sentence in all_sentences:
    # split() with no argument breaks the sentence on any run of whitespace
    all_words.extend(sentence.split())

print(all_words)

In [None]:
# drop the transcription-marker tokens and keep every other word

# tokens to discard; compared case-sensitively against whole tokens
list_with_inaudible_words = ["(inaudible)", "(inaduible", "cross", "talk)", "like"]

cleaned_word_list = []
for word in all_words:
    if word not in list_with_inaudible_words:
        cleaned_word_list.append(word)

print(cleaned_word_list)

# Cleaning the transcript using NLTK

In [12]:
# trying some NLTK stuff here
import nltk
# fetch the NLTK data packages; 'stopwords' is needed by the stop-word cell
# below (no visible cell uses the tagger package)
nltk.download(['averaged_perceptron_tagger', 
               'stopwords'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Evelyn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Evelyn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# English stop words shipped with NLTK
stop_words = nltk.corpus.stopwords.words('english')

# keep only the words that are not stop words (compared case-insensitively)
# NOTE(review): cleaned_words is whatever the word-count loop left behind,
# i.e. the words of the last transcript it processed; cleaned_word_list is the
# notebook-level alternative - confirm which one is intended.
# NOTE(review): words still carry their punctuation (e.g. 'nothing.'), so a
# stop word followed by punctuation is not removed.
filtered_list = []
for word in cleaned_words:
    if word.casefold() in stop_words:
        continue
    filtered_list.append(word)

print("total words after removing stopwords: ", len(filtered_list))
filtered_list

total words after removing stopwords:  583


['Basically',
 'proceed',
 'nothing.',
 'believe.',
 'Well.',
 'Wait',
 'think',
 'double',
 'click',
 'ward',
 'get',
 'one',
 'Yea',
 'works',
 'buy',
 'swaps',
 'one',
 'reasons',
 'actually',
 'never',
 'happened',
 'Surprised',
 'try',
 'level',
 'one',
 'cheese',
 'honest.',
 'Ah',
 'cause',
 'leashed.',
 'Yep.',
 'hate',
 'see.',
 'Yeah,',
 'yeah,',
 'dashes',
 'super',
 'compatible.',
 'Guardian',
 'now.',
 'stay',
 'bit',
 'cause',
 "they're",
 'shoving',
 'really',
 'enough',
 'money',
 'anything.',
 'Yeah',
 "that's",
 'unfortunate.',
 'blind?',
 'really',
 'see',
 'Poppy',
 'or?',
 'Ah',
 'hid',
 'last',
 'second',
 "there's",
 'much',
 'could',
 'done',
 'there.',
 'Oh',
 'Gragas',
 'Please',
 'go',
 'in.',
 'Oh',
 'least',
 'Rakan',
 'got',
 'both.',
 'still',
 'there?',
 'Fuck',
 'oh',
 'god.',
 'cancelled',
 'two',
 'autos.',
 'Oh',
 'sweet',
 'got',
 'two',
 'kills',
 "I'm",
 'back',
 'game.',
 'Alright',
 'Rakan',
 'still',
 'level',
 'four',
 "i'm",
 'halfway',
 'six

# Creating Frequency Distributions and Analysing Sentiment using NLTK

## Getting Frequencies

In [None]:
# get frequency distribution of words (counts per distinct word in filtered_list)
frequency_distribution_of_filtered_list = nltk.FreqDist(filtered_list)
frequency_distribution_of_filtered_list

In [None]:
# find most common words
# number in bracket indicates the top x number (eg. 5 means the top 5 most frequent words)
# NOTE: the value of this expression is discarded because another statement
# follows it; wrap it in print() to see the list.
frequency_distribution_of_filtered_list.most_common(10)

# visualise the distribution in a table
frequency_distribution_of_filtered_list.tabulate(10)

## Extracting Concordance and Collocations
In the context of NLP, a concordance is a collection of word locations along with their context. You can use concordances to find:

    How many times a word appears
    Where each occurrence appears
    What words surround each occurrence

In [None]:
# find what the context surrounding a word is
# (built from cleaned_word_list, i.e. before stop-word removal)
full_text_including_stopwords_and_punctuations = nltk.Text(cleaned_word_list)

# set word of interest in "" in brackets
# (lines = 10 asks for at most 10 occurrences)
full_text_including_stopwords_and_punctuations.concordance("go", lines = 10)

In [None]:
# finding collocations (sequences)

# step 1: build one finder per n-gram size (2 = bigrams, 3 = trigrams)
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(cleaned_word_list)
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(cleaned_word_list)

# find the top 10 (the number in the bracket) most common bigrams
print("top 10 most common bigrams: ")
print(bigram_finder.ngram_fd.most_common(10))
print("")

# and the top 10 most common trigrams
print("top 10 most common trigrams: ")
print(trigram_finder.ngram_fd.most_common(10))

## trying to do sentiment analysis with built in nltk model VADER 
-note: model is best suited for short texts like tweets and social media things

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# create a string of words 
string_of_cleaned_word_list = " ".join(cleaned_word_list)
# string_of_cleaned_word_list

# score the whole transcript as one single string; the result is a dict with
# 'neg', 'neu', 'pos' and 'compound' entries (see the recorded results below)
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(string_of_cleaned_word_list)

# NOTE(review): the team sizes / outcomes noted below disagree with the merged
# team log shown earlier in this notebook (there: team 6 = 5 players, won;
# team 32 = 5 players, won; team 46 = 3 players, won) - confirm which is right.
# team 6 (low cohesion, mean = 2.67, 3 person team, lost) results: {'neg': 0.121, 'neu': 0.655, 'pos': 0.224, 'compound': 0.9999}
# team 32 (high cohesion, mean = 6.83, 3 person team, won ) results: {'neg': 0.141, 'neu': 0.666, 'pos': 0.193, 'compound': 0.9996}
# team 46 (mid cohesion, mean = 4.44, 3 person team, lost) results: {'neg': 0.152, 'neu': 0.687, 'pos': 0.161, 'compound': -0.7709}

In [None]:
# A function that combines all the steps to get sentiment 
#def get_sentiment_analysis(csv_of_transcript):
    

## to-do: 
    
    all the word frequency calculations
    maybe a word cloud
    some preliminary sentiment analysis?