# Cleaning the transcript using NLTK

In [None]:
# Load NLTK and fetch the data packages requested for this notebook:
# the perceptron POS-tagger model and the stop-word lists.
import nltk

nltk.download(['averaged_perceptron_tagger', 'stopwords'])

In [None]:
# Remove English stop words from the cleaned transcript tokens.
stop_words = nltk.corpus.stopwords.words('english')

# Build a set once for fast membership tests; `stop_words` itself stays a
# list so any other cell that uses it keeps working unchanged.
stop_word_lookup = set(stop_words)

# NOTE(review): this cell reads `cleaned_words`, while the later cells read
# `cleaned_word_list` -- confirm that this is the intended input.
# casefold() makes the comparison case-insensitive on the token side; the
# token is kept in its original casing in the output list.
filtered_list = [
    word for word in cleaned_words
    if word.casefold() not in stop_word_lookup
]

print("total words after removing stopwords: ", len(filtered_list))
filtered_list

# Creating Frequency Distributions and Analysing Sentiment using NLTK

## Getting Frequencies

In [None]:
# Count how often each word of the stop-word-filtered list occurs.
frequency_distribution_of_filtered_list = nltk.FreqDist(filtered_list)

# A bare expression is only echoed when it is the last statement of a cell,
# so intermediate results are printed explicitly instead of being discarded.
print(frequency_distribution_of_filtered_list)

# Most common words; the argument is how many to return
# (e.g. 5 means the 5 most frequent words).
print(frequency_distribution_of_filtered_list.most_common(10))

# Visualise the distribution of the same top 10 as a table.
frequency_distribution_of_filtered_list.tabulate(10)

## Extracting Concordance and Collocations
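In the context of NLP, a concordance is a collection of the locations of a word together with the context around each occurrence. You can use a concordance to find:

- How many times a word appears
- Where each occurrence appears
- What words surround each occurrence

A collocation is a sequence of words that frequently occur together, such as a common bigram (two words) or trigram (three words).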

In [None]:
# Wrap the token list in an nltk.Text so a word can be inspected in context.
# NOTE(review): the variable name implies `cleaned_word_list` still contains
# stop words and punctuation -- confirm against the cell that builds it.
full_text_including_stopwords_and_punctuations = nltk.Text(cleaned_word_list)

# Word to look up, and the maximum number of matching lines to print.
word_of_interest = "go"
max_lines = 10
full_text_including_stopwords_and_punctuations.concordance(
    word_of_interest, lines=max_lines
)

In [None]:
# Finding collocations (word sequences).

# Step 1: build one finder per n-gram size from the token list
# (bigrams = 2 words, trigrams = 3 words).
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
    cleaned_word_list
)
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(
    cleaned_word_list
)

# Step 2: report the most frequent n-grams of each size. The argument to
# most_common() is how many to return (10 = top 10).
top_bigrams = bigram_finder.ngram_fd.most_common(10)
# The trailing "" produces the blank line that separates the two reports.
print("top 10 most common bigrams: ", top_bigrams, "", sep="\n")

top_trigrams = trigram_finder.ngram_fd.most_common(10)
print("top 10 most common trigrams: ", top_trigrams, sep="\n")

## Sentiment analysis with NLTK's built-in VADER model
- Note: the model is best suited to short texts such as tweets and other social media posts.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the lexicon data package for the VADER analyser.
nltk.download('vader_lexicon')

# The whole transcript is scored as a single text, so the tokens are joined
# back into one space-separated string first.
string_of_cleaned_word_list = " ".join(cleaned_word_list)

sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(string_of_cleaned_word_list)

# Kept as the last expression so the cell displays the scores
# (a dict with 'neg', 'neu', 'pos' and 'compound' entries; see results below).
sentiment_scores

# team 6 (low cohesion, mean = 2.67, 3 person team, lost) results: {'neg': 0.121, 'neu': 0.655, 'pos': 0.224, 'compound': 0.9999}
# team 32 (high cohesion, mean = 6.83, 3 person team, won ) results: {'neg': 0.141, 'neu': 0.666, 'pos': 0.193, 'compound': 0.9996}
# team 46 (mid cohesion, mean = 4.44, 3 person team, lost) results: {'neg': 0.152, 'neu': 0.687, 'pos': 0.161, 'compound': -0.7709}