## NaNoWriMo Word Analysis

Most frequent word analysis and sentiment analysis

In [None]:
# Define variables with default values
# NOTE(review): kept as plain one-per-line assignments; presumably Mercury
# overrides these values from its web widgets -- confirm before restructuring.
# Number of top-ranked entries shown in the word and n-gram frequency charts
most_freq_amount = 10
# Comma-separated extra words to drop from the analysis (spaces are ignored)
user_defined_stopwords = "the, and, a, in, i"
# Text file to analyze; the sentiment plot titles are derived from this name
input_file = "orwell_1984.txt"

In [None]:
# Interactive Jupyter Notebook, running in Mercury for web application
import warnings
warnings.filterwarnings('ignore') # set before the heavy imports so their import-time warnings are hidden too

import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams.update({'figure.max_open_warning': 0}) # the sentiment cell opens several figures at once
import math

import nltk
nltk.download('punkt', quiet=True) # Tokenizer models used by nltk.word_tokenize (older NLTK releases)
nltk.download('punkt_tab', quiet=True) # Tokenizer tables required by nltk.word_tokenize on NLTK 3.9+
nltk.download('stopwords', quiet=True) # Stopwords for cleanup
nltk.download('vader_lexicon', quiet=True) # Sentiment Analysis via VADER (Valence Aware Dictionary for Sentiment Reasoning)
from nltk.sentiment import SentimentIntensityAnalyzer # Sentiment Analysis via VADER

In [None]:
# Return the text file as tokens
# The user stopwords arrive as one comma-separated string; strip spaces and
# lowercase them so they compare equal to the lowercased tokens produced below.
user_defined_stopwords_to_remove = user_defined_stopwords.replace(" ", "").lower().split(",")
# Decode explicitly as UTF-8: the punctuation list below contains non-ASCII
# quotes and bullets that a platform-default codec could decode differently.
with open(input_file, "r", encoding="utf-8") as f:
    file_text = " ".join(f.readlines()) # combine all lines into one string

# Break into Tokens and Clean-up
string_as_tokens_lst = nltk.word_tokenize(file_text)
print("Total Token Count: {0}".format(len(string_as_tokens_lst)))
stopwords = nltk.corpus.stopwords.words("english")
punctuation = ["--", "'", "''", "``", "?", "!", ".", ",", ";", ")", "(", "‘", "●", ":", '“', '”', '○', "[", "]", "&", '’', "%", "*", "–", "·", "-"]

# Remove and combine possessive 's: the tokenizer emits "'s" as its own token,
# so glue it back onto the word before it ("Winston", "'s" -> "Winston's").
merged_tokens = []
for token in string_as_tokens_lst:
    if token != "'s":
        merged_tokens.append(token)
    elif merged_tokens:
        merged_tokens[-1] += token
    # else: a "'s" with no word before it is dropped instead of being attached
    # to the last token of the text (what a negative index would do).
string_as_tokens_lst = merged_tokens

# Remove stopwords, punctuation and user defined stopwords in one pass.
# Tokens are lowercased BEFORE the lookup so capitalised stopwords such as
# "The" or "And" are removed too; a set keeps each lookup constant-time.
words_to_ignore = set(stopwords) | set(punctuation) | set(user_defined_stopwords_to_remove)
string_as_tokens_lst = [
    word
    for word in (token.lower() for token in string_as_tokens_lst)
    if word not in words_to_ignore
]
print("Word Count (without words to be ignored): {0}\n".format(len(string_as_tokens_lst)))

In [None]:
# Frequency Distribution
frequency_dist = nltk.FreqDist(string_as_tokens_lst)
# Keep only the `most_freq_amount` highest counts, as a plain dict for printing and plotting
frequencyDistribution_as_dict = {
    word: count for word, count in frequency_dist.most_common(most_freq_amount)
}
print(frequencyDistribution_as_dict)
# One row per word, indexed by the word itself so it labels the bars
df = pd.DataFrame(
    frequencyDistribution_as_dict.items(), columns=["Word", "Occurence"]
).set_index("Word", drop=True)
fig = px.bar(df, y="Occurence", title=f"Top {most_freq_amount} Words")
fig.show()

In [None]:
# Collocation Distribution
def plotNGram(n_gram_amount, n_gram_finder):
    """Print and bar-plot the most common n-grams held by a collocation finder.

    n_gram_amount: label for the n-gram size, used in the chart title
        (the notebook passes "Pairs", "Triplets" and "Quadruplets").
    n_gram_finder: object exposing ``ngram_fd.most_common``; the notebook
        passes NLTK collocation finders.

    The number of n-grams shown is the notebook-level ``most_freq_amount``.
    """
    top_n_grams = n_gram_finder.ngram_fd.most_common(most_freq_amount)

    # Flatten each tuple key into a label: ('graduate', 'division') -> "graduate, division"
    counts_by_label = {", ".join(words): count for words, count in top_n_grams}

    table = pd.DataFrame(
        counts_by_label.items(), columns=["Word", "Occurence"]
    ).set_index("Word", drop=True)
    print(counts_by_label)
    chart = px.bar(table, y="Occurence", title=f"Top {most_freq_amount} Word {n_gram_amount}")
    chart.show()

In [None]:
# Bigrams: build the two-word combination finder from the cleaned tokens, then chart it
bigram_collocation_dist = nltk.collocations.BigramCollocationFinder.from_words(
    string_as_tokens_lst
)
plotNGram(n_gram_amount="Pairs", n_gram_finder=bigram_collocation_dist)

In [None]:
# Trigrams: build the three-word combination finder from the cleaned tokens, then chart it
trigram_collocation_dist = nltk.collocations.TrigramCollocationFinder.from_words(
    string_as_tokens_lst
)
plotNGram(n_gram_amount="Triplets", n_gram_finder=trigram_collocation_dist)

In [None]:
# Quadgrams: build the four-word combination finder from the cleaned tokens, then chart it
quadgram_collocation_dist = nltk.collocations.QuadgramCollocationFinder.from_words(
    string_as_tokens_lst
)
plotNGram(n_gram_amount="Quadruplets", n_gram_finder=quadgram_collocation_dist)

In [None]:
# Sentiment Analysis
def _chunk_tokens(tokens, chunk_size):
    """Join each run of ``chunk_size`` consecutive tokens into one space-separated string."""
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]


def _windowed_means(scores, window, step):
    """Return ``{start: mean(scores[start:start + window])}`` for every ``step``-th start index."""
    means = {}
    for start in range(0, len(scores), step):
        piece = scores[start:start + window]  # never empty because start < len(scores)
        means[start] = sum(piece) / len(piece)
    return means


def sentimentAnalysis(plot_title_from_file_name, file_as_tokens, size_of_sentiment_string=15, average_every_x=25):
    """Score fixed-size token chunks with VADER and plot positive/negative sentiment.

    Four matplotlib figures are produced and shown: positive scores, negative
    scores, both together, and both together with red/blue trend lines.

    plot_title_from_file_name: text placed at the start of every figure title.
    file_as_tokens: list of word tokens; it is sliced into pieces of
        ``size_of_sentiment_string`` tokens that are scored one by one.
    size_of_sentiment_string: tokens per scored piece (default 15, roughly the
        average length of a sentence).
    average_every_x: number of consecutive pieces averaged for each trend
        point; the trend lines also get about this many points (default 25).

    Returns None. Raises ValueError when either size argument is below 1.
    If ``file_as_tokens`` is empty a notice is printed and nothing is plotted.

    VADER Citation: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious
    Rule-based Model for Sentiment Analysis of Social Media Text. Eighth
    International Conference on Weblogs and Social Media (ICWSM-14).
    Ann Arbor, MI, June 2014.
    """
    if size_of_sentiment_string < 1 or average_every_x < 1:
        raise ValueError("size_of_sentiment_string and average_every_x must be at least 1")

    text_pieces = _chunk_tokens(file_as_tokens, size_of_sentiment_string)
    if not text_pieces:
        # Nothing to score: averaging below would otherwise divide by zero.
        print("Sentiment Analysis skipped: no tokens to analyze")
        return

    sentiment_analyzer = SentimentIntensityAnalyzer()  # Via VADER
    polarity_per_piece = [sentiment_analyzer.polarity_scores(piece) for piece in text_pieces]
    positive_scores = [polarity["pos"] for polarity in polarity_per_piece]
    negative_scores = [polarity["neg"] for polarity in polarity_per_piece]
    piece_numbers = list(range(len(text_pieces)))  # x positions shared by every plot

    color_plot = {"Positive": "Reds", "Negative": "Blues"}  # colors for plot (cmap)

    def start_figure(title):
        plt.figure(figsize=(12, 12), dpi=100)
        plt.title(title)

    def scatter_polarity(polarity_name, scores):
        # Each point is colored by its own score through the polarity's colormap
        plt.scatter(piece_numbers, scores, c=[score * 10 for score in scores], cmap=color_plot[polarity_name])

    def label_axes(x_label, y_label):
        plt.xticks(rotation=90)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    # Plot Sentiment Individually
    for polarity_name, scores in (("Positive", positive_scores), ("Negative", negative_scores)):
        start_figure("{0}: {1} Sentiment".format(plot_title_from_file_name, polarity_name))
        scatter_polarity(polarity_name, scores)
        label_axes("Sentence Piece", "{0} Sentiment %".format(polarity_name))

    # Plot as Group
    start_figure("{0}: Positive and Negative Sentiment".format(plot_title_from_file_name))
    scatter_polarity("Positive", positive_scores)
    scatter_polarity("Negative", negative_scores)
    label_axes("Sentence #", "Sentiment %")

    # Plot Trends: average `average_every_x` pieces at evenly spaced start points.
    # max(1, ...) keeps the range() step valid when there are fewer pieces than
    # `average_every_x` (a step of 0 raises ValueError).
    trend_step = max(1, len(text_pieces) // average_every_x)
    average_pos_dict = _windowed_means(positive_scores, average_every_x, trend_step)
    average_neg_dict = _windowed_means(negative_scores, average_every_x, trend_step)

    start_figure("{0}: Trends in Sentiment".format(plot_title_from_file_name))
    scatter_polarity("Positive", positive_scores)
    scatter_polarity("Negative", negative_scores)
    plt.plot(list(average_pos_dict.keys()), list(average_pos_dict.values()), c="red")
    plt.plot(list(average_neg_dict.keys()), list(average_neg_dict.values()), c="blue")
    label_axes("Sentence #", "Sentiment %")
    plt.show()

In [None]:
from pathlib import Path

# Title prefix for the sentiment plots: the file name without directory or final
# extension. Path.stem copes with "./book.txt" and "my.book.txt", where
# splitting on the first "." would give "" and "my".
book_title = Path(input_file).stem
sentimentAnalysis(book_title, string_as_tokens_lst)