In [4]:
# -*- coding: utf-8 -*-

# author: Danielle Dolan-Sanchez

#   This code was written to clean Twitter data and run a simple dictionary-based affect analysis as part of my Capstone Project at UA iSchool. The 
#   MMIWG lexicon used in this analysis was created by me, PhD student Jennifer Byram, and Dr. Lindsay Montgomery of the School of Anthropology.
# 
# The code is derived from and inspired by several sources on cleaning Tweets and doing affect analysis. 
# Here are some links for sources and inspiration: 
# https://machinelearningmastery.com/clean-text-machine-learning-python/
# https://www.geeksforgeeks.org/python-count-occurrences-of-each-word-in-given-text-file-using-dictionary/
# https://www.youtube.com/watch?v=dyN_WtjdfpA&list=PLhTjy8cBISEoOtB5_nwykvB9wfEDscuEo&index=1
# https://pythonexamples.org/python-count-occurrences-of-word-in-text-file/
# https://github.com/darlastill/ResBaz-UA-Python-Twitter
# https://github.com/DocNow/twarc
# https://github.com/nealcaren/osscabd_2018
# http://www.nltk.org/book/
# https://methodi.ca/recipes/dictionary-based-sentiment-analysis-python

In [5]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the affect lexicon CSV into a pandas DataFrame.
# NOTE(review): 'filepath/' looks like a placeholder directory -- point it at the
# real location of affectLexicon.csv before running.
affect_lexicon_df = pd.read_csv('filepath/affectLexicon.csv')

In [None]:
# Display 10 randomly chosen rows as a quick check that the CSV was read correctly.
# No random_state is passed, so the rows shown may differ from run to run.
affect_lexicon_df.sample(10)

In [None]:
# Select the lexicon column whose words will be counted; replace 'Trust' with another
# column name to count a different affect.
# Despite the name, this holds the column's underlying array (via .values), not a Python list.
# NOTE(review): if the lexicon columns have different lengths, the shorter ones presumably
# contain NaN padding -- confirm that downstream counting tolerates it.
affect_words_list = affect_lexicon_df['Trust'].values

In [None]:
# load tweets text file as a single UTF-8 string;
# the with-block closes the file handle as soon as the contents have been read
with open("filepath/tweets.txt", encoding="utf-8") as tweets_file:
    text = tweets_file.read()

In [None]:
# lowercase the text, strip punctuation, and split it into words
def text_to_words(text):
    '''Lowercase a string, delete every ASCII punctuation character,
    and return the remaining whitespace-separated tokens as a list.'''
    # characters to delete; anything outside this set (e.g. curly quotes) is kept
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    strip_table = str.maketrans('', '', punctuation)

    lowered = text.lower()
    return lowered.translate(strip_table).split()

In [None]:
# tokenize the tweets text and keep the resulting list of words in a variable
tweet_words = text_to_words(text)

# preview only the first 20 tokens; displaying the whole list floods the notebook output
tweet_words[:20]

In [None]:
# count occurences of words from the lexicon within the tweets text
# NOTE(review): no definition of count_occurences appears in the cells shown here, so on a
# fresh kernel the call below would raise NameError. The definition added here follows the
# dictionary-counting recipe linked in the header -- confirm it matches the original analysis.
def count_occurences(text, word_list):
    '''Count how many tokens in a text string appear in a list of words.

    text: raw string; it is tokenized with text_to_words.
    word_list: iterable of lexicon entries; non-string entries (such as NaN) are ignored.

    Returns the number of tokens, repeats included, that match a lexicon entry.
    '''
    # a set gives constant-time membership checks for each token
    lexicon = {word for word in word_list if isinstance(word, str)}
    return sum(1 for token in text_to_words(text) if token in lexicon)

count_occurences(text, affect_words_list)

In [None]:
# create a bar graph to visualize data
# hard-coded number of occurences per affect (key = affect name, value = count)
data = {'Anger':926, 'Anticipation/Expectation':763, 'Disgust':569,  
        'Fear':2012, 'Joy':944, 'Sadness':1819, 'Surprise':592, 'Trust':437} 
affects = list(data.keys()) 
values = list(data.values()) 

# creating the bar plot with custom adjustments for color and bar width
fig, ax1 = plt.subplots()
ax1.bar(affects, values, color='coral', width=0.4)
# rotate and right-align the x tick labels so the long affect names do not overlap
fig.autofmt_xdate()

# label the axes and add the title BEFORE saving, so they appear in the saved image
ax1.set_xlabel("Affects")
ax1.set_ylabel("No. of occurences")
ax1.set_title("Affects in December 2018 Tweets")

# save a copy of the finished figure, then show the graph
fig.savefig('dec2018.png')
plt.show()