### Imports and downloads

In [1]:
# nltk
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/michielkorpel/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michielkorpel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michielkorpel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michielkorpel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/michielkorpel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# others
import re, string, random
import pandas as pd
from os import path

### Function definitions

In [3]:
def determine_candidate(tweet_text, in_reply_to, quoted_status):
    # Find mentions and hashtags in tweet_text
    trump_mentions = re.findall('@realDonaldTrump', tweet_text, re.IGNORECASE)
    clinton_mentions = re.findall('@HillaryClinton', tweet_text, re.IGNORECASE)
    mentioned_first = re.findall('@realDonaldTrump|@HillaryClinton', tweet_text, re.IGNORECASE)
    trump_hashtags = re.findall('#maga|#trumppence16|#donaldtrump|#trump|#dumptrump|#nevertrump', tweet_text, re.IGNORECASE)
    clinton_hashtags = re.findall('#hillaryclinton|#hillary|#imwithher|#crookedhillary|#neverhillary', tweet_text, re.IGNORECASE)
    # Find mentions and hashtags in quoted_status.text
    if isinstance(quoted_status, dict):
        quoted_status_text = quoted_status['text']
        trump_mentions.extend(re.findall('@realDonaldTrump', quoted_status_text, re.IGNORECASE))
        clinton_mentions.extend(re.findall('@HillaryClinton', quoted_status_text, re.IGNORECASE))
        mentioned_first.extend(re.findall('@realDonaldTrump|@HillaryClinton', quoted_status_text, re.IGNORECASE))
        trump_hashtags.extend(re.findall('#maga|#trumppence16|#donaldtrump|#trump|#dumptrump|#nevertrump', quoted_status_text, re.IGNORECASE))
        clinton_hashtags.extend(re.findall('#hillaryclinton|#hillary|#imwithher|#crookedhillary|#neverhillary', quoted_status_text, re.IGNORECASE))
    # Find mentions in in_reply_to
    if not in_reply_to == None:
        trump_mentions.extend(re.findall('realDonaldTrump', in_reply_to, re.IGNORECASE))
        clinton_mentions.extend(re.findall('HillaryClinton', in_reply_to, re.IGNORECASE))
    # Determine candidate based on most mentions
    if len(trump_mentions) == len(clinton_mentions):
        # Equal number of mentions, check if one candidate is mentioned first
        if not mentioned_first == []:
            if mentioned_first[0] == '@realDonaldTrump':
                return 'Trump'
            else:
                return 'Clinton'
        # No mentions for either candidate, compare hashtags
        if len(trump_hashtags) == len(clinton_hashtags):
            return 'Neither'
        elif len(trump_hashtags) > len(clinton_hashtags):
            return 'Trump'
        else:
            return 'Clinton'
    elif len(trump_mentions) > len(clinton_mentions):
        return 'Trump'
    else:
        return 'Clinton'

def remove_ats_hts_urls(tweet_text):
    tweet_text = re.sub('http[s]?:[\\\\]?/[\\\\]?/(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', tweet_text)
    tweet_text = re.sub('(@[A-Za-z0-9_]+)','', tweet_text)
    tweet_text = re.sub('(#[A-Za-z0-9_]+)','', tweet_text)
    return tweet_text.strip()
    
def remove_noise(tweet_tokens, stop_words = ()):
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?:[\\\\]?/[\\\\]?/(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub('(@[A-Za-z0-9_]+)','', token)
        token = re.sub('(#[A-Za-z0-9_]+)','', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

### Training classifier on twitter_samples

In [4]:
# Load tweets ands stop words
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
stop_words = stopwords.words('english')

In [5]:
# Tokenize tweets
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [6]:
# Remove noise from tokens
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [7]:
# Get and show frequency distribution of positive words
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [8]:
# Build positive and negative datasets
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

In [12]:
# Split 70/30 into train/validate, maintaining 50/50 positive/negative for both sets
train_data = positive_dataset[:3500] + negative_dataset[:3500]
validation_data = positive_dataset[3500:] + negative_dataset[3500:]

In [34]:
print(train_data[:20])

[({'top': True, 'engage': True, 'member': True, 'community': True, 'week': True, ':)': True}, 'Positive'), ({'hey': True, 'james': True, 'odd': True, ':/': True, 'please': True, 'call': True, 'contact': True, 'centre': True, '02392441234': True, 'able': True, 'assist': True, ':)': True, 'many': True, 'thanks': True}, 'Positive'), ({'listen': True, 'last': True, 'night': True, ':)': True, 'bleed': True, 'amazing': True, 'track': True, 'scotland': True}, 'Positive'), ({'congrats': True, ':)': True}, 'Positive'), ({'yeaaaah': True, 'yippppy': True, 'accnt': True, 'verify': True, 'rqst': True, 'succeed': True, 'get': True, 'blue': True, 'tick': True, 'mark': True, 'fb': True, 'profile': True, ':)': True, '15': True, 'day': True}, 'Positive'), ({'one': True, 'irresistible': True, ':)': True}, 'Positive'), ({'like': True, 'keep': True, 'lovely': True, 'customer': True, 'wait': True, 'long': True, 'hope': True, 'enjoy': True, 'happy': True, 'friday': True, 'lwwf': True, ':)': True}, 'Positive

In [28]:
# Train classifier
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, validation_data))
print(classifier.show_most_informative_features(16))

Accuracy is: 0.9933333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2071.0 : 1.0
                      :) = True           Positi : Negati =   1005.4 : 1.0
                follower = True           Positi : Negati =     39.7 : 1.0
                followed = True           Negati : Positi =     34.3 : 1.0
                  arrive = True           Positi : Negati =     33.0 : 1.0
                    glad = True           Positi : Negati =     23.7 : 1.0
                     x15 = True           Negati : Positi =     23.7 : 1.0
                     sad = True           Negati : Positi =     19.9 : 1.0
                    sick = True           Negati : Positi =     19.7 : 1.0
               community = True           Positi : Negati =     16.3 : 1.0
                     ugh = True           Negati : Positi =     13.7 : 1.0
                    miss = True           Negati : Positi =     12.7 : 1.0
              definitely = True           

In [43]:
from nltk.metrics.scores import precision, recall, f_measure
import collections

refsets = collections.defaultdict(set)
valsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(validation_data):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    valsets[observed].add(i)

print("Positive precision is: ", precision(refsets['Positive'], valsets['Positive']))
print("Positive recall is: ", recall(refsets['Positive'], valsets['Positive']))
print("Positive f-measure is: ", f_measure(refsets['Positive'], valsets['Positive']))
print("Negative precision is: ", precision(refsets['Negative'], valsets['Negative']))
print("Negative recall is: ", recall(refsets['Negative'], valsets['Negative']))
print("Negative f-measure is: ", f_measure(refsets['Negative'], valsets['Negative']))

Positive precision is:  0.9920212765957447
Positive recall is:  0.9946666666666667
Positive f-measure is:  0.9933422103861517
Negative precision is:  0.9946524064171123
Negative recall is:  0.992
Negative f-measure is:  0.993324432576769


### Testing classifier on custom tweet

In [16]:
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
custom_tokens = remove_noise(word_tokenize(custom_tweet))
print('Tweet:\t"%s"\nClass:\t%s' % (custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens))))

Tweet:	"I ordered just once from TerribleCo, they screwed up, never used the app again."
Class:	Negative


### Classifying election tweets

In [17]:
# Load tweets into dataframe if there is no pickle available
if path.exists('./full_df.pkl'):
    df = pd.read_pickle('./full_df.pkl')
else:
    df=pd.read_json('geotagged_tweets_20160812-0912.jsons', lines=True)
    df.to_pickle('./full_df.pkl')

In [46]:
# Filter out tweets in other languages than English
df_filtered = df[df['lang']=='en']
# Select only the columns needed for analysis
df_filtered = df_filtered[['id','text','in_reply_to_screen_name','quoted_status','source','geo','coordinates','place']]
df_filtered = df_filtered.reset_index(drop=True)
# Remove mentions, hashtags and links from text
df_filtered['text_clean'] = df_filtered['text'].apply(lambda txt: remove_ats_hts_urls(txt))
df_filtered.shape

(563329, 9)

In [51]:
# Build list of classified tweets
data = []
for tweety in range(len(df_filtered['text'])):
    # Extract some column values
    txt = df_filtered['text'][tweety]
    reply_to = df_filtered['in_reply_to_screen_name'][tweety]
    quoted_status = df_filtered['quoted_status'][tweety]
    # Determine the candidate the tweet is aimed at
    candidate = determine_candidate(txt, reply_to, quoted_status)
    # Use cleaned text for tokenizing
    txt_clean = df_filtered['text_clean'][tweety]
    # Only consider tweets that contain text other than mentions, hashtags and links
    if len(txt_clean) > 0:
        tweety_c = remove_noise(word_tokenize(txt_clean))
        data.append([txt, tweety_c, classifier.classify(dict([token, True] for token in tweety_c)), candidate])

In [52]:
# Convert into dataframe and analyse
df_p_n = pd.DataFrame(data, columns =['Full tweet','Clean tweet tokens','Sentiment','Candidate'])

In [53]:
print('Dataframe shape:\n') 
df_p_n.shape

Dataframe shape:



(563254, 4)

In [72]:
print('\nDataframe head:\n')
df_p_n.head()


Dataframe head:



Unnamed: 0,Full tweet,Clean tweet tokens,Sentiment,Candidate
0,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,"[all, in, collusion, together]",Positive,Trump
1,@HillaryClinton he will do in one year all the...,"[he, will, do, in, one, year, all, the, thing,...",Negative,Clinton
2,#CNN #newday clear #Trump deliberately throwin...,"[clear, deliberately, throw, this, race, in, 2...",Positive,Trump
3,"@realDonaldTrump, you wouldn't recognize a lie...","[you, would, n't, recognize, a, lie, if, it, c...",Negative,Trump
4,"""Kid, you know, suing someone? Thats the most ...","[``, kid, you, know, sue, someone, thats, the,...",Positive,Trump


In [77]:
print(df_p_n['Full tweet'][4])
print('Tokens: ', df_p_n['Clean tweet tokens'][4])
print('Sentiment: ', df_p_n['Sentiment'][4])

"Kid, you know, suing someone? Thats the most beautiful thing 1 human being could do to another human being" @funnyordie @realDonaldTrump😂💩s
Tokens:  ['``', 'kid', 'you', 'know', 'sue', 'someone', 'thats', 'the', 'most', 'beautiful', 'thing', '1', 'human', 'be', 'could', 'do', 'to', 'another', 'human', 'be', "''", '😂💩s']
Sentiment:  Positive


In [55]:
print('\nDataframe described:\n')
df_p_n.describe


Dataframe described:



<bound method NDFrame.describe of                                                Full tweet  \
0       @BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...   
1       @HillaryClinton he will do in one year all the...   
2       #CNN #newday clear #Trump deliberately throwin...   
3       @realDonaldTrump, you wouldn't recognize a lie...   
4       "Kid, you know, suing someone? Thats the most ...   
...                                                   ...   
563249  @CNBC @SquawkAlley @realDonaldTrump Kudlow is ...   
563250  TRUMP U, TAXES ,WEIRD MEDICAL REPORT WITH A WH...   
563251  @CarolCNN if MSM were honest watch any utube v...   
563252  It's interesting that Hillary Clinton's crowds...   
563253  @TeamTrump @KellyannePolls @realDonaldTrump @f...   

                                       Clean tweet tokens Sentiment Candidate  
0                          [all, in, collusion, together]  Positive     Trump  
1       [he, will, do, in, one, year, all, the, thing,...  Negative   Cli

In [56]:
df_p_n_candidates = df_p_n[df_p_n['Candidate']!='Neither']
df_p_n_neither = df_p_n[df_p_n['Candidate']=='Neither']
print(df_p_n_candidates.shape)
print(df_p_n_neither.shape)

(550435, 4)
(12819, 4)


In [57]:
df_p_n_candidates[(df_p_n_candidates['Candidate']=='Trump')&(df_p_n_candidates['Sentiment']=='Positive')].shape

(197560, 4)

In [58]:
df_p_n_candidates[(df_p_n_candidates['Candidate']=='Clinton')&(df_p_n_candidates['Sentiment']=='Positive')].shape

(86005, 4)

In [59]:
df_p_n_candidates[(df_p_n_candidates['Candidate']=='Trump')&(df_p_n_candidates['Sentiment']=='Negative')].shape

(185487, 4)

In [60]:
df_p_n_candidates[(df_p_n_candidates['Candidate']=='Clinton')&(df_p_n_candidates['Sentiment']=='Negative')].shape

(81383, 4)

In [61]:
df_p_n_neither[(df_p_n_neither['Sentiment']=='Positive')].shape

(6825, 4)

In [62]:
df_p_n_neither[(df_p_n_neither['Sentiment']=='Negative')].shape

(5994, 4)

In [63]:
# Export dataframes to pickle for further analyses
df_p_n_candidates.to_pickle('./sentiment_analysis_w_candidate.pkl')
df_p_n_neither.to_pickle('./sentiment_analysis_neither.pkl')