# Importing data
Importing data from tweets.csv and splitting into training and test sets. Data fields used are "airline_sentiment" and "text".

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [2]:
data = pd.read_csv('Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
x = data["text"]
y = data["airline_sentiment"]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Bag of Words
Generating a vocabulary using functions for tokenizing, stemming, and lemmatizing tweets using nltk stopwords, SnowBallStemmer, WordnetLemmatizer, and a tokenization function inspired by freeCodeCamp (Dubey, 2018).

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download("stopwords")
nltk.download('wordnet')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Espen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Espen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def extract_words(sentence, stopwords):
    words = re.sub("[^\w]", " ",  sentence).split()
    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    cleaned_text = [w.lower() for w in words if w not in stopwords and w[0] not in numbers] 
    return cleaned_text

In [7]:
def tokenize(sentences, stopwords):
    words = []
    
    for sentence in sentences:
        extracted_words = extract_words(sentence, stopwords)
        words.extend(extracted_words)
        
    words = sorted(list(set(words)))
    
    return words

In [8]:
def stem(word_list):
    ss = SnowballStemmer("english")
    stemmed = []
    for w in word_list:
        w_stemmed = ss.stem(w)
        if w_stemmed not in stemmed:
            stemmed.append(w_stemmed)
    return stemmed

In [9]:
def lemmatize(word_list):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for w in word_list:
        w_lemmatized = lemmatizer.lemmatize(w)
        if w_lemmatized not in lemmatized:
            lemmatized.append(w_lemmatized)
    return lemmatized

In [10]:
def generate_bag(sentences, stopwords):
    tokenized = tokenize(sentences, stopwords)
    stemmed = stem(tokenized)
    lemmatized = lemmatize(stemmed)
    
    return lemmatized

In [11]:
bag_of_words = generate_bag(data["text"], stopwords)

In [12]:
len(bag_of_words)

10578

# Naive Bayes
Creating the Naive Bayes Classifier based on the equations and pseudocode included in chapter 6 of *Speech and Language Processing* (Jurafsky & Martin, 2017).

In [13]:
import math

In [14]:
classes = ["positive", "neutral", "negative"]

## Prior Probability
Number of tweets labeled a particular class divided by total number of tweets

In [15]:
def calculate_prior(tweets, sentiments, classes):
    prior = dict()
    doc_count = tweets.count()
    
    for c in classes:
        class_count = 0
        for sentiment in y_train:
            if sentiment == c:
                class_count += 1
        class_prior = class_count / doc_count
        log_prior = math.log(class_prior)
        prior[c] = log_prior
    
    return prior

## Likelihood
Calculates loglikelihood[w, c] with laplace smoothing

In [16]:
def calculate_likelihood(word_count, word_sums, vocab, classes):
    likelihood = dict()
    
    for c in classes:
        likelihood[c] = {}
        for word in vocab:
            word_likelihood = (word_count[c][word] + 1) / (word_sums[c] + 1)
            likelihood[c][word] = math.log(word_likelihood)
    
    return likelihood

### Functions for Likelihood
Calculating likelihood relies on the following functions:
    <ul>
        <li>create_big_doc - Creates one long string for each class containing all tweets belonging to respective class</li>
        <li>count_words - counts the amount of times a word appears in a class</li>
        <li>sum_class_word_count - counts the number of words in a class's bigdoc</li>
    </ul>

In [17]:
def create_big_doc(tweets, sentiments, classes):
    big_doc = dict()
    for c in classes:
        big_doc[c] = ""

    for tweet, sentiment in zip(tweets, sentiments):
        big_doc[sentiment] += tweet

    return big_doc

In [18]:
def count_words(big_doc, vocab, classes):
    word_count = dict()
    for i in classes:
        word_count[i] = {}

    for c in big_doc:
        for word in vocab:
            word_count[c][word] = big_doc[c].count(word)
            
    return word_count

In [19]:
def sum_class_word_count(vocab, big_doc, classes):
    word_sums = dict()
    for c in classes:
        x = 0
        
        for word in vocab:
            x = x + big_doc[c].count(word)
        
        word_sums[c] = x
    
    
    return word_sums

## Train Naive Bayes
Function that returns:
    <ul>
        <li>prior - Dictionary of every class's prior probability</li>
        <li>likelihood - Dictionary of every word's likelihood given a class</li>
        <li>vocab - List of the training set's vocabulary</li>
    </ul>

In [20]:
def train_naive_bayes(x_train, y_train, classes):
    vocab = generate_bag(x_train, stopwords)
    
    # Calculates P(c)
    prior = calculate_prior(x_train, y_train, classes)
    
    # All tweets separated by class
    big_doc = create_big_doc(x_train, y_train, classes)
    
    # Amount of times a word appears in a class
    word_count = count_words(big_doc, vocab, classes)
    
    # Sum of count(w, c)
    word_sums = sum_class_word_count(vocab, big_doc, classes)
    
    #P(w | c)
    likelihood = calculate_likelihood(word_count, word_sums, vocab, classes)
        
    return prior, likelihood, vocab

In [21]:
prior, likelihood, vocab = train_naive_bayes(x_train, y_train, classes)

## Evaluating the Classifier
Evaluates the the classifier using the function test_naive_bayes, which returns a test score from 0 to 1. Additionally prints all tweets in the test set with a "correct/incorrect"-label.

In [22]:
def test_naive_bayes(x_test, y_test, classes, prior, likelihood): 
    
    counter = 0
    correct_count = 0
    classified = []
    for sentence in x_test:
        
        probability = dict()
        
        for c in classes:
            probability[c] = prior[c]
            for word in lemmatize(stem(extract_words(sentence, stopwords))):
                if word in likelihood[c].keys():
                    probability[c] = probability[c] + likelihood[c][word]
        max_prob = -1000000
        for class_probability in probability:
            if probability[class_probability] > max_prob:
                max_prob = probability[class_probability]
                max_prob_class = class_probability
        
        classified.append((max_prob_class, sentence))
    
    for estimated_class, actual_class in zip(classified, y_test):

        if estimated_class[0] == actual_class:
            guess = "CORRECT"
            correct_count += 1
        #else:
         #   guess ="INCORRECT"

        #print(" #####", guess, "##### \n # Tweet:", estimated_class[1], "\n # Prediction:", estimated_class[0], "\n # Actual:", actual_class, "\n")
        counter += 1
    
    test_score = correct_count / counter
    return test_score, classified
            

In [23]:
# Function for printing out tweets witht their predicted class and actual class.

def print_correct_incorrect(classified_tweets, tweets_sentiment, print_start, print_num):
    
    for estimated_class, actual_class in zip(classified_tweets[print_start:print_num], tweets_sentiment[print_start:print_num]):

        if estimated_class[0] == actual_class:
            guess = "CORRECT"

        else:
             guess ="INCORRECT"

        print(" #####", guess, "##### \n # Tweet:", estimated_class[1], "\n # Prediction:", estimated_class[0], "\n # Actual:", actual_class, "\n")

In [24]:
test_score, classified = test_naive_bayes(x_test, y_test, classes, prior, likelihood)

In [25]:
print_correct_incorrect(classified, y_test, 21, 26)

 ##### CORRECT ##### 
 # Tweet: @JetBlue No, he didn't have more info. I was more infuriated by the way previous rep treated me. how she can possibly work as a JetBlue rep? 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united is a money sucking airline with terrible terrible customer service 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @AmericanAir and btwn gate a8 &amp; a15 I lost a diamond earring #dayjustgotWORSE! Pls have maintenance look for it!! http://t.co/UieSR3GHHO 
 # Prediction: positive 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @USAirways #ShoutOut 2 Kristie(sp?) from Gate4 @ PVD today. She's a #RockStar, was a tremendous help in a tough situation. #PromoteThatGirl 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @USAirways My family, friends and colleagues will NEVER  fly USAir again. Bad weather happens. The good airlines seem to communicate better. 
 # Predi

In [26]:
# Accuracy score using test score from test_naive_bayes
print("Test score: {:.2f}".format(test_score))

Test score: 0.67


In [27]:
# Accuracy score using sklearns accuracy_score
from sklearn.metrics import accuracy_score
print("Test score: {:.2f}".format(accuracy_score(y_test, [x[0] for x in classified])))

Test score: 0.67


# Command Line Utility
Takes a tweet as user input and computes the sentiment

In [31]:
def prompt_tweet(prior, likelihood, classes):
    user_tweet = str(input(">Type in your tweet: "))
    probability = dict()
    
    for c in classes:
        probability[c] = prior[c]
        for word in lemmatize(stem(extract_words(user_tweet, stopwords))):
            if word in likelihood[c].keys():
                    probability[c] = probability[c] + likelihood[c][word]
        

    max_prob = -1000000
    for class_probability in probability:
        if probability[class_probability] > max_prob:
            max_prob = probability[class_probability]
            max_prob_class = class_probability
        
    classified = (max_prob_class, user_tweet)
    print("Class:", classified[0])
    

In [32]:
prompt_tweet(prior, likelihood, classes)

>Type in your tweet: @JetBlue No, he didn't have more info. I was more infuriated by the way previous rep treated me. how she can possibly work as a JetBlue rep?
Class: negative


# Explanation Generator
A function that takes a tweet and its classification as user input and returns the words that have contributed in classifying the tweet

In [33]:
def generate_explanation(classes, likelihood):
    tweet = input(">Type in your tweet: ")
    predicted_class = input(">Type in predicted class {}: ".format(classes))
    while predicted_class not in classes:
        predicted_class = input("Please enter a valid class {}".format(classes))
    split_words = extract_words(tweet, stopwords)
    class_words = []
    word_weights = dict()
    original_words = dict()
    for c in classes:
        word_weights[c] = {}
    
    i = 0
    for word in lemmatize(stem(split_words)):
        
        original_words[word] = split_words[i]
        i += 1
        for c in classes:
            if word in likelihood[c].keys():
                word_weights[c][word] = likelihood[c][word]
                
    for word in word_weights[predicted_class].keys():
        max_score = word_weights[predicted_class][word]
        
        for c in classes: 
            if word_weights[c][word] > max_score:
                max_score = word_weights[c][word]
        
        if max_score == word_weights[predicted_class][word]: 
            class_words.append((word, max_score))
    
    n = len(class_words)
                
    print("\n TWEET: \n '" + tweet + "' \n \n has been classified as", predicted_class.upper(), "\n \
because the following", n, "words are likely to appear in a", predicted_class, "tweet:")
    for item in class_words:
        print("        - " + original_words[item[0]])
    

In [34]:
generate_explanation(classes, likelihood)

>Type in your tweet: @united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated.
>Type in predicted class ['positive', 'neutral', 'negative']: negative

 TWEET: 
 '@united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated.' 
 
 has been classified as NEGATIVE 
 because the following 5 words are likely to appear in a negative tweet:
        - one
        - bag
        - charge
        - extra
        - completely


# Results
## Correctly predicted
The tweet "@SouthwestAir Already signed up!  Thanks!  Looking forward to trying the Southwest experience." was correctly predicted as positive, because of words such as "already", "thanks", "looking", "forward", and "experience".

"@AmericanAir @yvonneokaka When do I get my personal response and apology for your crew's having forgotten to load baggage onto my flight?" was correctly predicted as negative, since it included words such as "personal", "apology", "load", and "baggage" which commonly appear in negative tweets.

## Incorrectly predicted

The tweet "@USAirways Will do :)" is wrongly classified as negative when it actually should be classified as positive instead. This error can be a result of several factors, one being that it is too short to properly benefit from the likelihood of words, making the prediction too much based upon the prior probability, which is biased towards the class "negative" because the amount of negative tweets in the training set is significantly larger than the other classes. In addition, the smiley face is ignored, which for humans is usually a sign of positivity.

"@AmericanAir thanks.  I actually made it, my connection flight was delayed.  Guess all delays are not a bad thing." was classified as negative, when in fact its sentiment is positive. This error occured because of the presence of words such as "delayed", "delays", and "bad", which are words that usually indicate that the tweet likely is negative.


# References
Dubey, P. (2018, 18. December). An introduction to Bag of Words and how to code it in Python for NLP. *freeCodeCamp*. Retrieved from https://www.freecodecamp.org/

Jurafsky, D., & Martin, J. H. (2017). Speech and language processing. Upper Saddle River, NJ: Prentice Hall.