# Importing data
Importing data from tweets.csv and splitting into training and test sets. Data fields used are "airline_sentiment" and "text".

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [2]:
data = pd.read_csv('Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
x = data["text"]
y = data["airline_sentiment"]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Bag of Words
Generating a vocabulary using functions for tokenizing, stemming, and lemmatizing tweets using nltk stopwords, SnowBallStemmer, WordnetLemmatizer, and a tokenization function inspired by freeCodeCamp (Dubey, 2018).

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download("stopwords")
nltk.download('wordnet')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Espen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Espen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def extract_words(sentence, stopwords):
    words = re.sub("[^\w]", " ",  sentence).split()
    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    cleaned_text = [w.lower() for w in words if w not in stopwords and w[0] not in numbers] 
    return cleaned_text

In [7]:
def tokenize(sentences, stopwords):
    words = []
    
    for sentence in sentences:
        extracted_words = extract_words(sentence, stopwords)
        words.extend(extracted_words)
        
    words = sorted(list(set(words)))
    
    return words

In [8]:
def stem(word_list):
    ss = SnowballStemmer("english")
    stemmed = []
    for w in word_list:
        w_stemmed = ss.stem(w)
        if w_stemmed not in stemmed:
            stemmed.append(w_stemmed)
    return stemmed

In [9]:
def lemmatize(word_list):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for w in word_list:
        w_lemmatized = lemmatizer.lemmatize(w)
        if w_lemmatized not in lemmatized:
            lemmatized.append(w_lemmatized)
    return lemmatized

In [10]:
def generate_bag(sentences, stopwords):
    tokenized = tokenize(sentences, stopwords)
    stemmed = stem(tokenized)
    lemmatized = lemmatize(stemmed)
    
    return lemmatized

In [11]:
bag_of_words = generate_bag(data["text"], stopwords)

In [12]:
len(bag_of_words)

10578

# Naive Bayes
Creating the Naive Bayes Classifier based on the equations and pseudocode included in chapter 6 of *Speech and Language Processing* (Jurafsky & Martin, 2017).

In [13]:
import math

In [14]:
classes = ["positive", "neutral", "negative"]

## Prior Probability
Number of tweets labeled a particular class divided by total number of tweets

In [15]:
def calculate_prior(tweets, sentiments, classes):
    prior = dict()
    doc_count = tweets.count()
    
    for c in classes:
        class_count = 0
        for sentiment in y_train:
            if sentiment == c:
                class_count += 1
        class_prior = class_count / doc_count
        log_prior = math.log(class_prior)
        prior[c] = log_prior
    
    return prior

## Likelihood
Calculates loglikelihood[w, c] with laplace smoothing

In [16]:
def calculate_likelihood(word_count, word_sums, vocab, classes):
    likelihood = dict()
    
    for c in classes:
        likelihood[c] = {}
        for word in vocab:
            word_likelihood = (word_count[c][word] + 1) / (word_sums[c] + 1)
            likelihood[c][word] = math.log(word_likelihood)
    
    return likelihood

### Functions for Likelihood
Calculating likelihood relies on the following functions:
    <ul>
        <li>create_big_doc - Creates one long string for each class containing all tweets belonging to respective class</li>
        <li>count_words - counts the amount of times a word appears in a class</li>
        <li>sum_class_word_count - counts the number of words in a class's bigdoc</li>
    </ul>

In [17]:
def create_big_doc(tweets, sentiments, classes):
    big_doc = dict()
    for c in classes:
        big_doc[c] = ""

    for tweet, sentiment in zip(tweets, sentiments):
        big_doc[sentiment] += tweet

    return big_doc

In [18]:
def count_words(big_doc, vocab, classes):
    word_count = dict()
    for i in classes:
        word_count[i] = {}

    for c in big_doc:
        for word in vocab:
            word_count[c][word] = big_doc[c].count(word)
            
    return word_count

In [19]:
def sum_class_word_count(vocab, big_doc, classes):
    word_sums = dict()
    for c in classes:
        x = 0
        
        for word in vocab:
            x = x + big_doc[c].count(word)
        
        word_sums[c] = x
    
    
    return word_sums

## Train Naive Bayes
Function that returns:
    <ul>
        <li>prior - Dictionary of every class's prior probability</li>
        <li>likelihood - Dictionary of every word's likelihood given a class</li>
        <li>vocab - List of the training set's vocabulary</li>
    </ul>

In [20]:
def train_naive_bayes(x_train, y_train, classes):
    vocab = generate_bag(x_train, stopwords)
    
    # Calculates P(c)
    prior = calculate_prior(x_train, y_train, classes)
    
    # All tweets separated by class
    big_doc = create_big_doc(x_train, y_train, classes)
    
    # Amount of times a word appears in a class
    word_count = count_words(big_doc, vocab, classes)
    
    # Sum of count(w, c)
    word_sums = sum_class_word_count(vocab, big_doc, classes)
    
    #P(w | c)
    likelihood = calculate_likelihood(word_count, word_sums, vocab, classes)
        
    return prior, likelihood, vocab

In [21]:
prior, likelihood, vocab = train_naive_bayes(x_train, y_train, classes)

## Evaluating the Classifier
Evaluates the the classifier using the function test_naive_bayes, which returns a test score from 0 to 1. Additionally prints all tweets in the test set with a "correct/incorrect"-label.

In [22]:
def test_naive_bayes(x_test, y_test, classes, prior, likelihood): 
    
    counter = 0
    correct_count = 0
    classified = []
    for sentence in x_test:
        
        probability = dict()
        
        for c in classes:
            probability[c] = prior[c]
            for word in lemmatize(stem(extract_words(sentence, stopwords))):
                if word in likelihood[c].keys():
                    probability[c] = probability[c] + likelihood[c][word]
        max_prob = -1000000
        for class_probability in probability:
            if probability[class_probability] > max_prob:
                max_prob = probability[class_probability]
                max_prob_class = class_probability
        
        classified.append((max_prob_class, sentence))
    
    for estimated_class, actual_class in zip(classified, y_test):

        if estimated_class[0] == actual_class:
            guess = "CORRECT"
            correct_count += 1
        else:
            guess ="INCORRECT"

        print(" #####", guess, "##### \n # Tweet:", estimated_class[1], "\n # Prediction:", estimated_class[0], "\n # Actual:", actual_class, "\n")
        counter += 1
    
    test_score = correct_count / counter
    return test_score, classified
            

In [23]:
test_score, classified = test_naive_bayes(x_test, y_test, classes, prior, likelihood)

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir you're my early frontrunner for best airline! #oscars2016 
 # Prediction: positive 
 # Actual: positive 

 ##### INCORRECT ##### 
 # Tweet: @USAirways how is it that my flt to EWR was Cancelled Flightled yet flts to NYC from USAirways are still flying? 
 # Prediction: neutral 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @JetBlue what is going on with your BDL to DCA flights yesterday and today?! Why is every single one getting delayed? 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue do they have to depart from Washington, D.C.?? 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @JetBlue I can probably find some of them. Are the ticket #s on there? 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @united still waiting to hear back. My wallet was stolen from one of your planes so would appreciate a resolution here 
 # Prediction: 

 ##### INCORRECT ##### 
 # Tweet: @united - Group 2 line gets longer every week. Almost no one left for Groups 3-5 anymore. Time to make Explorer Card Group 3? 
 # Prediction: negative 
 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir it's not letting me DM you !! 
 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united trying to reach him for the number. At last they have him on standbye and gave him 1 meal voucher for a potential 2 day standbye! 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @united thanks for the reply. To clarify, the airfare is similar to your likely intended peer group. The $3 beer charge, however, is not 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united No but u cld explain how such a disorganized and inefficient company w surly and obviously unhappy employees stays in business. 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORREC

 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir flight 1614 FLL to ATL. My wife traveling with infant. Line to check baggage 200 people long due to your computer system crash 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir Pres/CEO Gary Kelly at #TheRoFo addressing 2000 of his BWI based employees. @cbsbaltimore http://t.co/OI32uq2tTZ 
 # Prediction: positive 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @united yea they been booked on 10 next avalible flights since sat 7am. And when time comes no plane 2nd day of work missed #hotelliving 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue I am heading to JFK now just on principle alone to deal w my lost &amp; damaged bag. #jetblue #jetbluesucks #jfk #badservice #fail 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @VirginAmerica Adds Pillows Instead of Lie-Flat Seats in

 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @AmericanAir can you use the aa credit platinum select world mastercard for Alaska flights?  Or just american? 
 # Prediction: negative 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @united once again your lack of customer service astounds me! You are the worst airline in the history of airlines! Train your staff! 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir employees spreading a bit of #MardiGras cheer at 7:30 this morning in Atlanta  #NFTYConvention http://t.co/ttbtY89Ilo 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir's CEO Kelly draws record crowd to @BWI_Airport Business Partnership breakfast http://t.co/CVba4olcBl http://t.co/wTa5pX70A5 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir that's the number I called -it wouldn't let me speak to an agent because of the issues with weather to

 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @USAirways is there nothing that can be done online to help? i bought these as a birthday present, just trying to be able to afford a change 
 # Prediction: negative 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir thank you for replying. Trying to figure out how to get from there from DFW 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @USAirways any way to tell our gate we are going? Stuck on tarmack, int'l flt FLT 878 going to ELM Gate F6 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @VirginAmerica - too many apologies! You r the worse airlines! Don't even respond to your cudtomers 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir me again! I was just trying to rebook fare on wanna get away pricing and it disappeared in time I refreshed. Is this normal? 
 # Prediction: negative

 ##### CORRECT ##### 
 # Tweet: @USAirways what happens if it isn't located 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue @jeff_hofmann @DeniseJTaylor @LaurieAMeacham Good one! And indeed, it's JetBlue's finest day in history! 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @USAirways FUK U US AIRWAYS WITH YO SHITTY CHICKEN SALAD SANDWICH THAT SO OVERPRICED AND U FUKING MAKE ME WAIT IN A 6 HR LAYOVER FUK U AND 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @AmericanAir the motor that usually starts the engine is broke and @CVGairport ground crew failed to do it on time, per pilot #maintenance 
 # Prediction: negative 
 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir customer service at FLL, BWI,and PIT have been terrible.no one knows where his bags are.his is on a job with no clothes &amp; gear 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT #

 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united too long for 140 characters 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @united i think he actually did not like your screen  @campilley 😃😃😃 
 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united Flight has been delayed for another hour so only have 24 mins to transit at LAX... Extremely unlikely I will make it! 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir thinks it's a good cs to get everyone through the gate for flight 4275 then we wait bc plane not ready http://t.co/rxmjWoo7Qi 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue I wish you all the best of luck :-) I'm enjoying the luxurious free!! amount of leg space rn. Thanks 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @JetBlue your agent told us TSA didn't h

 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir Do you have any sort of live chat feature? We're in the UK right now and that call would cost us alot. :( 
 # Prediction: neutral 
 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @united 

There must be 100 people waiting in line for customer service at DEN to deal with flight Cancelled Flightations. 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir too Late Flight now. Boarded, exit rows taken. Could not have gotten worse. Guy next 2 me should've bought 2 seats. #miserablemorning 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir appreciate update. Have also appreciated our pilots effort to explain to us just now. Accurate, authoritative comms is vital. 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @USAirways @garywerk poor excuse. I was towards front of plane, no sandwiches, we have to pay for 

 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir I picked the nonstop flight bc I had things to get to. Should’ve taken diff route or airline I suppose! 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @united as for volunteers to give up seats, people did! Now we sit for 25 minutes on plane waiting 4 them to add more people!  #letsgo 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir Salted or honey roasted? I vote to bring back the salted peanuts. I dread a year of the honey roasted!😖 http://t.co/RHw78ktQFO 
 # Prediction: positive 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @JetBlue thanks. I appreciate your prompt response. 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @united chase says no referral bonus. Thanks. 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @united this delay of flight UA4

 ##### CORRECT ##### 
 # Tweet: @USAirways is the @comcast of airlines #2hrs35minOnHold 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @USAirways I packed a carry on for a reason. Thanks for making me check it. I hoped for better customer service in CLE also. #nothappy 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue with 3 kids and 11 days 340 + doesn't work and it's months in advance 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir So far so good! http://t.co/16c9ex79Rk 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @USAirways @nanceebing 4 hour hold times at the moment...and counting. #disgrace 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @united sorry, I hate united. i won't get to see my family until Thursday 
 # Prediction: positive 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @Virgi


 ##### INCORRECT ##### 
 # Tweet: .@united too much info to share via tweet. Please send me your name and contact info. Happy to supply you with images and CS rep names. 
 # Prediction: neutral 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @USAirways Oh certainly. And now I have two $275 pending transactions on my bank account. Really happy that I was charged double. 
 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @USAirways then she had to get her own hotel so ur agents did nothing to help and to top it off you sent her luggage on the wrong flight!!! 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @united @UCtraveladvisor - I would have loved to respond to your website until I saw the really long form. In business the new seats are bad 
 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @JetBlue The Opal Dragon book The Dragon (ALI) has woven his murdering ways from the Philippines 

 # Prediction: positive 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir @karajusto OK we will! Thank you!!! 
 # Prediction: positive 
 # Actual: positive 

 ##### INCORRECT ##### 
 # Tweet: @united of course but they were just as helpless as everyone else . 
 # Prediction: positive 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @USAirways any direct number to dividend miles account info? Every Time I try them I get actually hung up on 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir Have a cup coffee and relax while you check out the New Deals and Promotions at Avon, twice a month at Doug @dcoadavon 
 # Prediction: positive 
 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @USAirways and if the flight is full? 
 # Prediction: negative 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @united I have proof that my situation now is due to company mistake 
 # Prediction: negative 
 # Actual: negative 

 ##


 ##### INCORRECT ##### 
 # Tweet: @AmericanAir Great seats on this aircraft! 
 # Prediction: negative 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir THANK YOU!!! 👍👍👍👍👍 
 # Prediction: positive 
 # Actual: positive 

 ##### CORRECT ##### 
 # Tweet: @united if I pay you $25 and $35 for my luggage to be delivered when I arrive. Why should I have to wait 3 additional days for its delivery? 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir If you could get me on the 12:15 flight! My 10:10 has been delayed until 1:05. 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir this is ridiculous. It's been 2 hours on hold to rebook Cancelled Flightled flight. 
 # Prediction: negative 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir Had a very unpleasant experience over the phone with one of your agents re: a Cancelled Flightled flight. I have the name &amp; agent ID. 
 # 

 ##### CORRECT ##### 
 # Tweet: @united your announcement for pre boarding only addresses mobility. My disability requires me to travel with a lot of stuff. Do I preboard? 
 # Prediction: neutral 
 # Actual: neutral 

 ##### CORRECT ##### 
 # Tweet: @AmericanAir that all AA is for USAir Elite members. It's one big disappointment in the way we are treated. #epicfailure 
 # Prediction: negative 
 # Actual: negative 

 ##### CORRECT ##### 
 # Tweet: @SouthwestAir I WANNAA GO TO THE VEGAS SHOW SO BAD ID DOO ANYTHING #DestinationDragons 
 # Prediction: neutral 
 # Actual: neutral 

 ##### INCORRECT ##### 
 # Tweet: @united service so far has been horrid. We wanted to end the trip on a high note. Guess that's not an option. 
 # Prediction: positive 
 # Actual: negative 

 ##### INCORRECT ##### 
 # Tweet: @SouthwestAir it keeps saying that mobile boarding passes are unavailable despite having checked in and everything 
 # Prediction: neutral 
 # Actual: negative 

 ##### CORRECT ##### 
 # Twe

In [24]:
# Accuracy score using test score from test_naive_bayes
print("Test score: {:.2f}".format(test_score))

Test score: 0.67


In [25]:
# Accuracy score using sklearns accuracy_score
from sklearn.metrics import accuracy_score
print("Test score: {:.2f}".format(accuracy_score(y_test, [x[0] for x in classified])))

Test score: 0.67


# Command Line Utility
Takes a tweet as user input and computes the sentiment

In [26]:
def prompt_tweet(prior, likelihood, classes):
    user_tweet = str(input(">Type in your tweet: "))
    probability = dict()
    
    for c in classes:
        probability[c] = prior[c]
        for word in lemmatize(stem(extract_words(user_tweet, stopwords))):
            if word in likelihood[c].keys():
                    probability[c] = probability[c] + likelihood[c][word]
        

    max_prob = -1000000
    for class_probability in probability:
        if probability[class_probability] > max_prob:
            max_prob = probability[class_probability]
            max_prob_class = class_probability
        
    classified = (max_prob_class, user_tweet)
    print("Tweet:", classified[1], "\n Class:", classified[0])
    

In [27]:
prompt_tweet(prior, likelihood, classes)

>Type in your tweet: @united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated.
Tweet: @united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated. 
 Class: negative


# Explanation Generator
A function that takes a tweet and its classification as user input and returns the words that have contributed in classifying the tweet

In [28]:
def generate_explanation(classes, likelihood):
    tweet = input(">Type in your tweet: ")
    predicted_class = input(">Type in predicted class {}: ".format(classes))
    while predicted_class not in classes:
        predicted_class = input("Please enter a valid class {}".format(classes))
    split_words = extract_words(tweet, stopwords)
    class_words = []
    word_weights = dict()
    original_words = dict()
    for c in classes:
        word_weights[c] = {}
    
    i = 0
    for word in lemmatize(stem(split_words)):
        
        original_words[word] = split_words[i]
        i += 1
        for c in classes:
            if word in likelihood[c].keys():
                word_weights[c][word] = likelihood[c][word]
                
    for word in word_weights[predicted_class].keys():
        max_score = word_weights[predicted_class][word]
        
        for c in classes: 
            if word_weights[c][word] > max_score:
                max_score = word_weights[c][word]
        
        if max_score == word_weights[predicted_class][word]: 
            class_words.append((word, max_score))
    
    n = len(class_words)
                
    print("\n TWEET: \n '" + tweet + "' \n \n has been classified as", predicted_class.upper(), "\n \
because the following", n, "words are likely to appear in a", predicted_class, "tweet:")
    for item in class_words:
        print("        - " + original_words[item[0]])
    

In [29]:
generate_explanation(classes, likelihood)

>Type in your tweet: @united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated.
>Type in predicted class ['positive', 'neutral', 'negative']: negative

 TWEET: 
 '@united not 100% sure, however my ticket included one checked bag, therefore this charge was extra and completely unanticipated.' 
 
 has been classified as NEGATIVE 
 because the following 5 words are likely to appear in a negative tweet:
        - one
        - bag
        - charge
        - extra
        - completely


# Results
## Correctly predicted
The tweet "@SouthwestAir Already signed up!  Thanks!  Looking forward to trying the Southwest experience." was correctly predicted as positive, because of words such as "already", "thanks", "looking", "forward", and "experience".

"@AmericanAir @yvonneokaka When do I get my personal response and apology for your crew's having forgotten to load baggage onto my flight?" was correctly predicted as negative, since it included words such as "personal", "apology", "load", and "baggage" which commonly appear in negative tweets.

## Incorrectly predicted

The tweet "@USAirways Will do :)" is wrongly classified as negative when it actually should be classified as positive instead. This error can be a result of several factors, one being that it is too short to properly benefit from the likelihood of words, making the prediction too much based upon the prior probability, which is biased towards the class "negative" because the amount of negative tweets in the training set is significantly larger than the other classes. In addition, the smiley face is ignored, which for humans is usually a sign of positivity.

"@AmericanAir thanks.  I actually made it, my connection flight was delayed.  Guess all delays are not a bad thing." was classified as negative, when in fact its sentiment is positive. This error occured because of the presence of words such as "delayed", "delays", and "bad", which are words that usually indicate that the tweet likely is negative.


# References
Dubey, P. (2018, 18. December). An introduction to Bag of Words and how to code it in Python for NLP. *freeCodeCamp*. Retrieved from https://www.freecodecamp.org/

Jurafsky, D., & Martin, J. H. (2017). Speech and language processing. Upper Saddle River, NJ: Prentice Hall.