In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

data = pd.read_csv('Tweets.csv')

[nltk_data] Downloading package stopwords to C:\Users\Einar
[nltk_data]     Haaland\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Einar
[nltk_data]     Haaland\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Einar
[nltk_data]     Haaland\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#drop duplicates and irrelevant columns
data = data.drop_duplicates()
data = data.drop(['airline_sentiment_gold', 'negativereason_gold'], axis=1)

In [3]:
#remove stopwords
stop = stopwords.words('english')
data['text'].replace(to_replace='I', value="",regex=True)
for i in stop :
    data['text'] = data['text'].replace(to_replace=r'\b%s\b'%i, value="",regex=True)

In [4]:
#remove everything but a-z
data['text'] = data['text'].str.replace("[^A-Za-z ]", '')

In [5]:
#make every tweet into list of words
data['text'] = data.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [6]:
#stem all words
sno = nltk.SnowballStemmer("english")
data['text'] = [[sno.stem(word) for word in tweet] for tweet in data['text']]

In [7]:
#split into train and test
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(data['text'], data['airline_sentiment'], test_size = 0.2)

In [8]:
#make vocab
vocab = {}
for tweet in xTrain:
    for word in tweet:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

In [9]:
#group training set by sentiment class
grouped = xTrain.groupby(yTrain)

#prior probability for each sentiment class
pos_prob = (len(grouped.get_group('positive'))) / len(yTrain)
neg_prob = (len(grouped.get_group('negative'))) / len(yTrain)
neu_prob = (len(grouped.get_group('neutral'))) / len(yTrain)

In [10]:
#make dict for total number of words in each sentiment class
total_words_in_sentiment = {'positive' : 0, 'negative' : 0, 'neutral' : 0}
for sentiment in total_words_in_sentiment.keys():
    for tweet in grouped.get_group(sentiment):
            for word in tweet:
                total_words_in_sentiment[sentiment] += 1

def get_likelihood(word, sentiment):
    count = 0
    for tweet in grouped.get_group(sentiment):
        for index in tweet:
            if index == word:
                count += 1
    return count / total_words_in_sentiment[sentiment]

In [11]:
#this function can also be used as a command line utility when given tweet in format ['This', 'is', 'a', 'tweet']
#note that words in given tweet will not be stemmed or cleaned in any other way, this might affect the result
def predict_tweet(tweet):
    #set each to 1 to avoid affecting multiplying
    positive = 1
    negative = 1
    neutral = 1
    for word in tweet:
        if word in vocab:
            positive *= get_likelihood(word, 'positive')
            negative *= get_likelihood(word, 'negative')
            neutral *= get_likelihood(word, 'neutral')
    positive *= pos_prob
    negative *= neg_prob
    neutral *= neu_prob
    if max(positive, negative, neutral) == positive:
        return 'positive'
    elif max(positive, negative, neutral) == negative:
        return 'negative'
    else:
        return 'neutral'

def predict(set_of_tweets):
    result = []
    for tweet in set_of_tweets:
        result.append(predict_tweet(tweet))
    return result

In [12]:
#run prediction on test-set. process takes several minutes..
yPred = predict(xTest)

In [13]:
#check accuracy on test set
from sklearn.metrics import accuracy_score
print(accuracy_score(yTest, yPred))
print("error rate:", 1.0 - accuracy_score(yTest, yPred))

0.7226977062649778
error rate: 0.27730229373502224


In [14]:
def explanation_generator(tweet):
    prediction = predict_tweet(tweet)
    for word in tweet:
        if word in vocab:
            pos = get_likelihood(word, 'positive') 
            neg = get_likelihood(word, 'negative')
            neu = get_likelihood(word, 'neutral')
            if max(pos, neg, neu) == pos:
                print(word, ': mostly used in positive tweets')
            elif max(pos, neg, neu) == neg:
                print(word, ': mostly used in negative tweets')
            else:
                print(word, ': mostly used in neutral tweets')
        else:
            print(word, ': of unknown sentiment')
    print('all of this classifies the tweet as:', prediction)

In [15]:
incorrect_pred = []
correct_pred = []

In [16]:
#fill incorrect_pred and correct_pred with two tweets each
for i in range(len(yTest)):
    if len(incorrect_pred) >= 2 and len(correct_pred) >= 2:
        break
    if i in yTest.index:
        if yTest[i] == yPred[i] and len(correct_pred) < 2:
            correct_pred.append(xTest[i])
        elif yTest[i] != yPred[i] and len(incorrect_pred) < 2:
            incorrect_pred.append(xTest[i])

In [17]:
#explain first incorrectly predicted tweet
explanation_generator(incorrect_pred[0])

virginamerica : mostly used in positive tweets
yes : mostly used in positive tweets
near : mostly used in negative tweets
everi : mostly used in negative tweets
time : mostly used in negative tweets
i : mostly used in neutral tweets
fli : mostly used in positive tweets
vx : mostly used in neutral tweets
ear : mostly used in negative tweets
worm : of unknown sentiment
go : mostly used in neutral tweets
away : mostly used in negative tweets
all of this classifies the tweet as: negative


In [18]:
#explain first incorrectly predicted tweet
explanation_generator(incorrect_pred[1])

virginamerica : mostly used in positive tweets
i : mostly used in neutral tweets
lt : mostly used in positive tweets
pretti : mostly used in positive tweets
graphic : mostly used in positive tweets
much : mostly used in positive tweets
better : mostly used in positive tweets
minim : of unknown sentiment
iconographi : of unknown sentiment
d : mostly used in positive tweets
all of this classifies the tweet as: positive


In [19]:
#explain first correctly predicted tweet
explanation_generator(correct_pred[0])

virginamerica : mostly used in positive tweets
sfopdx : of unknown sentiment
schedul : mostly used in neutral tweets
still : mostly used in negative tweets
mia : mostly used in negative tweets
all of this classifies the tweet as: negative


In [20]:
#explain first correctly predicted tweet
explanation_generator(incorrect_pred[1])

virginamerica : mostly used in positive tweets
i : mostly used in neutral tweets
lt : mostly used in positive tweets
pretti : mostly used in positive tweets
graphic : mostly used in positive tweets
much : mostly used in positive tweets
better : mostly used in positive tweets
minim : of unknown sentiment
iconographi : of unknown sentiment
d : mostly used in positive tweets
all of this classifies the tweet as: positive
