In [1]:
from google.colab import files
uploaded = files.upload()

Saving Tweets.csv to Tweets.csv


In [2]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


def word_type(word):
    if word.startswith('J'):
        return wordnet.ADJ
    elif word.startswith('V'):
        return wordnet.VERB
    elif word.startswith('N'):
        return wordnet.NOUN
    elif word.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def text_lemmatizer(word):
    lemmatizer = WordNetLemmatizer().lemmatize
    word = word_tokenize(str(word))  # Init the Wordnet Lemmatizer
    word_pos = pos_tag(word)
    lemm_words = [lemmatizer(w[0], word_type(w[1])) for w in word_pos]
    return ' '.join(lemm_words)


def cleanText(tweet):
    word_list = tweet.lower().split()
    stop_list = set(stopwords.words("english"))
    important_words = [w for w in word_list if not w in stop_list]

    clean_word_list = []
    for word in important_words:
        word = re.sub('@[A-Za-z0–9]+', '', word)  # Removing @mentions
        word = re.sub('#', '', word)  # Removing '#' hash tag
        word = re.sub('RT[\s]+', '', word)  # Removing RT
        word = re.sub('https?:\/\/\S+', '', word)  # Removing hyperlink
        word = re.sub('\d+', '', word)  # remove number
        word = re.sub(r'[^a-zA-Z]', '', word)
        word = re.sub(r'\s+', ' ', word)  # remove white space
        word = text_lemmatizer(word)
        clean_word_list.append(word)

    return " ".join(clean_word_list)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
import pandas as pd
import math


# calculate the IDF values for each unique word
def idf(unique_words, bagOfWords):
    n = len(bagOfWords)
    idf_val = {}
    for word in unique_words:
        doc_containing_word = 0
        for bagOfWord in bagOfWords:
            if word in bagOfWord:
                doc_containing_word += 1
        idf_val[word] = math.log(n / doc_containing_word)
    return idf_val


# calculate the TF-IDF for all the tweets
# calculate the TF for words and multiply it with corresponding IDF values
def tf_idf(words, bagOfWords, idf_val):
    tf_idf_list = []
    for bagOfWord in bagOfWords:
        tf_idt_dict = dict.fromkeys(words, 0.0)
        size_of_document = len(bagOfWord)
        my_dict = {i:bagOfWord.count(i) for i in bagOfWord}
        for word, count in my_dict.items():
            tf_val = 0.5 + 0.5*(float(count)/size_of_document)
            tf_idt_dict[word] = tf_val * idf_val[word]
        tf_idf_list.append(tf_idt_dict)
    return tf_idf_list


# vectorize all the tweets using TF-IDF
# convert the list of TF-IDF values in data frame for further classification
def vectorizer(documents):
    # create bag of words
    bagOfWords = []
    unique = set()
    for document in documents:
        bagOfWord = document.split(' ')
        bagOfWords.append(bagOfWord)
        for word in bagOfWord:
            unique.add(word)

    idfs = idf(unique, bagOfWords)
    tf_idfs = tf_idf(idfs.keys(), bagOfWords, idfs)
    print("number of rows: " + str(len(tf_idfs)) + "\nnumer of cols: " + str(len(tf_idfs[0])))

    # convert list to data frame
    feature_names = list(unique)
    df = pd.DataFrame(tf_idfs, columns=feature_names)
    return df


In [4]:
import operator
import math
import numpy as np


def fit(X_train):
    n = len(X_train)
    sentiment_dict = {}
    for i, row in X_train.iterrows():
        if row[-1] not in sentiment_dict:
            sentiment_dict[row[-1]] = []
        sentiment_dict[row[-1]].append(row[:-1])

    prior_prob = {}
    model = {}
    for sentiment_value, rows in sentiment_dict.items():
        prior_prob[sentiment_value] = len(rows) / float(n)
        model[sentiment_value] = [(np.mean(column), np.std(column)) for column in zip(*rows)]
    return model, prior_prob


def predict(model, prior_prob, X_test):
    y_pred = []
    for index, row in X_test.iterrows():
        probabilities = {}

        for sentiment_class, class_summaries in model.items():
            probabilities[sentiment_class] = prior_prob[sentiment_class]

            for i in range(len(class_summaries)):
                mean, std_dev = class_summaries[i]
                probability = (1 / (math.sqrt(2 * math.pi) * std_dev)) * math.exp(
                    -((row[i] - mean) ** 2 / (2 * std_dev ** 2)))
                probabilities[sentiment_class] *= probability

        y_pred.append(max(probabilities.items(), key=operator.itemgetter(1))[0])
        
    return y_pred



In [5]:
import pandas as pd


# method to calculate scores for classification
def print_result(predictions, y_test):
    # calculate and print confusion matrix
    y_actual = pd.Series(predictions)
    y_expected = pd.Series(y_test)
    conf_matrix = pd.crosstab(y_expected, y_actual).values
    print("confusion matrix:")
    print(conf_matrix, "\n")

    # calculate precision for all sentiment classes
    conf_matrix = conf_matrix

    sum_rows = conf_matrix.sum(axis=0)
    sum_cols = conf_matrix.sum(axis=1)

    precision_a = round(conf_matrix[0][0] / sum_rows[0], 2)
    precision_b = round(conf_matrix[1][1] / sum_rows[1], 2)
    precision_c = round(conf_matrix[2][2] / sum_rows[2], 2)
    weighted_avg_precision = ((precision_a * sum_cols[0]) + (precision_b * sum_cols[1]) + (precision_c * sum_cols[2])) / len(y_test)

    # calculate recall for all sentiment classes
    recall_a = round(conf_matrix[0][0] / sum_cols[0], 2)
    recall_b = round(conf_matrix[1][1] / sum_cols[1], 2)
    recall_c = round(conf_matrix[2][2] / sum_cols[2], 2)
    weighted_avg_recall = ((recall_a * sum_cols[0]) + (recall_b * sum_cols[1]) + (recall_c * sum_cols[2])) / len(y_test)

    # calculate f1-score for all sentiment classes
    f_score_a = round(2 * precision_a * recall_a / (precision_a + recall_a), 2)
    f_score_b = round(2 * precision_b * recall_b / (precision_b + recall_b), 2)
    f_score_c = round(2 * precision_c * recall_c / (precision_c + recall_c), 2)
    weighted_avg_f1 = ((f_score_a * sum_cols[0]) + (f_score_b * sum_cols[1]) + (f_score_c * sum_cols[2])) / len(y_test)

    # print table containing precision, recall and f1-score for all sentiment classes
    d = {0: [precision_a, recall_a, f_score_a], 1: [precision_b, recall_b, f_score_b],
         2: [precision_c, recall_c, f_score_c], 'wt_avg': [round(weighted_avg_precision, 2), round(weighted_avg_recall, 2), round(weighted_avg_f1, 2)]}
    print("{:<10} {:<10} {:<10} {:<10}".format(' ', 'Precision', 'Recall', 'f1-score'))
    for k, v in d.items():
        p, r, f = v
        print("{:<10} {:<10} {:<10} {:<10}".format(k, p, r, f))

    # calculate and print total accuracy
    accuracy = round((conf_matrix[0][0] + conf_matrix[1][1] + conf_matrix[2][2]) / len(predictions), 2)
    print("accuracy: ", accuracy)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA



df = pd.read_csv('Tweets.csv')
df = df.loc[:, ['airline_sentiment', 'text']]
# df = df[:10000]
print(df.head(5))



  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


In [7]:
df['text'] = df['text'].apply(lambda x: cleanText(x))
df['label_nb'] = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
df.head()

Unnamed: 0,airline_sentiment,text,label_nb
0,neutral,say,1
1,positive,plus add commercial experience tacky,2
2,neutral,today must mean need take another trip,1
3,negative,really aggressive blast obnoxious entertainme...,0
4,negative,really big bad thing,0


In [8]:
vector_df = vectorizer(df['text'])

number of rows: 14640
numer of cols: 10694


In [10]:
pca = PCA(n_components=50)
principalComponents = pca.fit_transform(vector_df)
columns = ['pca_%i' % i for i in range(50)] # principalComponents.shape[1]
principalDf = pd.DataFrame(data=principalComponents, columns=columns)


In [11]:
principalDf['label_nb'] = df['label_nb']
X_train = principalDf.sample(frac=0.8, random_state=0)
X_test = principalDf.drop(X_train.index)
y_test = X_test['label_nb']
del X_test['label_nb']
print(X_train.shape, X_test.shape, y_test.shape)


(11712, 51) (2928, 50) (2928,)


In [12]:
model, prior = fit(X_train)
y_pred = predict(model, prior, X_test)
predictions = pd.DataFrame(y_pred, columns=['target'])
print_result(y_pred, y_test.tolist())

confusion matrix:
[[1241  465  110]
 [ 180  392   54]
 [ 127  122  237]] 

           Precision  Recall     f1-score  
0          0.8        0.68       0.74      
1          0.4        0.63       0.49      
2          0.59       0.49       0.54      
wt_avg     0.68       0.64       0.65      
accuracy:  0.64
