# Traditional Models

In [1]:
import pickle

def load_tweets(path):
    with open(path, "rb") as f:
        tweets = pickle.load(f)
    return tweets

labels = load_tweets(path = "data/Sentiment_Labels.pkl")

In [2]:
clean_tweets = load_tweets(path = "data/clean_text.pkl")
clean_data =  process_tweets(clean_tweets, labels)
y_train, y_test = clean_data['labels']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

def process_tweets(tweets, labels, k = 20000, ngrams = (1, 2)):
    train, test, y_train, y_test = train_test_split(tweets, labels, train_size=0.8, test_size=0.2, random_state=42)
    
    vectorizer = CountVectorizer(ngram_range = ngrams, stop_words='english')
    train_bow = vectorizer.fit_transform(train)
    test_bow = vectorizer.transform(test)
    
    selector = SelectKBest(f_classif, k).fit(train_bow, y_train)
    train_topK_bow = selector.transform(train_bow).astype('float32')
    test_topK_bow = selector.transform(test_bow).astype('float32')
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range = ngrams, stop_words='english')
    train_tfidf = tfidf_vectorizer.fit_transform(train)
    test_tfidf = tfidf_vectorizer.transform(test)
    
    selector_tfidf = SelectKBest(f_classif, k).fit(train_tfidf, y_train)
    train_topK_tfidf = selector_tfidf.transform(train_tfidf).astype('float32')
    test_topK_tfidf = selector.transform(test_tfidf).astype('float32')
    
    data = {}
    data['bow'] = train_bow, test_bow, train_topK_bow, test_topK_bow
    data['tfidf'] = train_tfidf, test_tfidf, train_topK_tfidf, test_topK_tfidf
    data['labels'] = y_train, y_test
    
    
    return data

In [4]:
train_bow, test_bow, train_topK_bow, test_topK_bow = clean_data['bow']
train_tfidf, test_tfidf, train_topK_tfidf, test_topK_tfidf = clean_data['tfidf']

print(f"BoW Train Features shape: {train_bow.shape}, BoW Test Features shape: {test_bow.shape} " )
print(f"TF-IDF Train Features shape: {train_tfidf.shape}, TF-IDF Test Features shape: {test_tfidf.shape} " )

BoW Train Features shape: (34753, 178008), BoW Test Features shape: (8689, 178008) 
TF-IDF Train Features shape: (34753, 178008), TF-IDF Test Features shape: (8689, 178008) 


# Logistic Regression

In [5]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
 
def fit_and_metrics(classifier, train, test):
    clf = classifier.fit(train, y_train)
    y_predicted = clf.predict(test)
    
    
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='multinomial', max_iter=500)

In [7]:
accuracy_bow_lr, precision_bow_lr, recall_bow_lr, f1_bow_lr = fit_and_metrics(lr, train_bow, test_bow)

accuracy_k_bow_lr, precision_k_bow_lr, recall_k_bow_lr, f1_k_bow_lr = fit_and_metrics(lr, train_topK_bow,
                                                                                      test_topK_bow)
print("BoW Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_bow_lr, 
                                                                                  precision_bow_lr,
                                                                                  recall_bow_lr, 
                                                                                  f1_bow_lr))

print("Top-K BoW: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_bow_lr, 
                                                                                  precision_k_bow_lr,
                                                                                  recall_k_bow_lr, 
                                                                                  f1_k_bow_lr))

BoW Model: accuracy = 0.844, precision = 0.846, recall = 0.844, f1 = 0.843
Top-K BoW: accuracy = 0.852, precision = 0.851, recall = 0.852, f1 = 0.851


In [8]:
accuracy_tfidf_lr, precision_tfidf_lr, recall_tfidf_lr, f1_tfidf_lr = fit_and_metrics(lr, train_tfidf, test_tfidf)

accuracy_k_tfidf_lr, precision_k_tfidf_lr, recall_k_tfidf_lr, f1_k_tfidf_lr = fit_and_metrics(lr, train_topK_tfidf,
                                                                                      test_topK_tfidf)

print("TF-IDF Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf_lr, 
                                                                                  precision_tfidf_lr,
                                                                                  recall_tfidf_lr, 
                                                                                  f1_tfidf_lr))

print("Top-K TF-IDF: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_tfidf_lr, 
                                                                                  precision_k_tfidf_lr,
                                                                                  recall_k_tfidf_lr, 
                                                                                  f1_k_tfidf_lr))

TF-IDF Model: accuracy = 0.813, precision = 0.810, recall = 0.813, f1 = 0.807
Top-K TF-IDF: accuracy = 0.300, precision = 0.421, recall = 0.300, f1 = 0.304


# SVM

In [9]:
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier()

In [10]:
accuracy_bow_svm, precision_bow_svm, recall_bow_svm, f1_bow_svm = fit_and_metrics(svm, train_bow, test_bow)

accuracy_k_bow_svm, precision_k_bow_svm, recall_k_bow_svm, f1_k_bow_svm = fit_and_metrics(svm, train_topK_bow,
                                                                                      test_topK_bow)

print("BoW Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_bow_svm, 
                                                                                  precision_bow_svm, 
                                                                                  recall_bow_svm, 
                                                                                  f1_bow_svm))

print("Top-K BoW: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_bow_svm, 
                                                                                  precision_k_bow_svm,
                                                                                  recall_k_bow_svm, 
                                                                                  f1_k_bow_svm))

BoW Model: accuracy = 0.848, precision = 0.852, recall = 0.848, f1 = 0.848
Top-K BoW: accuracy = 0.863, precision = 0.865, recall = 0.863, f1 = 0.863


In [11]:
accuracy_tfidf_svm, precision_tfidf_svm, recall_tfidf_svm, f1_tfidf_svm = fit_and_metrics(svm, train_tfidf, test_tfidf)

accuracy_k_tfidf_svm, precision_k_tfidf_svm, recall_k_tfidf_svm, f1_k_tfidf_svm = fit_and_metrics(svm,train_topK_tfidf,
                                                                                                  test_topK_tfidf)
print("TF-IDF Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf_svm, 
                                                                                     precision_tfidf_svm, 
                                                                                     recall_tfidf_svm, 
                                                                                     f1_tfidf_svm))
print("Top-K TF-IDF: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_tfidf_svm, 
                                                                                  precision_k_tfidf_svm,
                                                                                  recall_k_tfidf_svm, 
                                                                                  f1_k_tfidf_svm))

TF-IDF Model: accuracy = 0.788, precision = 0.789, recall = 0.788, f1 = 0.772
Top-K TF-IDF: accuracy = 0.410, precision = 0.430, recall = 0.410, f1 = 0.414


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [13]:
accuracy_bow_rf, precision_bow_rf, recall_bow_rf, f1_bow_rf = fit_and_metrics(rf, train_bow, test_bow)

accuracy_k_bow_rf, precision_k_bow_rf, recall_k_bow_rf, f1_k_bow_rf = fit_and_metrics(rf, train_topK_bow,
                                                                                      test_topK_bow)

print("BoW Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_bow_rf, 
                                                                                  precision_bow_rf, 
                                                                                  recall_bow_rf, 
                                                                                  f1_bow_rf))

print("Top-K BoW: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_bow_rf, 
                                                                                  precision_k_bow_rf,
                                                                                  recall_k_bow_rf, 
                                                                                  f1_k_bow_rf))

BoW Model: accuracy = 0.802, precision = 0.811, recall = 0.802, f1 = 0.802
Top-K BoW: accuracy = 0.808, precision = 0.806, recall = 0.808, f1 = 0.804


In [14]:
accuracy_tfidf_rf, precision_tfidf_rf, recall_tfidf_rf, f1_tfidf_rf = fit_and_metrics(rf, train_tfidf, test_tfidf)

accuracy_k_tfidf_rf, precision_k_tfidf_rf, recall_k_tfidf_rf, f1_k_tfidf_rf = fit_and_metrics(rf, train_topK_tfidf,
                                                                                      test_topK_tfidf)

print("TF-IDF Model: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf_rf, 
                                                                                     precision_tfidf_rf, 
                                                                                     recall_tfidf_rf, 
                                                                                     f1_tfidf_rf))

print("Top-K TF-IDF: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_k_tfidf_rf, 
                                                                                  precision_k_tfidf_rf,
                                                                                  recall_k_tfidf_rf, 
                                                                                  f1_k_tfidf_rf))

TF-IDF Model: accuracy = 0.803, precision = 0.808, recall = 0.803, f1 = 0.802
Top-K TF-IDF: accuracy = 0.243, precision = 0.412, recall = 0.243, f1 = 0.177


# Stemmed Tweets

In [15]:
Classifiers = [
    LogisticRegression(multi_class='multinomial', max_iter=500),
    SGDClassifier(),
    RandomForestClassifier()]

In [16]:
stem_tweets = load_tweets(path = "data/stemmed_text.pkl")
stem_data =  process_tweets(stem_tweets, labels)
stem_train, stem_test = stem_data['bow'][0], stem_data['bow'][1]
y_train, y_test = stem_data['labels']

In [17]:
for clf in Classifiers:
    fit = clf.fit(stem_train, y_train)
    pred = fit.predict(stem_test) 
    acc = accuracy_score(y_test, pred)
    print("Accuracy of " + clf.__class__.__name__ + " is " + str(acc))

Accuracy of LogisticRegression is 0.8346184831396017
Accuracy of SGDClassifier is 0.8377258602831166
Accuracy of RandomForestClassifier is 0.8030843595350443


# Lemmatized Tweets

In [18]:
lem_tweets = load_tweets(path = "data/lemmatized_text.pkl")
lem_data =  process_tweets(lem_tweets, labels)
lem_train, lem_test = lem_data['bow'][0], lem_data['bow'][1]
y_train, y_test = lem_data['labels']

In [19]:
for clf in Classifiers:
    fit = clf.fit(lem_train, y_train)
    pred = fit.predict(lem_test) 
    acc = accuracy_score(y_test, pred)
    print("Accuracy of " + clf.__class__.__name__ + " is " + str(acc))

Accuracy of LogisticRegression is 0.8394521809184026
Accuracy of SGDClassifier is 0.8440557026124985
Accuracy of RandomForestClassifier is 0.8083784094832547
