In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import nltk, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import xgboost as xgb


  from pandas import MultiIndex, Int64Index


# Preprocess Data

In [2]:
df = pd.read_csv('labelled_tweets.csv')

df = df[df['Score'] != 2].reset_index(drop=True)
print(df['Score'].value_counts())


1    686
0    373
Name: Score, dtype: int64


In [3]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
ps = PorterStemmer()

# very basic text preprocessing
def preprocess_tweets(raw_tweet):
    # lowercase
    raw_tweet = raw_tweet.lower()

    # remove punctuation
    raw_tweet = ''.join([c for c in raw_tweet if c not in string.punctuation])

    # remove stopwords and apply stemming
    raw_tweet = ' '.join([ps.stem(w) for w in raw_tweet.split() if w not in stop_words])

    return raw_tweet


df["preprocessed_tweet"] = df["Cleaned Tweet"].apply(preprocess_tweets)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#VADER (pretrained sentiment analyzer)
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()

# Iterate through the headlines and get the polarity scores using vader
scores = df['preprocessed_tweet'].apply(vader.polarity_scores).tolist()

# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)

# Join the DataFrames of the news and the list of dicts
df_vader = df.join(scores_df, rsuffix='_right')
df_vader.head()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Score,Cleaned Tweet,preprocessed_tweet,neg,neu,pos,compound
0,1,"Most people say when DXY peaks, the bottom wil...",peopl say dxi peak bottom well octob 2000 dxi ...,0.046,0.825,0.129,0.3612
1,1,I stopped trading stocks Im now day/swing trad...,stop trade stock im daysw trade nasdaq amp swi...,0.053,0.788,0.16,0.6705
2,1,Sri Lanka's government officials from the fina...,sri lanka govern offici financ ministri centra...,0.0,1.0,0.0,0.0
3,1,And the Dow Jones goes up about 900 points as ...,dow jone goe 900 point result mayb sport gambl...,0.0,0.791,0.209,0.4404
4,0,Highest Outflow - 10/12/22 $SKLZ - 98% BEARISH...,highest outflow 101222 sklz 98 bearish ko 90 b...,0.0,1.0,0.0,0.0


# Train-test split

In [5]:
X, y = df.drop(columns=['Score']), df['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1
)

print(len(X_train), len(X_test))


847 212


In [6]:
# vectorize the preprocessed strings
cv = CountVectorizer(max_features=500)
cv.fit(X_train["preprocessed_tweet"].to_numpy())

X_train = cv.transform(X_train["preprocessed_tweet"].to_numpy()).toarray()
y_train = y_train.to_numpy()
X_test = cv.transform(X_test["preprocessed_tweet"].to_numpy()).toarray()
y_test = y_test.to_numpy()


In [7]:
# another vectorizer
tfidf_vect = TfidfVectorizer()


# Baseline Models  

In [15]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.75
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.65      0.74      0.69        80
           1       0.83      0.76      0.79       132

    accuracy                           0.75       212
   macro avg       0.74      0.75      0.74       212
weighted avg       0.76      0.75      0.75       212

Accuracy of LinearSVC:	0.6981132075471698
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.60      0.62      0.61        80
           1       0.77      0.74      0.75       132

    accuracy                           0.70       212
   macro avg       0.68      0.68      0.68       212
weighted avg       0.70      0.70      0.70       212

Accuracy of MultinomialNB:	0.7028301886792453
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.58      0.74    

# Fine-tune models

### XGB

In [14]:
model_xgb = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
model_xgb_name = model_xgb.__class__.__name__

boosters = ['gbtree', 'dart']
learning_rates = [1, 0.5, 0.2, 0.1, 0.05, 0.01, 0, None]
max_depths = [5, 20, 50, None]
min_child_weights = [0, 5, 10, 50, None]
max_delta_steps = list(range(10))
subsamples = [0.5, 1]
lambdas = [0, 5, 10, None]
grid = {
        'booster': boosters,
        'eta': learning_rates,
        'max_depth': max_depths,
        'min_child_weight': min_child_weights,
        'max_delta_step': max_delta_steps,
        'subsample': subsamples,
        'lambda': lambdas
}

classifier = RandomizedSearchCV(model_xgb, grid, scoring='accuracy', error_score=0, n_jobs=-1)
grid_search = classifier.fit(X_train, y_train)
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

print(f"Classification Report of {model_xgb_name}:\n{classification_report(y_test,y_test_pred)}")
print(f"Best Train Accuracy: {grid_search.best_score_*100:.2f}% using {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%")
print(f"Accuracy of {model_xgb_name}: {accuracy_score(y_test, y_test_pred):.4f}")


Classification Report of XGBClassifier:
              precision    recall  f1-score   support

           0       0.70      0.64      0.67        80
           1       0.79      0.83      0.81       132

    accuracy                           0.76       212
   macro avg       0.74      0.74      0.74       212
weighted avg       0.76      0.76      0.76       212

Best Train Accuracy: 74.37% using {'subsample': 1, 'min_child_weight': 0, 'max_depth': 5, 'max_delta_step': 1, 'lambda': None, 'eta': 0.5, 'booster': 'dart'}
Test Accuracy: 75.94%
Accuracy of XGBClassifier: 0.7594
