In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import joblib

import nltk, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn import neural_network
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE
from sentence_transformers import SentenceTransformer

import xgboost as xgb


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Preprocess Data

In [3]:
df = pd.read_csv('labelled_tweets.csv')

df = df[df['Score'] != 2].reset_index(drop=True)
print(df['Score'].value_counts())


1    686
0    373
Name: Score, dtype: int64


In [4]:
df_all = pd.read_csv('tweets_14031_20221017_232441.csv')
df_labelled = pd.read_csv('labelled_tweets.csv')

df_rem = pd.merge(
    left=df_all, 
    right=df_labelled['Cleaned Tweet'],
    how='left',
    on=['Cleaned Tweet'],
    indicator=True
)

df_rem = df_rem[df_rem['_merge'] == 'left_only'].reset_index(drop=True).drop(columns=['_merge'])


In [5]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = stopwords.words('english')
ps = PorterStemmer()
wnl = WordNetLemmatizer()

# very basic text preprocessing
def preprocess_tweets(raw_tweet):
    # lowercase
    raw_tweet = raw_tweet.lower()

    # remove punctuation
    raw_tweet = ''.join([c for c in raw_tweet if c not in string.punctuation])

    # remove stopwords and apply stemming
    # raw_tweet = ' '.join([ps.stem(w) for w in raw_tweet.split() if w not in stop_words])
    raw_tweet = ' '.join([wnl.lemmatize(w) for w in raw_tweet.split() if w not in stop_words])

    return raw_tweet


df["preprocessed_tweet"] = df["Cleaned Tweet"].apply(preprocess_tweets)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [28]:
#VADER (pretrained sentiment analyzer)
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()

# Iterate through the headlines and get the polarity scores using vader
scores = df['preprocessed_tweet'].apply(vader.polarity_scores).tolist()

# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)

# Join the DataFrames of the news and the list of dicts
df_vader = df.join(scores_df, rsuffix='_right')

# Anything more than > or equal to 0 is positive
df_vader['comp_score'] = df_vader['compound'].apply(lambda c: 1 if c >= 0 else 0)

df_vader.head()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chen-\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Score,Cleaned Tweet,preprocessed_tweet,neg,neu,pos,compound,comp_score
0,1,"Most people say when DXY peaks, the bottom wil...",people say dxy peak bottom well october 2000 d...,0.046,0.825,0.129,0.3612,1
1,1,I stopped trading stocks Im now day/swing trad...,stopped trading stock im dayswing trading nasd...,0.047,0.848,0.105,0.5106,1
2,1,Sri Lanka's government officials from the fina...,sri lankas government official finance ministr...,0.0,1.0,0.0,0.0,1
3,1,And the Dow Jones goes up about 900 points as ...,dow jones go 900 point result maybe sport gamb...,0.0,0.775,0.225,0.4927,1
4,0,Highest Outflow - 10/12/22 $SKLZ - 98% BEARISH...,highest outflow 101222 sklz 98 bearish ko 90 b...,0.0,1.0,0.0,0.0,1


In [33]:
print(f"Vader accuracy = {(df_vader['Score'] == df_vader['comp_score']).mean()*100:.4f}%")


Vader accuracy = 70.4438%


# Train-test split

In [7]:
X, y = df['preprocessed_tweet'].tolist(), df['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1
)

print(len(X_train), len(X_test))


847 212


# Text vectorizers 

In [8]:
# vectorize the preprocessed strings
cv = CountVectorizer(max_features=500)

X_CV = CountVectorizer().fit_transform(X)
X_train_1 = cv.fit_transform(X_train)
X_test_1 = cv.fit_transform(X_test)

# Oversample count vectorizer
X_train_1_O, y_train_1_O = SMOTE(random_state=1).fit_resample(X_train_1, y_train)


In [9]:
# TFIDF
X_train_tfidf = TfidfTransformer().fit_transform(X_train_1)
X_test_tfidf = TfidfTransformer().fit_transform(X_test_1)

# Oversample TFIDF
X_train_tfidf_O = TfidfTransformer().fit_transform(X_train_1_O)
y_train_tfidf_O = y_train_1_O


In [10]:
#BERT
model = SentenceTransformer('bert-base-nli-mean-tokens')
# model = SentenceTransformer('all-mpnet-base-v2')
X_train_BERT = model.encode(X_train)
X_test_BERT = model.encode(X_test)

X_train_BERT = MinMaxScaler().fit_transform(X_train_BERT)
X_test_BERT = MinMaxScaler().fit_transform(X_test_BERT)

#Oversample BERT
X_train_BERT_O, y_train_BERT_O = SMOTE(random_state=1).fit_resample(X_train_BERT, y_train)


# Baseline Models  

#### CountVectorizer

In [11]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_1, y_train)
    pred = model.predict(X_test_1)
    accuracies = cross_val_score(model, X_train_1, y_train, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.5
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.39      0.56      0.46        80
           1       0.64      0.46      0.54       132

    accuracy                           0.50       212
   macro avg       0.51      0.51      0.50       212
weighted avg       0.54      0.50      0.51       212

Accuracy of LinearSVC:	0.6037735849056604
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.47      0.46      0.47        80
           1       0.68      0.69      0.68       132

    accuracy                           0.60       212
   macro avg       0.58      0.58      0.58       212
weighted avg       0.60      0.60      0.60       212

Accuracy of MultinomialNB:	0.5801886792452831
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.44      0.40     

CountVectorizer with oversampling

In [12]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_1_O, y_train_1_O)
    pred = model.predict(X_test_1)
    accuracies = cross_val_score(model, X_train_1_O, y_train_1_O, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.5707547169811321
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.43      0.42      0.43        80
           1       0.65      0.66      0.66       132

    accuracy                           0.57       212
   macro avg       0.54      0.54      0.54       212
weighted avg       0.57      0.57      0.57       212

Accuracy of LinearSVC:	0.5754716981132075
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.45      0.62      0.53        80
           1       0.71      0.55      0.62       132

    accuracy                           0.58       212
   macro avg       0.58      0.59      0.57       212
weighted avg       0.61      0.58      0.58       212

Accuracy of MultinomialNB:	0.5754716981132075
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.44

#### TFIDF

In [13]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_tfidf, y_train)
    pred = model.predict(X_test_tfidf)
    accuracies = cross_val_score(model, X_train_tfidf, y_train, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.5660377358490566
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.42      0.42      0.42        80
           1       0.65      0.65      0.65       132

    accuracy                           0.57       212
   macro avg       0.54      0.54      0.54       212
weighted avg       0.57      0.57      0.57       212

Accuracy of LinearSVC:	0.6273584905660378
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.51      0.35      0.41        80
           1       0.67      0.80      0.73       132

    accuracy                           0.63       212
   macro avg       0.59      0.57      0.57       212
weighted avg       0.61      0.63      0.61       212

Accuracy of MultinomialNB:	0.6509433962264151
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.58

TFIDF with oversampling

In [14]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_tfidf_O, y_train_tfidf_O)
    pred = model.predict(X_test_tfidf)
    accuracies = cross_val_score(model, X_train_tfidf_O, y_train_tfidf_O, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.5801886792452831
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.46      0.64      0.53        80
           1       0.71      0.55      0.62       132

    accuracy                           0.58       212
   macro avg       0.59      0.59      0.58       212
weighted avg       0.62      0.58      0.59       212

Accuracy of LinearSVC:	0.6132075471698113
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.49      0.54      0.51        80
           1       0.70      0.66      0.68       132

    accuracy                           0.61       212
   macro avg       0.60      0.60      0.60       212
weighted avg       0.62      0.61      0.62       212

Accuracy of MultinomialNB:	0.5518867924528302
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.42

#### BERT

In [15]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_BERT, y_train)
    pred = model.predict(X_test_BERT)
    accuracies = cross_val_score(model, X_train_BERT, y_train, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.7735849056603774
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.76      0.59      0.66        80
           1       0.78      0.89      0.83       132

    accuracy                           0.77       212
   macro avg       0.77      0.74      0.75       212
weighted avg       0.77      0.77      0.77       212

Accuracy of LinearSVC:	0.6792452830188679
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.58      0.54      0.56        80
           1       0.73      0.77      0.75       132

    accuracy                           0.68       212
   macro avg       0.66      0.65      0.65       212
weighted avg       0.67      0.68      0.68       212

Accuracy of MultinomialNB:	0.7452830188679245
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.72

BERT with oversampling

In [16]:
models = [
    RandomForestClassifier(random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    neural_network.MLPClassifier(random_state=0),
    KNeighborsClassifier()
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_train_BERT_O, y_train_BERT_O)
    pred = model.predict(X_test_BERT)
    accuracies = cross_val_score(model, X_train_BERT_O, y_train_BERT_O, scoring='accuracy', cv=CV)

    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

    print(f"Accuracy of {model_name}:\t{accuracy_score(y_test, pred)}")
    print(f"Classification Report of {model_name}:\n{classification_report(y_test, pred)}")
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
print(f"Mean accuracies:\n{cv_df.groupby('model_name').accuracy.mean()}\n")
print(f"Full training table:\n{cv_df}")


Accuracy of RandomForestClassifier:	0.7877358490566038
Classification Report of RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.77      0.62      0.69        80
           1       0.80      0.89      0.84       132

    accuracy                           0.79       212
   macro avg       0.78      0.76      0.76       212
weighted avg       0.79      0.79      0.78       212

Accuracy of LinearSVC:	0.7122641509433962
Classification Report of LinearSVC:
              precision    recall  f1-score   support

           0       0.63      0.56      0.60        80
           1       0.75      0.80      0.78       132

    accuracy                           0.71       212
   macro avg       0.69      0.68      0.69       212
weighted avg       0.71      0.71      0.71       212

Accuracy of MultinomialNB:	0.7358490566037735
Classification Report of MultinomialNB:
              precision    recall  f1-score   support

           0       0.67

# Ensemble models

In [35]:
# exclude KNN, DT, MNB
ensemble = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(random_state=0)),
        ('lsvc', LinearSVC(random_state=0)),
        ('log', LogisticRegression(random_state=0)),
        ('gb', GradientBoostingClassifier(random_state=0)),
        # ('xgb', xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
        ('mlp', neural_network.MLPClassifier(random_state=0))
    ],
    final_estimator=None,
    cv=5,
    n_jobs=-1,
)

ensemble = ensemble.fit(X_train_BERT_O, y_train_BERT_O)
y_test_pred = ensemble.predict(X_test_BERT)

print(f"Classification Report of ensemble model:\n{classification_report(y_test, y_test_pred)}")
print(f"Accuracy of ensemble model: {accuracy_score(y_test, y_test_pred):.4f}")


Classification Report of ensemble model:
              precision    recall  f1-score   support

           0       0.78      0.56      0.65        80
           1       0.77      0.90      0.83       132

    accuracy                           0.77       212
   macro avg       0.77      0.73      0.74       212
weighted avg       0.77      0.77      0.76       212

Accuracy of ensemble model: 0.7736


In [36]:
joblib.dump(ensemble, 'model.pkl')


['model.pkl']

# Fine-tune models

In [None]:
model_gb = GradientBoostingClassifier(random_state=0)
model_gb_name = model_gb.__class__.__name__
n_estimators = [int(x) for x in np.linspace(start=50, stop=150, num=3)]
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
max_depth = [5, 20, 50, None]
min_samples_split = list(range(1, 16))
min_samples_leaf = list(range(1, 16))
max_features = ['auto', 'sqrt']
grid = {'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
}

classifier = RandomizedSearchCV(model_gb, grid, n_iter=50, scoring='accuracy', error_score=0, n_jobs=-1)
grid_search = classifier.fit(X_train_BERT, y_train)
y_train_pred = grid_search.predict(X_train_BERT)
y_test_pred = grid_search.predict(X_test_BERT)

print(f"Classification Report of {model_gb_name}:\n{classification_report(y_test, y_test_pred)}")
print(f"Best Train Accuracy: {grid_search.best_score_*100:.2f}% using {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%")
print(f"Accuracy of {model_gb_name}: {accuracy_score(y_test, y_test_pred):.4f}")


#### XGB

In [None]:
model_xgb = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
model_xgb_name = model_xgb.__class__.__name__

boosters = ['gbtree', 'dart']
learning_rates = [1, 0.5, 0.2, 0.1, 0.05, 0.01, 0, None]
max_depths = [5, 20, 50, None]
min_child_weights = [0, 5, 10, 50, None]
max_delta_steps = list(range(10))
subsamples = [0.5, 1]
lambdas = [0, 5, 10, None]
grid = {
        'booster': boosters,
        'eta': learning_rates,
        'max_depth': max_depths,
        'min_child_weight': min_child_weights,
        'max_delta_step': max_delta_steps,
        'subsample': subsamples,
        'lambda': lambdas
}

classifier = RandomizedSearchCV(model_xgb, grid, n_iter=50, scoring='accuracy', error_score=0, n_jobs=-1)
grid_search = classifier.fit(X_train_BERT, y_train)
y_train_pred = grid_search.predict(X_train_BERT)
y_test_pred = grid_search.predict(X_test_BERT)

print(f"Classification Report of {model_xgb_name}:\n{classification_report(y_test, y_test_pred)}")
print(f"Best Train Accuracy: {grid_search.best_score_*100:.2f}% using {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%")
print(f"Accuracy of {model_xgb_name}: {accuracy_score(y_test, y_test_pred):.4f}")
