In [None]:
!pip install fsspec
!pip install gcsfs
!pip install gcloud
!pip install --upgrade scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fsspec
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 5.2 MB/s 
[?25hInstalling collected packages: fsspec
Successfully installed fsspec-2022.5.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gcsfs
  Downloading gcsfs-2022.5.0-py2.py3-none-any.whl (25 kB)
Collecting aiohttp<4
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.4 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 14

In [None]:
import pandas as pd
import fsspec
import gcsfs
import nltk
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, f1_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, HalvingGridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem.porter import *
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Paths to CSV files
path_to_gcs_1 = 'https://storage.googleapis.com/csv-etl-fyp/model/training/cnbc_headlines.csv'
path_to_gcs_2 = 'https://storage.googleapis.com/csv-etl-fyp/model/training/guardian_headlines.csv'
path_to_gcs_3 = 'https://storage.googleapis.com/csv-etl-fyp/model/training/reuters_headlines.csv'

In [None]:
df_1 = pd.read_csv(path_to_gcs_1)
df_2 = pd.read_csv(path_to_gcs_2)
df_3 = pd.read_csv(path_to_gcs_3)

In [None]:
# Concat 3 DataFrames to One
dataframes = [df_1, df_2, df_3]
df = pd.concat(dataframes)
len(df)

53650

In [None]:
# Preprocessing 1 - Drop NA/Null Values

df = df.dropna()
df.head()

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."
5,Wall Street delivered the 'kind of pullback I'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies..."


In [None]:
# Preprocessing 2 - Lowercase

df['Headlines'] = [x.lower() for x in df['Headlines']]

In [None]:
# Preprocessing 3 - Stemming (PorterStemmer)

st = PorterStemmer()
df['Headlines'] = [st.stem(x) for x in df['Headlines']]
df

Unnamed: 0,Headlines,Time,Description
0,jim cramer: a better way to invest in the covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,cramer's lightning round: i would own teradyn,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
3,"cramer's week ahead: big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,iq capital ceo keith bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."
5,wall street delivered the 'kind of pullback i'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies..."
...,...,...,...
32765,malaysia says never hired british data firm at...,Mar 20 2018,The Malaysian government and the ruling party ...
32766,prosecutors search volkswagen headquarters in ...,Mar 20 2018,German prosecutors said on Tuesday they had se...
32767,mcdonald's sets greenhouse gas reduction target,Mar 20 2018,McDonald's Corp on Tuesday announced an approv...
32768,pratt & whitney to deliver spare a320neo engin...,Mar 20 2018,Pratt & Whitney will soon begin deliveries of ...


In [None]:
# Preprocessing 4 - Stopwords & Punctuation removal

from nltk.corpus import stopwords
nltk.download('stopwords')
import string
stop = set(stopwords.words('english'))
df['Headlines'] = [t for t in df['Headlines'] if t not in stop and t not in string.punctuation]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# NLTK Sentiment Analyzer for Training data labels
sia = SentimentIntensityAnalyzer()

df['compound'] = [sia.polarity_scores(x)['compound'] for x in df['Headlines']]


In [None]:
# Drop unused columns
df = df.drop(['Time', 'Description'], axis=1)

In [None]:
# Sentiment Threshold (>0.05 = Positive, >-0.05 and <0.05 = Neutral, <0.05 = Negative)
df['sentiment'] = ['pos' if x >= 0.05 else 'neu' if x >-0.05 and x <0.05 else 'neg' for x in df['compound']]

In [None]:
df.head()

Unnamed: 0,Headlines,compound,sentiment
0,jim cramer: a better way to invest in the covi...,0.4404,pos
1,cramer's lightning round: i would own teradyn,0.0,neu
3,"cramer's week ahead: big week for earnings, ev...",0.0,neu
4,iq capital ceo keith bliss says tech and healt...,0.5719,pos
5,wall street delivered the 'kind of pullback i'...,0.0,neu


In [None]:

# Fit CountVectorizer for XGBoost Input
cv = CountVectorizer(binary = True)
train = cv.fit_transform(df['Headlines'])

In [None]:
# Train/Test Split
xsplit, xtest, ysplit, ytest = train_test_split(train, df['sentiment'], test_size=0.20, random_state=42)

In [None]:
# Train/Validation Split
xtrain, xval, ytrain, yval = train_test_split(xsplit, ysplit, test_size=0.20, random_state=33)

In [None]:
# Base XGB model
xgb_model = xgb.XGBClassifier()
xgb_model.fit(xtrain, ytrain)

XGBClassifier(objective='multi:softprob')

In [None]:
pred = xgb_model.predict(xval)

In [None]:
pred_probability = xgb_model.predict_proba(xval)
pred = xgb_model.predict(xval)


In [None]:
roc_auc_score(yval, pred_probability, multi_class="ovr")

0.8160843497886366

In [None]:
# Function for model evaluation

def model_eval_wrapper(y_test, y_pred_test, xgb_clf, clf=False):\

    print("model Prediction : \n %s" %(y_pred_test[:5]))
    print("Expected Prediction: \n %s" %(y_test.head().values))
   
    if clf:
        accuracy = accuracy_score(y_test, y_pred_test)
        print("Accuracy : %.2f%%" % (accuracy * 100.0))
        F1 = f1_score(y_test,y_pred_test, average='macro')
        print("F1_score : %.2f%%" % (F1 * 100.0))
        precision = precision_score(y_test, y_pred_test, average='macro')
        print("Precision : %.2f%%" % (precision * 100.0))
        recall = recall_score(y_test, y_pred_test, average='macro')
        print("Recall : %.2f%%" % (recall * 100.0))
        print("Confusion Matrix : \n %s" %(confusion_matrix(y_test, y_pred_test)))
        
        return pd.DataFrame(classification_report(y_test,y_pred_test, output_dict=True))

In [None]:
model_eval_wrapper(yval, pred, xgb_model, clf=True)

model Prediction : 
 ['neu' 'neu' 'neu' 'neg' 'pos']
Expected Prediction: 
 ['pos' 'neu' 'neg' 'pos' 'pos']
Accuracy : 66.29%
F1_score : 63.92%
Precision : 75.19%
Recall : 63.10%
Confusion Matrix : 
 [[ 765  828  121]
 [  39 2174   52]
 [  92  787  834]]


Unnamed: 0,neg,neu,pos,accuracy,macro avg,weighted avg
precision,0.853795,0.573766,0.828203,0.66286,0.751921,0.734662
recall,0.446324,0.959823,0.486865,0.66286,0.631004,0.66286
f1-score,0.586207,0.718203,0.613235,0.66286,0.639215,0.646866
support,1714.0,2265.0,1713.0,0.66286,5692.0,5692.0


N_estimator Hyperparameter Tuning

In [None]:
param_grid_nestimator = {
    'n_estimators': [400, 800, 1200, 2000]
}

In [None]:
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_nestimator,
    scoring = 'f1_macro',
    n_jobs = -1,
    cv = 3,
    verbose=True
)
grid_search.fit(xsplit, ysplit)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


KeyboardInterrupt: ignored

In [None]:
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']

In [None]:
plt.errorbar(param_grid_nestimator['n_estimators'], means, yerr=stds)
plt.title("XGBoost n_estimators / F1 score")
plt.xlabel('n_estimators')
plt.ylabel('F1 Score')

Max_Depth Hyperparameter Tuning

In [None]:
param_grid_maxdepth = {
    'max_depth': [5, 10, 25, 35]
}

In [None]:
grid_search_md = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_maxdepth,
    scoring = 'f1_macro',
    n_jobs = -1,
    cv = 3,
    verbose=True
)
grid_search_md.fit(xsplit, ysplit)

In [None]:
means = grid_search_md.cv_results_['mean_test_score']
stds = grid_search_md.cv_results_['std_test_score']
params = grid_search_md.cv_results_['params']

In [None]:
plt.errorbar(param_grid_maxdepth['max_depth'], means, yerr=stds)
plt.title("XGBoost max_depth / F1 score")
plt.xlabel('max_depth')
plt.ylabel('F1 Score')

Learning Rate

In [None]:
param_grid_lr = {
  'learning_rate' : [0.05, 0.10, 0.20]
}

In [None]:
grid_search_md = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_lr,
    scoring = 'f1_macro',
    n_jobs = -1,
    cv = 3,
    verbose=True
)
grid_search_md.fit(xsplit, ysplit)

In [None]:
grid_search_md.best_params_

Final Model

In [None]:
xgb_model = xgb.XGBClassifier(learning_rate=0.2, n_estimator=2000, max_depth=35)
xgb_model.fit(xtrain, ytrain)

XGBClassifier(learning_rate=0.2, max_depth=35, n_estimator=2000,
              objective='multi:softprob')

In [None]:
pred = xgb_model.predict(xtest)
model_eval_wrapper(ytest, pred, xgb_model, clf=True)

model Prediction : 
 ['pos' 'neg' 'pos' 'neu' 'neu']
Expected Prediction: 
 ['pos' 'neg' 'pos' 'neu' 'neu']
Accuracy : 86.44%
F1_score : 85.98%
Precision : 87.26%
Recall : 85.34%
Confusion Matrix : 
 [[1647  315  148]
 [  61 2752   70]
 [ 128  243 1750]]


Unnamed: 0,neg,neu,pos,accuracy,macro avg,weighted avg
precision,0.897059,0.83142,0.889228,0.864352,0.872569,0.868123
recall,0.780569,0.954561,0.825083,0.864352,0.853404,0.864352
f1-score,0.834769,0.888745,0.855955,0.864352,0.859823,0.86296
support,2110.0,2883.0,2121.0,0.864352,7114.0,7114.0


In [None]:
# ROC
pred_probability = xgb_model.predict_proba(xtest)
roc_auc_score(ytest, pred_probability, multi_class="ovr")

0.9454165908894089

Pickling to GCS

In [None]:
import pickle
model_file = open('xgb-base.pkl', 'wb')
pickle.dump(xgb_model, model_file)
model_file.close()

In [None]:
from google.cloud import storage
from google.oauth2 import service_account
import os

# Load XGB Model to GCS

path = '/content/final-347314-04047fbe1c8e.json'
credentials = service_account.Credentials.from_service_account_file(path)
client = storage.Client(credentials=credentials, project='final-347314')
bucket = client.get_bucket('csv-etl-fyp')
blob = bucket.blob('model/ml/xgb-base.pkl')
blob.upload_from_filename('/content/xgb-base.pkl')

In [None]:
blob.public_url

'https://storage.googleapis.com/csv-etl-fyp/model/ml/xgb-base.pkl'

In [None]:
# Load CountVectorizer to GCS

cv_file = open('cv.pkl', 'wb')
pickle.dump(cv, cv_file)
cv_file.close()

In [None]:

bucket = client.get_bucket('csv-etl-fyp')
blob = bucket.blob('model/cv/cv.pkl')
blob.upload_from_filename('/content/cv.pkl')

In [None]:
# Retrieval Test

blob.download_to_filename('/content/loaded-xgb.pkl')
loaded_file = open('/content/loaded-xgb.pkl', 'rb')
mdl_ = pickle.load(loaded_file)
pred2 = mdl_.predict(xtest)
accuracy_score(ytest, pred2)

0.8643519820073096