In [320]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
#from pandas import read_csv, set_option
from pandas import Series, datetime
from pandas.tools.plotting import scatter_matrix, autocorrelation_plot

# Machine Learning libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc, recall_score, precision_score,log_loss


import matplotlib.pyplot as plt
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier

# Graphs
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py

# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [261]:
# Load the aggregated dataset. The preview below shows one row per hourly
# timestamp with the columns close, compound, like, replies, retweets and vol.
aggregated = pd.read_json('aggregated_sentiment.json')

aggregated.head()

Unnamed: 0,close,compound,like,replies,retweets,vol
2019-05-01 00:00:00,5335.62,0.266588,3955,493,743,1362.165784
2019-05-01 01:00:00,5338.85,0.225017,1764,380,345,631.481774
2019-05-01 02:00:00,5355.07,0.264474,1395,312,216,1012.867995
2019-05-01 03:00:00,5348.95,0.274383,1387,223,199,1427.585822
2019-05-01 04:00:00,5341.13,0.305488,840,120,155,512.101053


# Add new features

In [262]:
def hourly_return(adj_close):
    """Return the percentage change between each pair of consecutive prices.

    Parameters
    ----------
    adj_close : sequence of numbers (list, numpy array or pandas Series)
        Prices in chronological order.

    Returns
    -------
    list of float
        ``len(adj_close) - 1`` values; element ``k`` is
        ``(adj_close[k+1] - adj_close[k]) / adj_close[k] * 100``.
        An input with fewer than two prices yields an empty list.
    """
    # Work on a plain float array so that element access is strictly positional.
    # Indexing a Series with ``series[i]`` is label-based and breaks (or silently
    # picks the wrong row) whenever the index is not 0..n-1.
    prices = np.asarray(adj_close, dtype=float)
    if prices.size < 2:
        return []
    current, future = prices[:-1], prices[1:]
    return ((future - current) / current * 100).tolist()

returns = hourly_return(aggregated['close'])
# Prepend a zero: the first row has no previous price, so no return can be
# computed for it. This also makes `returns` as long as `aggregated`.
returns.insert(0,0)


#mu = np.mean(returns)*252.           # drift coefficient
#sig = np.std(returns)*np.sqrt(252.)  # diffusion coefficient

def classification(fee, returns):
    """Turn returns into binary labels.

    Each value in `returns` becomes 1 when it is strictly greater than `fee`
    and 0 otherwise. The result is a list with one label per input value.
    """
    return [1 if value > fee else 0 for value in returns]

def add_ma(source, var, list_mean):
    """Return a copy of `source` with rolling-mean columns of `var` appended.

    For every entry ``d`` in `list_mean` a column named ``ma_<var>_<d>d`` is
    added, holding the rolling mean of ``source[var]`` over ``d * 24`` rows.
    `source` itself is not modified.
    """
    result = source.copy()
    for days in list_mean:
        column = 'ma_' + var + '_' + str(days) + 'd'
        result[column] = source[var].rolling(days * 24).mean()
    return result

def add_lag(source, var, lags):
    """Return a copy of `source` with lagged copies of column `var` appended.

    Columns ``lag_<var>_1h`` ... ``lag_<var>_<lags>h`` are added, where the
    k-th one is ``source[var]`` shifted down by k rows. `source` itself is
    not modified.
    """
    result = source.copy()
    for hours in range(1, lags + 1):
        column = 'lag_' + var + '_' + str(hours) + 'h'
        result[column] = source[var].shift(hours)
    return result

# Classification data prep: build the sentiment + financial feature table.

# Label a row 1 when its return exceeds 0.1. hourly_return() multiplies by 100,
# so the threshold is expressed in percent.
outcome = classification(0.1, returns)

aggregated = aggregated.assign(returns = returns, buy_signal = outcome)

model_df = aggregated.drop(['replies','retweets'], axis = 1)

# Add moving averages (add_ma uses windows of 24 rows per listed unit)
list_mean = [3,7,14]
model_df = add_ma(model_df, 'compound', list_mean)
model_df = add_ma(model_df, 'like', list_mean)

# Add lags of the previous three rows
model_df = add_lag(model_df, 'returns', 3)

model_df = add_lag(model_df, 'vol', 3)

# Fill NAs: the leading rows of every rolling/shifted column are NaN and become 0
model_df = model_df.fillna(0)

# Keep a full copy (still containing returns, vol and close) for the
# financial-only dataset built in a later cell.
financial = model_df.copy()

# Drop the raw returns, volume and close price so they are not used as features
model_df = model_df.drop(['returns','vol', 'close'], axis = 1)


# Put the outcome variable last: train_test_split_own() takes the final
# column as the target.
y = model_df['buy_signal']

model_df = model_df.drop('buy_signal', axis =1)
model_df['buy_signal'] = y



# Dataset for classification
print('class data')
print(model_df.head())


class data
                     compound  like  ma_compound_3d  ma_compound_7d  \
2019-05-01 00:00:00  0.266588  3955             0.0             0.0   
2019-05-01 01:00:00  0.225017  1764             0.0             0.0   
2019-05-01 02:00:00  0.264474  1395             0.0             0.0   
2019-05-01 03:00:00  0.274383  1387             0.0             0.0   
2019-05-01 04:00:00  0.305488   840             0.0             0.0   

                     ma_compound_14d  ma_like_3d  ma_like_7d  ma_like_14d  \
2019-05-01 00:00:00              0.0         0.0         0.0          0.0   
2019-05-01 01:00:00              0.0         0.0         0.0          0.0   
2019-05-01 02:00:00              0.0         0.0         0.0          0.0   
2019-05-01 03:00:00              0.0         0.0         0.0          0.0   
2019-05-01 04:00:00              0.0         0.0         0.0          0.0   

                     lag_returns_1h  lag_returns_2h  lag_returns_3h  \
2019-05-01 00:00:00        0

In [263]:
# Sanity check: list the rows whose compound score is exactly 0. NaNs were
# replaced by 0 during data prep, so an empty result means none were missing.
model_df[model_df['compound'] == 0]

Unnamed: 0,compound,like,ma_compound_3d,ma_compound_7d,ma_compound_14d,ma_like_3d,ma_like_7d,ma_like_14d,lag_returns_1h,lag_returns_2h,lag_returns_3h,lag_vol_1h,lag_vol_2h,lag_vol_3h,buy_signal


In [264]:
# Financial data prep: build the baseline table that contains no sentiment features.

# Start from the raw returns and volume only
financial_only = financial.loc[:, ['returns','vol']]

# Add lags of the previous three rows of returns
financial_only = add_lag(financial_only, 'returns', 3)

# Add lags of the previous three rows of volume
financial_only = add_lag(financial_only, 'vol', 3)

# Drop the unlagged columns so only the lag features remain
financial_only = financial_only.drop(['returns','vol'], axis =1)

# Set the outcome variable last (y is the buy_signal column taken from model_df)
financial_only['buy_signal'] =  y

# Fill NAs: the leading rows of each lag column are NaN and become 0
financial_only = financial_only.fillna(0)

# Dataset for classification
print(financial_only.head())

                     lag_returns_1h  lag_returns_2h  lag_returns_3h  \
2019-05-01 00:00:00        0.000000        0.000000        0.000000   
2019-05-01 01:00:00        0.000000        0.000000        0.000000   
2019-05-01 02:00:00        0.060537        0.000000        0.000000   
2019-05-01 03:00:00        0.303811        0.060537        0.000000   
2019-05-01 04:00:00       -0.114284        0.303811        0.060537   

                      lag_vol_1h   lag_vol_2h   lag_vol_3h  buy_signal  
2019-05-01 00:00:00     0.000000     0.000000     0.000000           0  
2019-05-01 01:00:00  1362.165784     0.000000     0.000000           0  
2019-05-01 02:00:00   631.481774  1362.165784     0.000000           1  
2019-05-01 03:00:00  1012.867995   631.481774  1362.165784           0  
2019-05-01 04:00:00  1427.585822  1012.867995   631.481774           0  


In [265]:
# drop return
#senti = model_df.drop('returns', axis=1)
#fin = financial_only.drop('returns', axis=1)

In [266]:
def train_test_split_own(df, days):
    """Split `df` chronologically into train and test arrays.

    The first ``days * 24`` rows form the training set and the remaining rows
    the test set. In both parts the last column is the target and all other
    columns are the inputs.

    Returns
    -------
    train_X, train_y, test_X, test_y : numpy arrays
    """
    data = df.values
    cutoff = days * 24
    head, tail = data[:cutoff, :], data[cutoff:, :]
    return head[:, :-1], head[:, -1], tail[:, :-1], tail[:, -1]

In [335]:
def models(train_X, train_y, test_X, test_y):
    """Fit a fixed set of baseline classifiers and score each on the test split.

    Parameters
    ----------
    train_X, train_y : training inputs and labels.
    test_X, test_y : held-out inputs and labels used for scoring.

    Returns
    -------
    accu_scores : list
        Test accuracy of each model in percent, rounded to two decimals.
    names : list of str
        Short model names, in the same order as the scores.
    recall_scores : list
        Test recall of each model in percent (not rounded).

    The accuracy of every model is also printed as ``"<name>: <accuracy>"``.
    """
    candidates = [
        ('LR', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC(random_state=7)),
        ('RF', RandomForestClassifier(n_estimators=150, random_state=7)),
        ('XGBoost', XGBClassifier(seed=7)),
    ]

    accu_scores = []
    recall_scores = []
    names = []

    for name, clf in candidates:
        clf.fit(train_X, train_y)
        y_pred = clf.predict(test_X)
        # Convert to percent *before* rounding. Rounding first and multiplying
        # afterwards reintroduces float noise such as 52.959999999999994.
        accu_score = round(accuracy_score(test_y, y_pred) * 100, 2)
        accu_scores.append(accu_score)
        names.append(name)
        recall_scores.append(recall_score(test_y, y_pred) * 100)
        print(name + ": " + str(accu_score))

    return accu_scores, names, recall_scores
    
def run_before_opt(df, days):
    """Split `df` at ``days * 24`` rows and evaluate the baseline models on it.

    Returns the accuracy scores, model names and recall scores produced by
    models(), in that order.
    """
    split = train_test_split_own(df, days)
    accu_scores, model_names, recalls = models(*split)
    return accu_scores, model_names, recalls

In [336]:
# Baseline run on the dataset that includes the sentiment features
print('Sentiment data')
results_sent, names, recall_scores_sent = run_before_opt(model_df, 60)

# Baseline run on the financial-only dataset
print('')
print('Financial data')
results_fin, names, recall_scores_fin = run_before_opt(financial_only, 60)

# Per-model difference (sentiment minus financial-only)
delta_accu = [sent - fin for sent, fin in zip(results_sent, results_fin)]
delta_recall = [sent - fin for sent, fin in zip(recall_scores_sent, recall_scores_fin)]

# One row per model, one column per metric
col = ['Accuracy Sentiment', 'Recall Sentiment','Accuracy no Sentiment', 'Recall no Sentiment', 'Delta Accuracy', 'Delta Recall']
metric_rows = [results_sent, recall_scores_sent, results_fin, recall_scores_fin, delta_accu, delta_recall]
comparison = pd.DataFrame(metric_rows, columns = names).transpose()
comparison.columns = col
comparison

Sentiment data
LR: 54.55
KNN: 52.7
CART: 52.959999999999994
NB: 55.60000000000001
SVM: 54.15
RF: 55.06999999999999
XGBoost: 54.02

Financial data
LR: 56.65
KNN: 50.99
CART: 49.41
NB: 54.55
SVM: 54.15
RF: 52.959999999999994
XGBoost: 56.52


Unnamed: 0,Accuracy Sentiment,Recall Sentiment,Accuracy no Sentiment,Recall no Sentiment,Delta Accuracy,Delta Recall
LR,54.55,14.655172,56.65,17.241379,-2.1,-2.586207
KNN,52.7,47.701149,50.99,44.252874,1.71,3.448276
CART,52.96,48.850575,49.41,46.83908,3.55,2.011494
NB,55.6,24.137931,54.55,27.873563,1.05,-3.735632
SVM,54.15,0.0,54.15,0.0,0.0,0.0
RF,55.07,44.827586,52.96,42.528736,2.11,2.298851
XGBoost,54.02,35.91954,56.52,45.977011,-2.5,-10.057471


In [329]:
def feature_importance(df, model, title):
    """Show a horizontal bar chart of `model.feature_importances_`.

    Feature names are the columns of `df` except 'buy_signal'. Bars are
    ordered by ascending importance and the chart is rendered with
    plotly's offline ``iplot``. `title` is appended to the chart title.
    """
    features = df.drop('buy_signal', axis = 1).columns.values

    # Pair each importance with its feature name and sort ascending
    ranked = sorted(zip(model.feature_importances_, features))
    importances, labels = map(list, zip(*ranked))

    caption = 'Feature importances for '+ str(title)
    bar_style = dict(color=importances, colorscale = 'Viridis', reversescale = True)
    bars = go.Bar(
        x=importances,
        y=labels,
        marker=bar_style,
        name=caption,
        orientation='h',
    )

    axis_style = dict(showgrid=False, showline=False, showticklabels=True)
    layout = dict(title=caption, width = 600, height = 600, yaxis=axis_style)

    figure = go.Figure(data=[bars])
    figure['layout'].update(layout)
    py.iplot(figure, filename='plots')


In [330]:
X_train, y_train, X_test, y_test = train_test_split_own(model_df, 60)

# Scale the data. The fitted scaler gets its own name so it cannot be mistaken
# for, or rebound over, the scaler() helper function that run_opt() relies on.
feature_scaler = StandardScaler().fit(X_train)
rescaledX = feature_scaler.transform(X_train)
# The test split is transformed once and reused for both models below.
rescaledValidationX = feature_scaler.transform(X_test)

# Scaled XGB
model_xgb = XGBClassifier()
model_xgb.fit(rescaledX, y_train)
# estimate accuracy on validation dataset
predictions = model_xgb.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
print("recall score:")
print(recall_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))


# Scaled Random Forest
print('RF')
model_rf = RandomForestClassifier(n_estimators=1000)
model_rf.fit(rescaledX, y_train)
# estimate accuracy on validation dataset
predictions = model_rf.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
print("recall:")
print(recall_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))

feature_importance(model_df, model_xgb, 'XGB sentiment')

feature_importance(model_df, model_rf, 'Random Forest Sentiment')

accuracy score:
0.5401844532279315
recall score:
0.35919540229885055
confusion matrix: 
[[285 126]
 [223 125]]
classification report: 
              precision    recall  f1-score   support

         0.0       0.56      0.69      0.62       411
         1.0       0.50      0.36      0.42       348

   micro avg       0.54      0.54      0.54       759
   macro avg       0.53      0.53      0.52       759
weighted avg       0.53      0.54      0.53       759

RF
accuracy score:
0.5573122529644269
recall:
0.4367816091954023
confusion matrix: 
[[271 140]
 [196 152]]
classification report: 
              precision    recall  f1-score   support

         0.0       0.58      0.66      0.62       411
         1.0       0.52      0.44      0.48       348

   micro avg       0.56      0.56      0.56       759
   macro avg       0.55      0.55      0.55       759
weighted avg       0.55      0.56      0.55       759



In [None]:
X_train, y_train, X_test, y_test = train_test_split_own(financial_only, 60)


# Prepare the data. The fitted scaler gets its own name so it cannot be mistaken
# for, or rebound over, the scaler() helper function that run_opt() relies on.
feature_scaler = StandardScaler().fit(X_train)
rescaledX = feature_scaler.transform(X_train)
# The test split is transformed once and reused for both models below.
rescaledValidationX = feature_scaler.transform(X_test)

# Scaled XGB
model_xgb = XGBClassifier()
model_xgb.fit(rescaledX, y_train)
# estimate accuracy on validation dataset
predictions = model_xgb.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
# This value is the recall; it was previously printed under "accuracy score:".
print("recall score:")
print(recall_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))


# Scaled Random Forest
model_rf = RandomForestClassifier(n_estimators=1000)
model_rf.fit(rescaledX, y_train)
# estimate accuracy on validation dataset
predictions = model_rf.predict(rescaledValidationX)
print("accuracy score:")
print(accuracy_score(y_test, predictions))
# This value is the recall; it was previously printed under "accuracy score:".
print("recall score:")
print(recall_score(y_test, predictions))
print("confusion matrix: ")
print(confusion_matrix(y_test, predictions))
print("classification report: ")
print(classification_report(y_test, predictions))

feature_importance(financial_only, model_xgb, 'XGB no sentiment')

feature_importance(financial_only, model_rf, 'Random Forest no Sentiment')

In [338]:
# Ensemble pipeline with finetuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib

def filter_important_feat(df, model, treshhold, days):
    """Keep only the features whose importance exceeds `treshhold` and re-split.

    Parameters
    ----------
    df : DataFrame containing the feature columns and a 'buy_signal' column.
    model : fitted estimator exposing ``feature_importances_``, in the same
        order as the feature columns of `df`.
    treshhold : importance a feature must strictly exceed to be kept.
    days : forwarded to train_test_split_own() (training rows = days * 24).

    Returns
    -------
    X_train, y_train, X_test, y_test, feat_before, feat_after
        The split of the reduced table, plus the feature counts before and
        after filtering.
    """
    features = df.drop('buy_signal', axis = 1).columns.values
    feat_before = len(features)
    print('Before feature removal: ', feat_before)

    importances = pd.Series(model.feature_importances_, index=features)
    importances = importances.sort_values(ascending=False)
    clean_imp = list(importances[importances > treshhold].index)
    feat_after = len(clean_imp)
    print('After feature removal: ', feat_after)

    # Take the label from `df` itself rather than from the global model_df, and
    # copy so that no column is assigned onto a slice of `df`. 'buy_signal'
    # stays last because train_test_split_own() treats the last column as target.
    imp_df = df[clean_imp + ['buy_signal']].copy()

    # Return new train and test sets
    X_train, y_train, X_test, y_test = train_test_split_own(imp_df, days)

    return X_train, y_train, X_test, y_test, feat_before, feat_after

def optimizer(X_train, y_train, X_test, y_test, clf):
    """Grid-search n_estimators and max_depth for `clf`, maximising recall.

    `clf` is an estimator class (not an instance); it is instantiated with
    ``n_estimators``, ``max_depth`` and ``learning_rate=0.15`` for every grid
    point, fitted on the training data and scored on X_test / y_test.
    Every grid point and the final best combination are printed.

    Returns
    -------
    max_score : accuracy of the combination with the highest recall.
    best_depth, best_estimator : the max_depth / n_estimators of that combination.
    model : the fitted model of that combination. If no combination reached a
        recall above 0, the last fitted model is returned instead.

    NOTE(review): the grid is scored on the same X_test / y_test the caller
    passes in, so the reported best recall is selected on that split.
    """
    # NOTE(review): this switches the global matplotlib backend as a side effect;
    # kept as-is because callers may rely on it.
    matplotlib.use('Agg')
    n_estimators = [150, 200, 250, 450, 500, 550, 1000]
    max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    print('')
    best_depth = 0
    best_estimator = 0
    max_score = 0
    max_recall = 0
    best_model = None
    model = None
    for n in n_estimators:
        for md in max_depth:
            model = clf(n_estimators=n, max_depth=md, learning_rate = 0.15)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            if recall > max_recall:
                max_recall = recall
                max_score = score
                best_depth = md
                best_estimator = n
                # Remember the winning model itself; returning the loop variable
                # would hand back whichever model happened to be fitted last.
                best_model = model
            print("Recall: " + str(round(recall,4)) + " Accu: " + str(round(score,2)) + " Depth: " + str(md) + " Estimator: " + str(n))
    print('')
    print("Best Recall is " + str(max_recall) + " Accu: " + str(max_score) +" Depth: " + str(best_depth) + " estimator: " + str(best_estimator))

    if best_model is None:
        best_model = model

    return max_score, best_depth, best_estimator, best_model
  
def opt_model_fit(X_train, y_train, X_test, y_test, best_depth, best_estimator,clf):
    """Fit `clf` with the chosen depth / estimator count and print its test metrics.

    `clf` is an estimator class; it is built with ``learning_rate=0.15``.
    Accuracy, recall, the confusion matrix and the classification report on
    X_test / y_test are printed, and the fitted model is returned.
    """
    fitted = clf(n_estimators=best_estimator, max_depth=best_depth, learning_rate=0.15)
    fitted.fit(X_train, y_train)
    predicted = fitted.predict(X_test)

    print('Accuracy: ', accuracy_score(y_test, predicted))
    print(recall_score(y_test, predicted))
    print("confusion matrix: ")
    print(confusion_matrix(y_test, predicted))
    print("classification report: ")
    print(classification_report(y_test, predicted))

    return fitted
    
def scaler(train_X, test_X):
    """Standardise both splits with a StandardScaler fitted on `train_X` only.

    Returns the transformed training inputs and the transformed test inputs.
    """
    fitted = StandardScaler().fit(train_X)
    return fitted.transform(train_X), fitted.transform(test_X)
    
def run_opt(df, clf, days, threshhold):
    """Run the tuning pipeline: split, scale, grid-search, filter features, re-search.

    Parameters
    ----------
    df : DataFrame with the feature columns and 'buy_signal' as last column.
    clf : estimator class passed on to optimizer().
    days : number of days (24 rows each) used for training in both splits.
    threshhold : minimum feature importance passed to filter_important_feat().

    Progress and results are printed; nothing is returned.
    """
    X_train, y_train, X_test, y_test = train_test_split_own(df, days)

    print('')
    print('scaling...')
    X_train, X_test = scaler(X_train, X_test)

    print('')
    print('Finding best depth and estimator...')
    max_score, best_depth, best_estimator, model = optimizer(X_train, y_train, X_test, y_test,clf)

    print('')
    print('Filter features...')
    # Use the caller's `days` here as well (this was hard-coded to 60), so that
    # both searches run on the same train/test boundary.
    X_train, y_train, X_test, y_test, before, after = filter_important_feat(df, model, threshhold, days)

    X_train, X_test = scaler(X_train, X_test)

    # Only search again when the filter actually removed a feature
    if before != after:
        print('')
        print('Finding best depth and estimator...')
        max_score, best_depth, best_estimator, model = optimizer(X_train, y_train, X_test, y_test,clf)

In [339]:
# Importance threshold handed to run_opt(); features at or below it are dropped
feat_thresh = 0.045

# Tune XGBoost on the dataset that includes sentiment features
print('')
print('XGB Sentiment')
clf = XGBClassifier
run_opt(model_df, clf, 60, feat_thresh)

# Tune XGBoost on the financial-only dataset
print('')
print('XGB No Sentiment')
run_opt(financial_only, clf, 60, feat_thresh)


XGB Sentiment

scaling...

Finding best depth and estimator...

Recall: 0.3103 Accu: 0.57 Depth: 1 Estimator: 150
Recall: 0.3822 Accu: 0.55 Depth: 2 Estimator: 150
Recall: 0.4167 Accu: 0.55 Depth: 3 Estimator: 150
Recall: 0.4511 Accu: 0.53 Depth: 4 Estimator: 150
Recall: 0.4425 Accu: 0.54 Depth: 5 Estimator: 150
Recall: 0.4856 Accu: 0.54 Depth: 6 Estimator: 150
Recall: 0.4195 Accu: 0.52 Depth: 7 Estimator: 150
Recall: 0.4914 Accu: 0.54 Depth: 8 Estimator: 150
Recall: 0.5172 Accu: 0.55 Depth: 9 Estimator: 150
Recall: 0.4713 Accu: 0.53 Depth: 10 Estimator: 150
Recall: 0.523 Accu: 0.55 Depth: 11 Estimator: 150
Recall: 0.454 Accu: 0.51 Depth: 12 Estimator: 150
Recall: 0.3247 Accu: 0.57 Depth: 1 Estimator: 200
Recall: 0.3908 Accu: 0.54 Depth: 2 Estimator: 200
Recall: 0.4425 Accu: 0.57 Depth: 3 Estimator: 200
Recall: 0.4425 Accu: 0.53 Depth: 4 Estimator: 200
Recall: 0.4598 Accu: 0.53 Depth: 5 Estimator: 200
Recall: 0.4799 Accu: 0.54 Depth: 6 Estimator: 200
Recall: 0.4224 Accu: 0.52 Depth: 7

Recall: 0.4971 Accu: 0.54 Depth: 3 Estimator: 1000
Recall: 0.4914 Accu: 0.54 Depth: 4 Estimator: 1000
Recall: 0.477 Accu: 0.54 Depth: 5 Estimator: 1000
Recall: 0.4828 Accu: 0.53 Depth: 6 Estimator: 1000
Recall: 0.4828 Accu: 0.54 Depth: 7 Estimator: 1000
Recall: 0.4799 Accu: 0.54 Depth: 8 Estimator: 1000
Recall: 0.4511 Accu: 0.52 Depth: 9 Estimator: 1000
Recall: 0.4626 Accu: 0.54 Depth: 10 Estimator: 1000
Recall: 0.477 Accu: 0.54 Depth: 11 Estimator: 1000
Recall: 0.4655 Accu: 0.54 Depth: 12 Estimator: 1000

Best Recall is 0.5057471264367817 Accu: 0.549407114624506 Depth: 2 estimator: 1000

Filter features...
Before feature removal:  6
After feature removal:  6


In [327]:
# Ensemble pipeline with finetuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib

def filter_important_feat(df, model, treshhold, days):
    """Keep only the features whose importance exceeds `treshhold` and re-split.

    Parameters
    ----------
    df : DataFrame containing the feature columns and a 'buy_signal' column.
    model : fitted estimator exposing ``feature_importances_``, in the same
        order as the feature columns of `df`.
    treshhold : importance a feature must strictly exceed to be kept.
    days : forwarded to train_test_split_own() (training rows = days * 24).

    Returns
    -------
    X_train, y_train, X_test, y_test, feat_before, feat_after
        The split of the reduced table, plus the feature counts before and
        after filtering.
    """
    features = df.drop('buy_signal', axis = 1).columns.values
    feat_before = len(features)
    print('Before feature removal: ', feat_before)

    importances = pd.Series(model.feature_importances_, index=features)
    importances = importances.sort_values(ascending=False)
    clean_imp = list(importances[importances > treshhold].index)
    feat_after = len(clean_imp)
    print('After feature removal: ', feat_after)

    # Take the label from `df` itself rather than from the global model_df, and
    # copy so that no column is assigned onto a slice of `df`. 'buy_signal'
    # stays last because train_test_split_own() treats the last column as target.
    imp_df = df[clean_imp + ['buy_signal']].copy()

    # Return new train and test sets
    X_train, y_train, X_test, y_test = train_test_split_own(imp_df, days)

    return X_train, y_train, X_test, y_test, feat_before, feat_after

def optimizer(X_train, y_train, X_test, y_test, clf):
    """Grid-search n_estimators and max_depth for `clf`, maximising recall.

    `clf` is an estimator class (not an instance); it is instantiated with
    ``n_estimators`` and ``max_depth`` only for every grid point, fitted on the
    training data and scored on X_test / y_test. Every grid point and the
    final best combination are printed.

    Returns
    -------
    max_score : accuracy of the combination with the highest recall.
    best_depth, best_estimator : the max_depth / n_estimators of that combination.
    model : the fitted model of that combination. If no combination reached a
        recall above 0, the last fitted model is returned instead.

    NOTE(review): the grid is scored on the same X_test / y_test the caller
    passes in, so the reported best recall is selected on that split.
    """
    # NOTE(review): this switches the global matplotlib backend as a side effect;
    # kept as-is because callers may rely on it.
    matplotlib.use('Agg')
    n_estimators = [150, 200, 250, 450, 500, 550, 1000]
    max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    print('')
    best_depth = 0
    best_estimator = 0
    max_score = 0
    max_recall = 0
    best_model = None
    model = None
    for n in n_estimators:
        for md in max_depth:
            model = clf(n_estimators=n, max_depth=md)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            if recall > max_recall:
                max_recall = recall
                max_score = score
                best_depth = md
                best_estimator = n
                # Remember the winning model itself; returning the loop variable
                # would hand back whichever model happened to be fitted last.
                best_model = model
            print("Recall: " + str(round(recall,4)) + " Accu: " + str(round(score,2)) + " Depth: " + str(md) + " Estimator: " + str(n))
    print('')
    print("Best Recall is " + str(max_recall) + " Accu: " + str(max_score) +" Depth: " + str(best_depth) + " estimator: " + str(best_estimator))

    if best_model is None:
        best_model = model

    return max_score, best_depth, best_estimator, best_model
  
def opt_model_fit(X_train, y_train, X_test, y_test, best_depth, best_estimator,clf):
    """Fit `clf` with the chosen depth / estimator count and print its test metrics.

    `clf` is an estimator class built from ``n_estimators`` and ``max_depth``
    only. Accuracy, recall, the confusion matrix and the classification
    report on X_test / y_test are printed, and the fitted model is returned.
    """
    fitted = clf(n_estimators=best_estimator, max_depth=best_depth)
    fitted.fit(X_train, y_train)
    predicted = fitted.predict(X_test)

    print('Accuracy: ', accuracy_score(y_test, predicted))
    print(recall_score(y_test, predicted))
    print("confusion matrix: ")
    print(confusion_matrix(y_test, predicted))
    print("classification report: ")
    print(classification_report(y_test, predicted))

    return fitted
    
def scaler(train_X, test_X):
    """Standardise both splits with a StandardScaler fitted on `train_X` only.

    Returns the transformed training inputs and the transformed test inputs.
    """
    fitted = StandardScaler().fit(train_X)
    return fitted.transform(train_X), fitted.transform(test_X)
    
def run_opt(df, clf, days, threshhold):
    """Run the tuning pipeline: split, scale, grid-search, filter features, re-search.

    Parameters
    ----------
    df : DataFrame with the feature columns and 'buy_signal' as last column.
    clf : estimator class passed on to optimizer().
    days : number of days (24 rows each) used for training in both splits.
    threshhold : minimum feature importance passed to filter_important_feat().

    Progress and results are printed; nothing is returned.
    """
    X_train, y_train, X_test, y_test = train_test_split_own(df, days)

    print('')
    print('scaling...')
    X_train, X_test = scaler(X_train, X_test)

    print('')
    print('Finding best depth and estimator...')
    max_score, best_depth, best_estimator, model = optimizer(X_train, y_train, X_test, y_test,clf)

    print('')
    print('Filter features...')
    # Use the caller's `days` here as well (this was hard-coded to 60), so that
    # both searches run on the same train/test boundary.
    X_train, y_train, X_test, y_test, before, after = filter_important_feat(df, model, threshhold, days)

    X_train, X_test = scaler(X_train, X_test)

    # Only search again when the filter actually removed a feature
    if before != after:
        print('')
        print('Finding best depth and estimator...')
        max_score, best_depth, best_estimator, model = optimizer(X_train, y_train, X_test, y_test,clf)
    

In [328]:
# Tune the random forest on the dataset that includes sentiment features.
# feat_thresh is not defined in this cell; it must already exist in the kernel.
print('')
print('RF Sentiment')
clf = RandomForestClassifier
run_opt(model_df, clf, 60, feat_thresh)

# Tune the random forest on the financial-only dataset
print('')
print('RF No Sentiment')
run_opt(financial_only, clf, 60, feat_thresh)


RF Sentiment

scaling...

Finding best depth and estimator...

Recall: 0.0259 Accu: 0.55 Depth: 1 Estimator: 150
Recall: 0.1121 Accu: 0.56 Depth: 2 Estimator: 150
Recall: 0.1839 Accu: 0.57 Depth: 3 Estimator: 150
Recall: 0.2443 Accu: 0.55 Depth: 4 Estimator: 150
Recall: 0.2471 Accu: 0.55 Depth: 5 Estimator: 150
Recall: 0.3132 Accu: 0.58 Depth: 6 Estimator: 150
Recall: 0.3678 Accu: 0.58 Depth: 7 Estimator: 150
Recall: 0.3764 Accu: 0.57 Depth: 8 Estimator: 150
Recall: 0.3448 Accu: 0.56 Depth: 9 Estimator: 150
Recall: 0.4397 Accu: 0.56 Depth: 10 Estimator: 150
Recall: 0.3908 Accu: 0.55 Depth: 11 Estimator: 150
Recall: 0.4454 Accu: 0.56 Depth: 12 Estimator: 150
Recall: 0.0115 Accu: 0.54 Depth: 1 Estimator: 200
Recall: 0.1092 Accu: 0.56 Depth: 2 Estimator: 200
Recall: 0.1753 Accu: 0.55 Depth: 3 Estimator: 200
Recall: 0.2299 Accu: 0.58 Depth: 4 Estimator: 200
Recall: 0.25 Accu: 0.56 Depth: 5 Estimator: 200
Recall: 0.2443 Accu: 0.56 Depth: 6 Estimator: 200
Recall: 0.3736 Accu: 0.57 Depth: 7 

Recall: 0.3621 Accu: 0.59 Depth: 3 Estimator: 1000
Recall: 0.3908 Accu: 0.58 Depth: 4 Estimator: 1000
Recall: 0.3966 Accu: 0.58 Depth: 5 Estimator: 1000
Recall: 0.4282 Accu: 0.59 Depth: 6 Estimator: 1000
Recall: 0.3937 Accu: 0.56 Depth: 7 Estimator: 1000
Recall: 0.4052 Accu: 0.55 Depth: 8 Estimator: 1000
Recall: 0.4224 Accu: 0.56 Depth: 9 Estimator: 1000
Recall: 0.4167 Accu: 0.55 Depth: 10 Estimator: 1000
Recall: 0.4195 Accu: 0.55 Depth: 11 Estimator: 1000
Recall: 0.4224 Accu: 0.54 Depth: 12 Estimator: 1000

Best Recall is 0.4454022988505747 Accu: 0.5428194993412385 Depth: 11 estimator: 250

Filter features...
Before feature removal:  6
After feature removal:  6


# XGB Fine-Tuning

In [340]:
def _sweep_xgb_parameter(X_train, y_train, X_test, y_test, param_name, par_val,
                         best_depth=8, best_estimator=1000):
    """Fit one XGBClassifier per value of a single hyper-parameter, keep the best recall.

    Parameters
    ----------
    X_train, y_train : training inputs and labels.
    X_test, y_test : inputs and labels the candidates are scored on.
    param_name : keyword name of the XGBClassifier parameter being varied.
    par_val : iterable of values to try for that parameter.
    best_depth, best_estimator : fixed max_depth / n_estimators for every fit.

    Returns
    -------
    max_recall, max_score, best_para, best_cm
        Highest recall seen, the accuracy and parameter value that go with it,
        and the matching confusion matrix. All four are 0 when no candidate
        reaches a recall above 0.
    """
    max_recall = 0
    max_score = 0
    best_para = 0
    best_cm = 0

    for value in par_val:
        model = XGBClassifier(n_estimators=best_estimator, max_depth=best_depth,
                              **{param_name: value})
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        if recall > max_recall:
            max_recall = recall
            max_score = score
            best_para = value
            best_cm = confusion_matrix(y_test, y_pred)

    return max_recall, max_score, best_para, best_cm


def para_model_lr(X_train, y_train, X_test, y_test, par_val):
    """Sweep XGBoost's ``learning_rate`` over `par_val`; see _sweep_xgb_parameter."""
    return _sweep_xgb_parameter(X_train, y_train, X_test, y_test, 'learning_rate', par_val)


def para_model_subsample(X_train, y_train, X_test, y_test, par_val):
    """Sweep XGBoost's ``subsample`` over `par_val`; see _sweep_xgb_parameter."""
    return _sweep_xgb_parameter(X_train, y_train, X_test, y_test, 'subsample', par_val)


def para_model_gamma(X_train, y_train, X_test, y_test, par_val):
    """Sweep XGBoost's ``gamma`` over `par_val`; see _sweep_xgb_parameter."""
    return _sweep_xgb_parameter(X_train, y_train, X_test, y_test, 'gamma', par_val)


def para_model_alpha(X_train, y_train, X_test, y_test, par_val):
    """Sweep XGBoost's ``reg_alpha`` over `par_val`; see _sweep_xgb_parameter."""
    return _sweep_xgb_parameter(X_train, y_train, X_test, y_test, 'reg_alpha', par_val)

# Optimizing Sentiment XGB

In [341]:
X_train, y_train, X_test, y_test = train_test_split_own(model_df, 60)

# Keep the fitted StandardScaler under its own name. Binding it to `scaler`
# would replace the scaler() helper function, and any later run_opt() call
# would then fail because a StandardScaler object is not callable.
feature_scaler = StandardScaler().fit(X_train)

X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# Candidate values: 0.00-0.45 for the learning rate, 0.00-0.95 for the rest
coarse_grid = [i/100 for i in range(0,50, 5)]
fine_grid = [i/100 for i in range(0,100, 5)]

# (printed label, sweep function, candidate values), run in this order
sweeps = [
    ('learning rate ...', para_model_lr, coarse_grid),
    ('gamma ...', para_model_gamma, fine_grid),
    ('subsample...', para_model_subsample, fine_grid),
    ('alpha...', para_model_alpha, fine_grid),
]

for label, sweep, par_val in sweeps:
    print(label)
    max_recall, max_score, best_para, cm = sweep(X_train, y_train, X_test, y_test, par_val)
    print('Recall ', max_recall)
    print('Accuracy ', max_score)
    print('Parameter ',best_para)
    print(cm)
    print('')

learning rate ...
Recall  0.5545977011494253
Accuracy  0.5520421607378129
Parameter  0.3
[[226 185]
 [155 193]]

gamma ...
Recall  0.5316091954022989
Accuracy  0.5480895915678524
Parameter  0.0
[[231 180]
 [163 185]]

subsample...
Recall  0.5833333333333334
Accuracy  0.5401844532279315
Parameter  0.05
[[207 204]
 [145 203]]

alpha...
Recall  0.5316091954022989
Accuracy  0.5480895915678524
Parameter  0.0
[[231 180]
 [163 185]]



# Optimizing financial XGB

In [342]:
X_train, y_train, X_test, y_test = train_test_split_own(financial_only, 60)

# Keep the fitted StandardScaler under its own name. Binding it to `scaler`
# would replace the scaler() helper function, and any later run_opt() call
# would then fail because a StandardScaler object is not callable.
feature_scaler = StandardScaler().fit(X_train)


X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# Candidate values: 0.00-0.45 for the learning rate, 0.00-0.95 for the rest
coarse_grid = [i/100 for i in range(0,50, 5)]
fine_grid = [i/100 for i in range(0,100, 5)]

# (printed label, sweep function, candidate values), run in this order
sweeps = [
    ('learning rate ...', para_model_lr, coarse_grid),
    ('gamma ...', para_model_gamma, fine_grid),
    ('subsample...', para_model_subsample, fine_grid),
    ('alpha...', para_model_alpha, fine_grid),
]

for label, sweep, par_val in sweeps:
    print(label)
    max_recall, max_score, best_para, cm = sweep(X_train, y_train, X_test, y_test, par_val)
    print('Recall ', max_recall)
    print('Accuracy ', max_score)
    print('Parameter ',best_para)
    print(cm)
    print('')

learning rate ...
Recall  0.4885057471264368
Accuracy  0.5428194993412385
Parameter  0.2
[[242 169]
 [178 170]]

gamma ...
Recall  0.49712643678160917
Accuracy  0.546772068511199
Parameter  0.45
[[242 169]
 [175 173]]

subsample...
Recall  0.5114942528735632
Accuracy  0.541501976284585
Parameter  0.1
[[233 178]
 [170 178]]

alpha...
Recall  0.47988505747126436
Accuracy  0.541501976284585
Parameter  0.0
[[244 167]
 [181 167]]



In [331]:
# Final Model
X_train, y_train, X_test, y_test = train_test_split_own(model_df, 60)

# Keep the fitted StandardScaler under its own name. Binding it to `scaler`
# would replace the scaler() helper function that run_opt() calls.
feature_scaler = StandardScaler().fit(X_train)

X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# Depth and estimator count match the values fixed inside the para_model_*
# sweeps; subsample 0.05 is the value the sentiment subsample sweep reported.
best_depth = 8
best_estimator = 1000
sub = 0.05

model = XGBClassifier(n_estimators=best_estimator, max_depth=best_depth, subsample=sub)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Recall ', recall)
print(confusion_matrix(y_test, y_pred))

feature_importance(model_df, model, 'optimized sentiment XGB')

Recall  0.5833333333333334
[[207 204]
 [145 203]]
