In [27]:
# What version of Python do you have?
import sys

from collections import Counter

import tensorflow.keras
import pandas as pd
import sklearn as sk
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
import tensorflow as tf
import seaborn as sns
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import demoji
import datetime

print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")

Tensor Flow Version: 1.13.1
Keras Version: 2.2.4-tf

Python 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:01:53) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas 1.0.1
Scikit-Learn 0.22.1
GPU is NOT AVAILABLE


In [2]:
stock_data_df = pd.read_csv('GOOG.csv')

In [3]:
stock_data_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-02-06,1139.569946,1147.0,1112.77002,1115.22998,1115.22998,2105600
1,2019-02-07,1104.160034,1104.839966,1086.0,1098.709961,1098.709961,2044800
2,2019-02-08,1087.0,1098.910034,1086.550049,1095.060059,1095.060059,1075800
3,2019-02-11,1096.949951,1105.944946,1092.859985,1095.01001,1095.01001,1065200
4,2019-02-12,1106.800049,1125.295044,1105.849976,1121.369995,1121.369995,1609100


In [4]:
dates = stock_data_df.loc[:, 'Date'].to_list()
close_prices = stock_data_df.loc[:, 'Adj Close'].to_list()
volumes = stock_data_df.loc[:, 'Volume'].to_list()

- stock price feature fields:
    - Date
    - Past N price
    - Past N - 1 price
    - ...
    - Past 1 price
    - Label

In [5]:
time_window = 2

In [6]:
def stock_price_feature_label_generator(time_window = 3):
    if time_window <= 0: 
        raise ValueError("argument time_window has to be at least 1")
    # use "time_window" previous days to predict the movement of the next day
    label = [-1] + [1 if close_prices[i] > close_prices[i - 1] else 0 for i in range(1, len(close_prices))]
    result = pd.DataFrame(dates[time_window: ], columns = ['Date'])
    result.insert(loc=result.shape[1], column="Label", value=label[time_window: ])
    for i in range(time_window):
        result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Day Adj. Price", value=np.divide(close_prices[i: len(close_prices) - time_window + i], volumes[i: len(volumes) - time_window + i]))
#     for i in range(time_window):
#         result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Day Price", value=close_prices[i: len(close_prices) - time_window + i])
#     for i in range(time_window):
#         result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Volume", value=volumes[i: len(volumes) - time_window + i])
    return result

In [7]:
# stock_price_feature = stock_price_feature_label_generator(time_window)
# stock_price_feature.head()

In [9]:
only_date_and_label = stock_price_feature_label_generator(1).drop(columns=['Past 1 Day Adj. Price'])

In [10]:
only_date_and_label.head()

Unnamed: 0,Date,Label
0,2019-02-07,0
1,2019-02-08,0
2,2019-02-11,0
3,2019-02-12,1
4,2019-02-13,0


In [34]:
stock_price_feature = stock_data_df[['Date', 'Adj Close']]
stock_price_feature['Label'] = [-1] + only_date_and_label.Label.to_list()
stock_price_feature = stock_price_feature[['Date', 'Label', 'Adj Close']]
stock_price_feature.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Date,Label,Adj Close
0,2019-02-06,-1,1115.22998
1,2019-02-07,0,1098.709961
2,2019-02-08,0,1095.060059
3,2019-02-11,0,1095.01001
4,2019-02-12,1,1121.369995


In [11]:
aggregated_vector = pd.read_csv("aggregated_vector.csv")

In [12]:
aggregated_vector.shape

(366, 801)

In [13]:
only_date_and_label.shape

(252, 2)

In [35]:
feature_with_label_date = stock_price_feature.merge(aggregated_vector, left_on="Date", right_on="Date")

In [38]:
feature_with_label_date.head()

Unnamed: 0,Date,Label,Adj Close,0,1,2,3,4,5,6,...,num_pos_VB,num_pos_NN,num_neg_JJ,num_neg_RB,num_neg_VB,num_neg_NN,total_score_JJ,total_score_RB,total_score_VB,total_score_NN
0,2019-02-06,-1,1115.22998,-31.065374,26.964167,207.587202,-29.100772,95.531243,-118.985815,-24.375886,...,3.998121,55.419917,53.80289,42.127785,0.0,59.779481,23.410036,4.487775,1.999061,9.4315
1,2019-02-07,0,1098.709961,-1.954738,-14.44165,63.646112,-10.731836,39.55415,-40.86359,-10.272327,...,2.0,18.0,37.0,8.0,2.0,12.0,5.026641,2.214286,0.675,6.566667
2,2019-02-08,0,1095.060059,5.805648,-10.267495,53.630592,0.306161,39.197539,-29.663042,-12.035486,...,3.0,24.0,16.0,1.0,1.0,12.0,8.259351,2.35,0.9,9.3
3,2019-02-11,0,1095.01001,12.10934,-1.015043,59.91978,-12.517981,36.486913,-34.995952,-12.47168,...,2.0,22.0,20.0,4.0,0.0,10.0,9.891429,2.069444,1.6,8.6
4,2019-02-12,1,1121.369995,6.390824,-82.13035,211.913261,20.25217,122.859405,-164.221742,-45.784676,...,3.419543,45.26685,134.814683,4.0,1.0,12.0,16.288132,10.861708,1.267817,18.083712


In [41]:
feature_without_label_date = feature_with_label_date.iloc[:, 3:]

In [42]:
feature_without_label_date.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,num_pos_VB,num_pos_NN,num_neg_JJ,num_neg_RB,num_neg_VB,num_neg_NN,total_score_JJ,total_score_RB,total_score_VB,total_score_NN
0,-31.065374,26.964167,207.587202,-29.100772,95.531243,-118.985815,-24.375886,138.741649,-99.359529,-59.658105,...,3.998121,55.419917,53.80289,42.127785,0.0,59.779481,23.410036,4.487775,1.999061,9.4315
1,-1.954738,-14.44165,63.646112,-10.731836,39.55415,-40.86359,-10.272327,47.880675,-21.62084,-11.568443,...,2.0,18.0,37.0,8.0,2.0,12.0,5.026641,2.214286,0.675,6.566667
2,5.805648,-10.267495,53.630592,0.306161,39.197539,-29.663042,-12.035486,48.976293,-21.195179,-13.73842,...,3.0,24.0,16.0,1.0,1.0,12.0,8.259351,2.35,0.9,9.3
3,12.10934,-1.015043,59.91978,-12.517981,36.486913,-34.995952,-12.47168,36.736582,-16.731461,-14.205209,...,2.0,22.0,20.0,4.0,0.0,10.0,9.891429,2.069444,1.6,8.6
4,6.390824,-82.13035,211.913261,20.25217,122.859405,-164.221742,-45.784676,166.307355,-66.180203,-45.345804,...,3.419543,45.26685,134.814683,4.0,1.0,12.0,16.288132,10.861708,1.267817,18.083712


- adding lagged info

In [None]:
# regenerate feature with time window

historial_feature_columns = [aggregated_vector.columns.to_list()[0]]
for lag_num in range(1, time_window + 1):
    historial_feature_columns += [fea + '_lag' + str(lag_num) for fea in aggregated_vector.columns.to_list()[1:]] 
    
feature_with_lagged_info = pd.DataFrame(columns=historial_feature_columns)

for i in range(len(feature_without_label_date) - 1, time_window - 1, -1):
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Generating: %.02f" % ((len(feature_without_label_date) - 1 - i) / (len(feature_without_label_date) - 1 - time_window) * 100))
    sys.stdout.flush()
    
    # concatenate lagged feature horizontally to add temporal info
    prev_N_days_sum_vector = feature_without_label_date.loc[i - 1].to_numpy()
    for delta in range(2, 1 + time_window):
        prev_N_days_sum_vector = np.concatenate((prev_N_days_sum_vector, 
                                                 feature_without_label_date.loc[i - delta].to_numpy()), 
                                                axis=None)

    # normalize
    # prev_N_days_sum_vector = prev_N_days_sum_vector / np.linalg.norm(prev_N_days_sum_vector)
    
    # add date
    feature_with_date = np.concatenate((np.array([feature_with_label_date.iloc[i, 0]]), prev_N_days_sum_vector))
    new_row_df = pd.DataFrame(feature_with_date).transpose()
    
    # generate column name for lagged feature
    new_row_df.columns = historial_feature_columns
    
    # insert row
    feature_with_lagged_info = pd.concat([new_row_df, feature_with_lagged_info], ignore_index=True)

Generating: 43.60

In [53]:
feature_with_lagged_info = pd.DataFrame(columns=aggregated_vector.columns)
feature_with_lagged_info.head()

Unnamed: 0,Date,0,1,2,3,4,5,6,7,8,...,num_pos_VB,num_pos_NN,num_neg_JJ,num_neg_RB,num_neg_VB,num_neg_NN,total_score_JJ,total_score_RB,total_score_VB,total_score_NN


In [50]:
feature_with_lagged_info.haed()

Unnamed: 0,Date,0_lag1,1_lag1,2_lag1,3_lag1,4_lag1,5_lag1,6_lag1,7_lag1,8_lag1,...,num_pos_VB,num_pos_NN,num_neg_JJ,num_neg_RB,num_neg_VB,num_neg_NN,total_score_JJ,total_score_RB,total_score_VB,total_score_NN
0,2019-02-08,-1.9547378430000009,-14.441649583200002,63.64611239099997,-10.731835941300002,39.55415032889999,-40.86358978569999,-10.272326656500006,47.88067524670001,-21.6208396538,...,,,,,,,,,,
1,2019-02-11,5.805648479300003,-10.267494661,53.630592168999996,0.30616061276000045,39.197539260200024,-29.6630420611,-12.035485623449993,48.976292749400024,-21.195179270000004,...,,,,,,,,,,
2,2019-02-12,12.109339575400007,-1.0150432949599992,59.91977977999998,-12.5179810023,36.486913223,-34.9959524369,-12.471680062899999,36.736581938,-16.731460817000002,...,,,,,,,,,,
3,2019-02-13,6.390823674223785,-82.13035037952076,211.91326138538446,20.25216950397589,122.85940453449659,-164.22174159824536,-45.784676431630146,166.3073545051445,-66.18020295728107,...,,,,,,,,,,
4,2019-02-14,0.17688897399999778,-9.64450826,68.011613363,-8.693842723800001,35.41786913270001,-39.69494913759999,-15.748714003899995,35.07303002029999,-13.761980877699996,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2020-01-31,-18.916264538515502,-58.80860428688371,153.36144815963658,-34.2866146670432,52.54305183443851,-106.2027230531557,-15.091874646179605,145.87655446653062,-89.85925749506737,...,,,,,,,,,,
247,2020-02-03,10.971949541299997,-18.3032363486,78.12543942999999,-13.254866407300007,50.7353863082,-46.220989427299976,-17.041574638199993,52.73887802830003,-34.945544516600016,...,,,,,,,,,,
248,2020-02-04,76.9452306965438,-183.5716723032944,558.6775934596141,-4.645623426619664,195.66549944157146,-269.8816737445945,-104.52817828611549,501.6433080147235,-351.42633289929626,...,,,,,,,,,,
249,2020-02-05,10.652329529262062,-92.97790162115265,290.5389107669862,22.162449371609387,87.50611519088693,-172.8079820152349,-44.02213733564594,256.1285303801866,-145.85243192728555,...,,,,,,,,,,


- fusion with prev price and volume

In [46]:
fused_feature = stock_price_feature.merge(feature_with_lagged_info, left_on="Date", right_on="Date")

- check imbalance

In [47]:
np.count_nonzero(fused_feature.Label.to_list()) / len(fused_feature)

0.549800796812749

In [48]:
fused_feature.head()

Unnamed: 0,Date,Label,Adj Close,0_lag1,1_lag1,2_lag1,3_lag1,4_lag1,5_lag1,6_lag1,...,num_pos_VB,num_pos_NN,num_neg_JJ,num_neg_RB,num_neg_VB,num_neg_NN,total_score_JJ,total_score_RB,total_score_VB,total_score_NN
0,2019-02-08,0,1095.060059,-1.9547378430000009,-14.441649583200002,63.64611239099997,-10.731835941300002,39.55415032889999,-40.86358978569999,-10.272326656500006,...,,,,,,,,,,
1,2019-02-11,0,1095.01001,5.805648479300003,-10.267494661,53.630592169,0.3061606127600004,39.19753926020002,-29.6630420611,-12.035485623449992,...,,,,,,,,,,
2,2019-02-12,1,1121.369995,12.109339575400009,-1.0150432949599992,59.91977977999998,-12.5179810023,36.486913223,-34.9959524369,-12.4716800629,...,,,,,,,,,,
3,2019-02-13,0,1120.160034,6.390823674223785,-82.13035037952076,211.9132613853845,20.25216950397589,122.8594045344966,-164.22174159824536,-45.78467643163015,...,,,,,,,,,,
4,2019-02-14,1,1121.670044,0.1768889739999977,-9.64450826,68.011613363,-8.693842723800001,35.41786913270001,-39.69494913759999,-15.748714003899996,...,,,,,,,,,,


- import ML lib

In [None]:
# from bert_serving.client import BertClient
import math
from string import punctuation
import matplotlib.pyplot as plt
import re
import os
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# import warnings
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, GridSearchCV
import joblib
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.svm import SVC
import xgboost as xgb

# feature selection
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [None]:
fused_feature.iloc[:, 1:].shape

In [None]:
# import seaborn as sns

# # get first 20 dimensions from the 768-dimensional vector
# feature = fused_feature.iloc[:, 1:21]
# corr = feature.corr()
# _ = plt.figure(figsize=(30, 30))
# ax = sns.heatmap(
#     corr, 
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=11), # better if the palette color num is odd
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# );

- feature selection

In [None]:
# apply SelectKBest class to extract top 10 best features
X = fused_feature.iloc[:, 2:]
y = fused_feature.Label
bestfeatures = SelectKBest(score_func=f_classif, k=25)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(25, 'Score'))  # print 10 best features

In [None]:
selected_feature_columns = np.add(2, featureScores.nlargest(110, 'Score').index.to_list())

In [None]:
def string_to_datetime(string):
    date_parts = [int(part) for part in string.split('-')]
    return datetime.date(date_parts[0], date_parts[1], date_parts[2])

In [None]:
fused_feature.Date = fused_feature.Date.apply(lambda x: string_to_datetime(x))
fused_feature.to_csv("final_feature_from_twitter.csv", index=False)

- train test split

In [None]:
# X = fused_feature.iloc[:, 2:].to_numpy()
X = fused_feature.iloc[:, selected_feature_columns].to_numpy()
y = fused_feature.Label.to_list()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
xgb_model = xgb.XGBClassifier(learning_rate=0.03,
                                n_estimators=600,
                                max_depth=7,
                                min_child_weight=5,
                                gamma=0,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                objective='binary:logistic',
                                nthread=4,
                                scale_pos_weight=1,
                                seed=27)
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)[:,1]

In [None]:
print("\nModel Report")
print("Accuracy : %.4g" % accuracy_score(y_test, y_pred))
print("AUC Score (Train): %f" % roc_auc_score(y_test, y_pred_prob))