In [6]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
import tensorflow as tf
from tensorflow import keras
import pickle
import datetime
import numpy as np

In [2]:
# Load features and labels data

with open('aapl_two_gram_features_labelled_df.pkl', 'rb') as f:
    labelled_aapl_news_df = pickle.load(f)

In [3]:
trainX_df=labelled_aapl_news_df['features']
trainY_df=labelled_aapl_news_df['label']

trainX = np.stack(trainX_df)

trainY_df=trainY_df.astype('int')
trainY = np.stack(trainY_df)

print(trainX.shape)
print(trainY.shape)
trainX

(71941, 3831)
(71941,)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [4]:
pca = PCA(n_components=3800)
pComponents= pca.fit_transform(trainX)
print(pca.explained_variance_ratio_)
sum([float(x) for x in pca.explained_variance_ratio_])

[8.01939841e-02 4.59203231e-02 3.82233813e-02 ... 7.04995361e-34
 7.04995361e-34 7.04995361e-34]


1.0000000000000004

In [5]:
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.2, random_state=23)

In [6]:
print(X_train.shape)
print(y_train.shape)

(57552, 3831)
(57552,)


In [8]:
print("training Logistic Regression now ...")

logistic_regression = LogisticRegression(max_iter=1000, verbose=15, n_jobs=-1).fit(X_train,y_train)
y_pred = logistic_regression.predict(X_test)
acc = accuracy_score(y_test,y_pred)
print('Accuracy of Logistic Regression classifier is: %f ' % acc)

Accuracy of Logistic Regression classifier is: 0.598652 


In [10]:
print("training Decision Tree now ...")
decision_tree = tree.DecisionTreeClassifier(random_state=0).fit(X_train,y_train)
y_pred = decision_tree.predict(X_test)
acc=accuracy_score(y_test,y_pred)
print('Accuracy of Decision Tree classifier is: %f ' % acc)

training decision tree now


Accuracy of Decision Tree classifier is: 0.665578 


In [16]:
print("training Random Forest classifier now ...")

random_forest_classifier = RandomForestClassifier(n_estimators=100, verbose=15, n_jobs=-1).fit(X_train,y_train)
#increase n_estimators if you face issues in training
y_pred=random_forest_classifier.predict(X_test)
acc=accuracy_score(y_test,y_pred)
print('Accuracy of Random Forest classifier is: %f ' % acc)

training Random Forest classifier now ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 222.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy of Random Forest classifier is: 0.678157 


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   45.1s finished


In [4]:
print("current time:-", datetime.datetime.now())

pca = PCA(n_components=1500)
pComponents= pca.fit_transform(trainX)
print(pca.explained_variance_ratio_)
print(sum([float(x) for x in pca.explained_variance_ratio_]))

X_train, X_test, y_train, y_test = train_test_split(pComponents, trainY, test_size=0.2, random_state=23)
print(X_train.shape)
print(y_train.shape)

print("training SVM classifier now ...")

svm_classifier = svm.SVC(verbose=12).fit(X_train,y_train)
y_pred = svm_classifier.predict(X_test)
acc = accuracy_score(y_test,y_pred)
print('Accuracy of SVM classifier is: %f ' % acc)
print("current time:-", datetime.datetime.now())

current time:- 2021-04-05 14:56:09.711582
[0.08019398 0.04592032 0.03822338 ... 0.00010264 0.00010252 0.00010225]
0.8816114817322985
(57552, 1500)
(57552,)
training SVM classifier now ...
[LibSVM]Accuracy of SVM classifier is: 0.619223 
current time:- 2021-04-05 17:10:51.641437


In [7]:
pca = PCA(n_components=50)
pComponents= pca.fit_transform(trainX)
print(pca.explained_variance_ratio_)
sum([float(x) for x in pca.explained_variance_ratio_])

X_train, X_test, y_train, y_test = train_test_split(pComponents, trainY, test_size=0.2, random_state=23)

print("training Deep Neural Network now ...")

dnn_model = keras.Sequential([
    keras.layers.Flatten(input_shape=(50,)),
    keras.layers.Dense(40, activation=tf.nn.relu),
    keras.layers.Dense(30, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

dnn_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("current time:-", datetime.datetime.now())
dnn_model.fit(X_train, y_train, epochs=50, batch_size=1)

test_loss, test_acc = dnn_model.evaluate(X_test, y_test)
print('Accuracy of Deep Neural Network is: %f ' % test_loss)
print("current time:-", datetime.datetime.now())

[0.08019398 0.04592032 0.03822338 0.02302324 0.01630195 0.01025567
 0.00907625 0.00836145 0.00744542 0.00688101 0.00629214 0.00582196
 0.00525544 0.00489219 0.00479393 0.00437124 0.00432898 0.00416345
 0.0039819  0.00377176 0.00371864 0.00371087 0.00361883 0.0032395
 0.00308442 0.00305171 0.00302348 0.00294405 0.00288199 0.00284649
 0.00278903 0.00269139 0.00264729 0.00262152 0.00258664 0.00256707
 0.00251749 0.00250193 0.00249337 0.00248105 0.00241319 0.00239169
 0.00229377 0.00226504 0.00225156 0.00223218 0.0021955  0.00214689
 0.00211538 0.00206738]
training Deep Neural Network now ...
current time:- 2021-04-05 17:26:05.007403
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50