# All the libraries used in this notebook



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from tensorflow import keras
import pickle
import datetime
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Loading the Apple one-gram features for the 1-day time interval

In [None]:
# Read the labelled AAPL one-gram features object back from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
aapl_features_path = '/content/drive/MyDrive/SWM/CSE573-SWM-Data/aapl_one_gram_features_labelled_df.pkl'
with open(aapl_features_path, mode='rb') as pickle_file:
    labelled_aapl_news_df = pickle.load(pickle_file)

# Extracting the feature and label arrays used for training and testing

In [None]:
# Pull the 'features' and 'label' columns out of the loaded object; labels are
# cast to int before being turned into an array.
trainX_df = labelled_aapl_news_df['features']
trainY_df = labelled_aapl_news_df['label'].astype('int')

# Stack the per-row entries of each column into NumPy arrays.
trainX = np.stack(trainX_df)
trainY = np.stack(trainY_df)



In [None]:
# Show the shape of the feature matrix, then of the label vector, one per line.
print(trainX.shape, trainY.shape, sep='\n')

(71941, 703)
(71941,)


# Feature selection using chi2

In [None]:
# Keep the 500 features that score highest on the chi-squared test against the
# labels, then hold out 20% of the rows for testing (fixed seed, so the split
# is repeatable).
# NOTE(review): the selector is fitted on every row before the split, so the
# held-out rows also influence which features are kept.
chi2_selector = SelectKBest(score_func=chi2, k=500)
chi_trainX = chi2_selector.fit_transform(trainX, trainY)
X_train, X_test, y_train, y_test = train_test_split(
    chi_trainX,
    trainY,
    test_size=0.2,
    random_state=23,
)
print(X_train.shape, y_train.shape, sep='\n')

(57552, 500)
(57552,)


# Model Training

Logistic Regression 

*   Accuracy: 56.97%

In [None]:
print("training Logistic Regression now ...")

# This model uses its own feature selection (k=700) followed by the same
# seeded 80/20 split.
chi2_selector = SelectKBest(score_func=chi2, k=700)
chi_trainX = chi2_selector.fit_transform(trainX, trainY)
X_train, X_test, y_train, y_test = train_test_split(
    chi_trainX,
    trainY,
    test_size=0.2,
    random_state=23,
)

# Fit on the training rows, then score predictions on the held-out rows.
logistic_regression = LogisticRegression(max_iter=1000, verbose=15)
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of Logistic Regression classifier is: %f ' % acc)

training Logistic Regression now ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy of Logistic Regression classifier is: 0.569741 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.6s finished


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are true labels and columns are predictions, both ordered as [1, -1].
label_order = [1, -1]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)

# Flattening row by row gives the counts as tp, fn, fp, tn when 1 is treated
# as the positive class.
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Per-label precision, recall, f1-score and support.
# `matrix` now holds the report text rather than the array.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)

Confusion matrix : 
 [[3293 3569]
 [2622 4905]]
Outcome values : 
 3293 3569 2622 4905
Classification report : 
               precision    recall  f1-score   support

           1       0.56      0.48      0.52      6862
          -1       0.58      0.65      0.61      7527

    accuracy                           0.57     14389
   macro avg       0.57      0.57      0.56     14389
weighted avg       0.57      0.57      0.57     14389



Decision Tree

*   Accuracy: 68.26%

In [None]:
print("training Decision Tree now ...")

# Select the 500 highest-scoring chi-squared features and redo the seeded
# 80/20 split for this model.
chi2_selector = SelectKBest(score_func=chi2, k=500)
chi_trainX = chi2_selector.fit_transform(trainX, trainY)
X_train, X_test, y_train, y_test = train_test_split(
    chi_trainX,
    trainY,
    test_size=0.2,
    random_state=23,
)

# The tree is seeded (random_state=0); fit on the training rows and score the
# predictions on the held-out rows.
decision_tree = tree.DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of Decision Tree classifier is: %f ' % acc)

training Decision Tree now ...
Accuracy of Decision Tree classifier is: 0.682674 


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are true labels and columns are predictions, both ordered as [1, -1].
label_order = [1, -1]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)

# Flattening row by row gives the counts as tp, fn, fp, tn when 1 is treated
# as the positive class.
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Per-label precision, recall, f1-score and support.
# `matrix` now holds the report text rather than the array.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)

Confusion matrix : 
 [[4567 2295]
 [2271 5256]]
Outcome values : 
 4567 2295 2271 5256
Classification report : 
               precision    recall  f1-score   support

           1       0.67      0.67      0.67      6862
          -1       0.70      0.70      0.70      7527

    accuracy                           0.68     14389
   macro avg       0.68      0.68      0.68     14389
weighted avg       0.68      0.68      0.68     14389



Random Forest Classification

*   Accuracy: 72.01%

In [None]:
print("training Random Forest classifier now ...")

# Select the 500 highest-scoring chi-squared features and redo the seeded
# 80/20 split for this model.
chi2_selector = SelectKBest(chi2, k = 500)
chi_trainX = chi2_selector.fit_transform(trainX, trainY)
X_train, X_test, y_train, y_test = train_test_split(chi_trainX, trainY, test_size=0.2, random_state=23)

# random_state seeds the forest's own randomness so that re-running the cell
# reproduces the same accuracy; without it the score changes from run to run.
# verbose is kept low so the saved output is not flooded with a log line for
# every tree that is built.
# NOTE(review): an earlier comment here said to increase n_estimators "if you
# face issues in training"; more trees lengthen training, so confirm which
# direction was intended before tuning it.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=23,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred = random_forest_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy of Random Forest classifier is: %f ' % acc)

training Random Forest classifier now ...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100building tree 2 of 100



[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.2s


building tree 3 of 100
building tree 4 of 100


[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.2s


building tree 5 of 100


[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.4s


building tree 6 of 100


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.2s


building tree 7 of 100


[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.5s


building tree 8 of 100


[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    4.5s


building tree 9 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.8s


building tree 10 of 100


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.6s


building tree 11 of 100


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.0s


building tree 12 of 100


[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    6.8s


building tree 13 of 100


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    7.2s


building tree 14 of 100


[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    8.1s


building tree 15 of 100
building tree 16 of 100


[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.6s


building tree 17 of 100
building tree 18 of 100


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.7s


building tree 19 of 100
building tree 20 of 100


[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:   11.9s


building tree 21 of 100
building tree 22 of 100


[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed:   13.1s


building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   14.2s


building tree 25 of 100
building tree 26 of 100


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   15.4s


building tree 27 of 100
building tree 28 of 100


[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   16.6s


building tree 29 of 100
building tree 30 of 100


[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   17.6s


building tree 31 of 100
building tree 32 of 100


[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   18.8s


building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:   20.8s


building tree 37 of 100


[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:   21.0s


building tree 38 of 100


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   22.1s


building tree 39 of 100
building tree 40 of 100


[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   23.2s


building tree 41 of 100
building tree 42 of 100


[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:   24.0s


building tree 43 of 100


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.4s


building tree 44 of 100


[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:   25.2s


building tree 45 of 100


[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:   25.5s


building tree 46 of 100


[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   26.2s


building tree 47 of 100


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   26.6s


building tree 48 of 100


[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:   27.5s


building tree 49 of 100


[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   27.8s


building tree 50 of 100


[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   28.7s


building tree 51 of 100


[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   29.1s


building tree 52 of 100


[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:   29.9s


building tree 53 of 100


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   30.3s


building tree 54 of 100


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   31.3s


building tree 55 of 100
building tree 56 of 100


[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   32.5s


building tree 57 of 100
building tree 58 of 100


[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   33.4s


building tree 59 of 100


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:   33.6s


building tree 60 of 100


[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:   34.7s


building tree 61 of 100
building tree 62 of 100


[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:   35.8s


building tree 63 of 100
building tree 64 of 100


[Parallel(n_jobs=-1)]: Done  63 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   37.0s


building tree 65 of 100
building tree 66 of 100


[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   38.1s


building tree 67 of 100
building tree 68 of 100


[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   39.3s


building tree 69 of 100
building tree 70 of 100


[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:   40.5s


building tree 71 of 100
building tree 72 of 100


[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:   41.7s


building tree 73 of 100
building tree 74 of 100


[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   42.8s


building tree 75 of 100
building tree 76 of 100


[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:   43.9s


building tree 77 of 100
building tree 78 of 100


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:   45.0s


building tree 79 of 100
building tree 80 of 100


[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:   46.2s


building tree 81 of 100
building tree 82 of 100


[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   47.4s


building tree 83 of 100
building tree 84 of 100


[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:   48.6s


building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100


[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done  87 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   50.8s


building tree 89 of 100
building tree 90 of 100


[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   51.9s


building tree 91 of 100
building tree 92 of 100


[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed:   52.9s


building tree 93 of 100


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:   53.1s


building tree 94 of 100
building tree 95 of 100


[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   54.3s


building tree 96 of 100


[Parallel(n_jobs=-1)]: Done  95 tasks      | elapsed:   55.2s


building tree 97 of 100


[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   55.5s


building tree 98 of 100


[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   56.4s


building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   57.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  11 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  13 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    

Accuracy of Random Forest classifier is: 0.720064 


[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done  95 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done  96 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done  97 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.8s finished


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are true labels and columns are predictions, both ordered as [1, -1].
label_order = [1, -1]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)

# Flattening row by row gives the counts as tp, fn, fp, tn when 1 is treated
# as the positive class.
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Per-label precision, recall, f1-score and support.
# `matrix` now holds the report text rather than the array.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)

Confusion matrix : 
 [[4743 2119]
 [1909 5618]]
Outcome values : 
 4743 2119 1909 5618
Classification report : 
               precision    recall  f1-score   support

           1       0.71      0.69      0.70      6862
          -1       0.73      0.75      0.74      7527

    accuracy                           0.72     14389
   macro avg       0.72      0.72      0.72     14389
weighted avg       0.72      0.72      0.72     14389



SVM Classifier

*   Accuracy: 66.56%

In [None]:
print("training SVM classifier now ...")

# Build this cell's own feature selection and split instead of reusing whatever
# X_train/X_test the previously executed cell left behind. k=500 and the seeded
# 80/20 split match what the Random Forest cell produces on a top-to-bottom
# run, so the SVM sees the same data regardless of execution order.
chi2_selector = SelectKBest(chi2, k = 500)
chi_trainX = chi2_selector.fit_transform(trainX, trainY)
X_train, X_test, y_train, y_test = train_test_split(chi_trainX, trainY, test_size=0.2, random_state=23)

# Fit an SVC with default hyper-parameters and score it on the held-out rows.
svm_classifier = svm.SVC(verbose=12).fit(X_train,y_train)
y_pred = svm_classifier.predict(X_test)
acc = accuracy_score(y_test,y_pred)
print('Accuracy of SVM classifier is: %f ' % acc)

# Timestamp printed once training and scoring have finished.
print("current time:-", datetime.datetime.now())

training SVM classifier now ...
[LibSVM]Accuracy of SVM classifier is: 0.665578 
current time:- 2021-04-08 09:24:34.290903


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are true labels and columns are predictions, both ordered as [1, -1].
label_order = [1, -1]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)

# Flattening row by row gives the counts as tp, fn, fp, tn when 1 is treated
# as the positive class.
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Per-label precision, recall, f1-score and support.
# `matrix` now holds the report text rather than the array.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)

Confusion matrix : 
 [[3675 3187]
 [1625 5902]]
Outcome values : 
 3675 3187 1625 5902
Classification report : 
               precision    recall  f1-score   support

           1       0.69      0.54      0.60      6862
          -1       0.65      0.78      0.71      7527

    accuracy                           0.67     14389
   macro avg       0.67      0.66      0.66     14389
weighted avg       0.67      0.67      0.66     14389

