# Amazon stock price movement prediciton using social network sentiment

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [32]:
from sklearn import svm,metrics

In [7]:
from sklearn.model_selection import train_test_split

In [27]:
from sklearn import preprocessing

## import dataset

In [4]:
dataset_file_path = "./data/data_set.csv"

In [5]:
dataset = pd.read_csv(dataset_file_path)

In [6]:
dataset.head()

Unnamed: 0,datetime_format,-1,0,1,-1.1,0.1,1.1,-1.2,0.2,1.2,-1.3,0.3,1.3,stock_price_movement
0,2018/1/3,39,33,106,44,11,69,83,44,175,82.720004,38,174.808923,1
1,2018/1/4,69,34,143,115,20,170,184,54,313,180.893916,51,310.385097,1
2,2018/1/5,62,38,176,199,45,275,261,83,451,260.382077,76,449.895724,1
3,2018/1/8,65,45,158,82,28,147,147,73,305,146.260912,59,304.512629,1
4,2018/1/9,45,52,174,45,16,78,90,68,252,89.18364,57,249.169456,1


In [17]:
X = dataset.loc[:].copy()

In [18]:
X.drop(["stock_price_movement","datetime_format"],inplace=True,axis=1)

In [19]:
X.head()

Unnamed: 0,-1,0,1,-1.1,0.1,1.1,-1.2,0.2,1.2,-1.3,0.3,1.3
0,39,33,106,44,11,69,83,44,175,82.720004,38,174.808923
1,69,34,143,115,20,170,184,54,313,180.893916,51,310.385097
2,62,38,176,199,45,275,261,83,451,260.382077,76,449.895724
3,65,45,158,82,28,147,147,73,305,146.260912,59,304.512629
4,45,52,174,45,16,78,90,68,252,89.18364,57,249.169456


In [22]:
y = dataset["stock_price_movement"].copy()

## 分割数据集

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=8)

In [25]:
X_train.shape

(82, 12)

In [26]:
X_test.shape

(42, 12)

## using svm to predict the stock price movement
1. 未进行数据标准化（scale，normalize）

---

In [28]:
# create SVM classifier
classifier = svm.SVC(gamma=0.001)

In [29]:
# learn from training data
classifier.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
# 使用建立起来的模型，预测测试数据，并获得预测的结果
y_pred = classifier.predict(X_test)

In [33]:
print("Confusion matrix:\n%s"
    % metrics.confusion_matrix(y_test, y_pred))

Confusion matrix:
[[ 3 13]
 [ 1 25]]


In [37]:
print("Classification report for classifier %s:\n%s\n"
    % (classifier, metrics.classification_report(y_test, y_pred)))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.75      0.19      0.30        16
          1       0.66      0.96      0.78        26

avg / total       0.69      0.67      0.60        42




In [65]:
metrics.accuracy_score(y_test, y_pred)

0.6666666666666666

## using normalied data

In [39]:
standardized_X = preprocessing.scale(X)

In [40]:
normalized_X = preprocessing.normalize(X)

In [48]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(standardized_X, y, test_size=1/3., random_state=8)

In [49]:
# create SVM classifier
classifier = svm.SVC(gamma=0.001)

In [50]:
# learn from training data
classifier.fit(X_train_s,y_train_s)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
# 使用建立起来的模型，预测测试数据，并获得预测的结果
y_pred_s = classifier.predict(X_test_s)

In [52]:
print("Confusion matrix:\n%s"
    % metrics.confusion_matrix(y_test_s, y_pred_s))

Confusion matrix:
[[ 0 16]
 [ 0 26]]


In [53]:
print("Classification report for classifier %s:\n%s\n"
    % (classifier, metrics.classification_report(y_test_s, y_pred_s)))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        16
          1       0.62      1.00      0.76        26

avg / total       0.38      0.62      0.47        42




  'precision', 'predicted', average, warn_for)


In [54]:
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(normalized_X, y, test_size=1/3., random_state=8)

In [55]:
# create SVM classifier
classifier = svm.SVC(gamma=0.001)

In [56]:
# learn from training data
classifier.fit(X_train_n,y_train_n)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
# 使用建立起来的模型，预测测试数据，并获得预测的结果
y_pred_n = classifier.predict(X_test_n)

In [58]:
print("Confusion matrix:\n%s"
    % metrics.confusion_matrix(y_test_n, y_pred_n))

Confusion matrix:
[[ 0 16]
 [ 0 26]]


In [59]:
print("Classification report for classifier %s:\n%s\n"
    % (classifier, metrics.classification_report(y_test_n, y_pred_n)))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        16
          1       0.62      1.00      0.76        26

avg / total       0.38      0.62      0.47        42




  'precision', 'predicted', average, warn_for)


## using navie bayes

In [66]:
from sklearn.naive_bayes import GaussianNB

In [70]:
# import classification module
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# learn from training data
gnb = gnb.fit(X_train,y_train)

# using builded model to predict testing data
# and get prediction
y_pred = gnb.predict(X_test)

In [69]:
print(nb_model)

GaussianNB(priors=None)


In [72]:
print("Classification report for classifier %s:\n%s\n"
    % (classifier, metrics.classification_report(y_test, y_pred)))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.29      0.12      0.17        16
          1       0.60      0.81      0.69        26

avg / total       0.48      0.55      0.49        42




In [73]:
metrics.accuracy_score(y_test, y_pred)

0.5476190476190477

## using random forest

In [74]:
# import random forest classification module
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=40)

# learn from training data
clf = clf.fit(X_train,y_train)

# using builded model to predict testing data
# and get prediction
y_pred = clf.predict(X_test)

In [76]:
# using precision, recall & f1-score to measure our prediction accurray
print("Classification report for classifier %s:\n%s\n"
    % (clf, metrics.classification_report(y_test, y_pred)))

Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False):
             precision    recall  f1-score   support

         -1       0.58      0.44      0.50        16
          1       0.70      0.81      0.75        26

avg / total       0.66      0.67      0.65        42




In [77]:
metrics.accuracy_score(y_test, y_pred)

0.6666666666666666

## using decision tree

In [78]:
# import classification module
from sklearn import tree
clf = tree.DecisionTreeClassifier()

# learn from training data
clf = clf.fit(X_train,y_train)

# using builded model to predict testing data
# and get prediction
y_pred = clf.predict(X_test)

In [79]:
# using precision, recall & f1-score to measure our prediction accurray
print("Classification report for classifier %s:\n%s\n"
    % (clf, metrics.classification_report(y_test, y_pred)))

Classification report for classifier DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'):
             precision    recall  f1-score   support

         -1       0.50      0.56      0.53        16
          1       0.71      0.65      0.68        26

avg / total       0.63      0.62      0.62        42




In [80]:
metrics.accuracy_score(y_test, y_pred)

0.6190476190476191