# Amazon stock price movement prediciton using social network sentiment

model lists:
- svm
- naive bayes
- random forest
- decision tree

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import svm,metrics

In [3]:
from sklearn.model_selection import train_test_split

## import dataset

In [4]:
dataset_file_path = "./data/data_set.csv"

In [5]:
dataset = pd.read_csv(dataset_file_path)

In [6]:
X = dataset.loc[:].copy()

In [7]:
X.drop(["stock_price_movement","datetime_format"],inplace=True,axis=1)

In [8]:
y = dataset["stock_price_movement"].copy()

## 分割数据集

---

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [12]:
def model_fit_and_predict(model,X_train,y_train,X_test):
    model.fit(X_train,y_train)
    return model.predict(X_test)

In [28]:
def model_accuracy(y_test,y_pred):
    print("Classification report for classifier %s:\n%s\n"
        % (classifier, metrics.classification_report(y_test, y_pred)))
    print("model accuracy: ", metrics.accuracy_score(y_test, y_pred))

In [89]:
classifier = svm.SVC(gamma=0.001)

In [90]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [91]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=40)

In [92]:
from sklearn import tree
dtree = tree.DecisionTreeClassifier()

In [34]:
models = [{"name":"svm","model":classifier},
          {"name":"navie bayes","model":gnb},
          {"name":"random forest","model":clf},
          {"name":"decision tree","model":dtree}]

In [35]:
for model_item in models:
    print("="*90)
    print("Using ",model_item["name"],"to train and predict:")
    print('-'*90)
    y_pred = model_fit_and_predict(model_item["model"],X_train,y_train,X_test)
    model_accuracy(y_test,y_pred)

Using  svm to train and predict:
------------------------------------------------------------------------------------------
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.75      0.38      0.50         8
          1       0.76      0.94      0.84        17

avg / total       0.76      0.76      0.73        25


model accuracy:  0.76
Using  navie bayes to train and predict:
------------------------------------------------------------------------------------------
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001

## test different params of random forest model

In [40]:
# store the different random forest models
random_forest_models =[]

In [44]:
# create different random forest models
for i in range(1,10):
    model_item = {"name":'',"model":''}
    model_item["name"] = "random forest",i*10
    model_item["model"] = RandomForestClassifier(n_estimators=i*10)
    random_forest_models.append(model_item)

In [46]:
for model_item in random_forest_models:
    print("="*90)
    print("Using ",model_item["name"],"to train and predict:")
    print('-'*90)
    y_pred = model_fit_and_predict(model_item["model"],X_train,y_train,X_test)
    model_accuracy(y_test,y_pred)

Using  ('random forest', 10) to train and predict:
------------------------------------------------------------------------------------------
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.56      0.62      0.59         8
          1       0.81      0.76      0.79        17

avg / total       0.73      0.72      0.72        25


model accuracy:  0.72
Using  ('random forest', 20) to train and predict:
------------------------------------------------------------------------------------------
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None,

## using Simple Ensemble Techniques

In [169]:
def simple_ensemble(models_list,X_train,y_train,X_test):
    result = {}
    for model_item in models_list:
        model_name = model_item["name"]
        y_pred = model_fit_and_predict(model_item["model"],X_train,y_train,X_test)
        result[model_name] = list(y_pred)
    result = pd.DataFrame(result)
    result["sum"] = result.apply(lambda x: sum(x),axis=1)
    result["vote"]= result.apply(lambda x: 1 if x["sum"] > 0 else -1, axis=1)
    return result["vote"].copy()

In [239]:
test_models = [{"name":"svm","model":classifier},
               {"name":"random forest","model":clf},
               {"name":"decision tree","model":dtree}]

In [215]:
test_models_result = simple_ensemble(test_models,X_train,y_train,X_test)

In [216]:
model_accuracy(y_test,test_models_result)

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.57      0.50      0.53         8
          1       0.78      0.82      0.80        17

avg / total       0.71      0.72      0.71        25


model accuracy:  0.72


In [220]:
models_name = []
for item in test_models:
    models_name.append(item["name"])
print("Using these models: ", models_name)
print("with ensemble techniques to predict the stock price movement")
for i in range(6):
    print("ensemble epoch ",i+1)
    print("="*90)
    test_models_result = simple_ensemble(test_models,X_train,y_train,X_test)
    model_accuracy(y_test,test_models_result)
    print('-'*90)

Using these models:  ['svm', 'random forest', 'decision tree']
with ensemble techniques to predict the stock price movement
ensemble epoch  1
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.80      0.50      0.62         8
          1       0.80      0.94      0.86        17

avg / total       0.80      0.80      0.79        25


model accuracy:  0.8
------------------------------------------------------------------------------------------
ensemble epoch  2
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verb

In [221]:
classifier = svm.SVC(gamma=0.001)

In [222]:
test_models = [{"name":"svm","model":classifier}]

In [237]:
def model_test(test_models,epochs,ensemble=False):
    models_name = []
    for item in test_models:
        models_name.append(item["name"])
    print("Testing these models: ", models_name)
    if ensemble == True:
        print("Using ensemble techniques to predict the stock price movement.")
    print('*'*90)
    if ensemble == False:
        for model_item in test_models:
            print("Using {} model to train and predict the data".format(model_item["name"]))
            for i in range(epochs):
                print("="*90)
                print("ensemble epoch ",i+1)
                test_models_result = model_fit_and_predict(model_item["model"],X_train,y_train,X_test)
                model_accuracy(y_test,test_models_result)
                print('-'*90)
        return
    if ensemble == True and len(test_models)%2 !=1:
        print("Ensemble needs odd number of models!")
        return
    for i in range(epochs):
        print("ensemble epoch ",i+1)
        print("="*90)
        test_models_result = simple_ensemble(test_models,X_train,y_train,X_test)
        model_accuracy(y_test,test_models_result)
        print('-'*90)

In [238]:
model_test(test_models,6)

Testing these models:  ['svm']
********************************************************************************
Using svm model to train and predict the data
ensemble epoch  1
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.75      0.38      0.50         8
          1       0.76      0.94      0.84        17

avg / total       0.76      0.76      0.73        25


model accuracy:  0.76
------------------------------------------------------------------------------------------
ensemble epoch  2
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None

In [242]:
model_test(test_models,2,ensemble=False)

Testing these models:  ['svm', 'random forest', 'decision tree']
********************************************************************************
Using svm model to train and predict the data
ensemble epoch  1
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

         -1       0.75      0.38      0.50         8
          1       0.76      0.94      0.84        17

avg / total       0.76      0.76      0.73        25


model accuracy:  0.76
------------------------------------------------------------------------------------------
ensemble epoch  2
Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, pr