In [1]:
import numpy as np
import pandas as pd
import pickle

# Look-ahead horizon: process_data_for_labels builds one forward-return column
# per i in 1..hm_days (by shifting the price series i rows ahead), and
# extract_featuresets feeds those same columns to buy_sell_hold.
hm_days = 7

In [38]:
def process_data_for_labels(ticker, csv_path='icare_joined_closes.csv', days=None):
    """Load the joined close prices and add forward-return columns for ``ticker``.

    For every horizon ``i`` in ``1..days`` a column named ``'<ticker>_<i>d'`` is
    added, holding ``(price[t + i] - price[t]) / price[t]`` where ``t + i`` means
    ``i`` rows further down the frame.

    Parameters
    ----------
    ticker : str
        Name of the price column to build the forward-return columns for.
    csv_path : str, optional
        CSV file to load; its first column becomes the index. Defaults to the
        file this notebook has always read.
    days : int, optional
        Number of horizons to generate. ``None`` (the default) falls back to
        the module-level ``hm_days``.

    Returns
    -------
    tuple
        ``(tickers, df)``: ``tickers`` is the array of column names as read
        from the CSV (captured before any return column is added) and ``df``
        is the zero-filled frame including the new return columns.

    Raises
    ------
    KeyError
        Propagated from the column lookup when ``ticker`` is not in the CSV.

    Notes
    -----
    Missing prices are replaced by 0 *before* the returns are computed, so a
    row whose price is 0 produces ``inf`` (non-zero future price) or ``NaN``
    (zero future price). The final fill turns the ``NaN`` values - including
    the trailing rows that have no future price - into 0; ``inf`` values are
    left in place for the caller to handle.
    """
    if days is None:
        days = hm_days

    df = pd.read_csv(csv_path, index_col=0)
    # Snapshot the ticker names now; the frame gains extra columns below.
    tickers = df.columns.values
    df = df.fillna(0)

    # The price column itself never changes inside the loop, so look it up once.
    prices = df[ticker]
    for i in range(1, days + 1):
        df['{}_{}d'.format(ticker, i)] = (prices.shift(-i) - prices) / prices

    # shift(-i) leaves NaN in the last i rows (and 0/0 yields NaN): zero them.
    df = df.fillna(0)
    return tickers, df

# Quick check of the label builder on KO: the cell output is the returned
# (tickers, frame) tuple.
process_data_for_labels('KO')

(array(['TSLA', 'LK', 'AAPL', 'AMZN', 'BIDU', 'KO', 'QCOM', 'FB', 'TCOM',
        'AMD', 'DIS', 'DOYU', 'INTC', 'DELL', 'JD', 'BABA', 'EBAY', 'ATVI',
        'GOOGL', 'MSFT'], dtype=object),
                   TSLA    LK        AAPL         AMZN        BIDU         KO  \
 Date                                                                           
 2009-12-31    0.000000  0.00   26.061205   134.520004   41.123001  19.278732   
 2010-01-04    0.000000  0.00   26.466835   133.899994   41.002998  19.292267   
 2010-01-05    0.000000  0.00   26.512596   134.690002   40.590000  19.058893   
 2010-01-06    0.000000  0.00   26.090879   132.250000   41.250000  19.052124   
 2010-01-07    0.000000  0.00   26.042646   130.000000   40.463001  19.004770   
 ...                ...   ...         ...          ...         ...        ...   
 2020-04-27  798.750000  4.39  282.405548  2376.000000  100.559998  46.779999   
 2020-04-28  769.119995  4.39  277.827911  2314.080078   97.129997  46.740002   

In [58]:
from collections import Counter

def buy_sell_hold(*args, requirement=0.02):
    """Turn a sequence of fractional price changes into a buy/sell/hold label.

    The changes are scanned in the order given and the first one whose
    magnitude strictly exceeds ``requirement`` decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (e.g. ``0.03`` for +3%).
    requirement : float, optional
        Keyword-only threshold; defaults to ``0.02`` (2%).

    Returns
    -------
    int
        ``1`` if the first change past the threshold is a rise, ``-1`` if it
        is a fall, ``0`` if no change exceeds the threshold (or no changes
        were given).
    """
    for change in args:
        # Price rose by more than the threshold.
        if change > requirement:
            return 1
        # Price fell by more than the threshold.
        if change < -requirement:
            return -1
    # No change exceeded the threshold in either direction.
    return 0

def extract_featuresets(ticker):
    """Build the feature matrix and label vector for ``ticker``.

    Labels come from applying ``buy_sell_hold`` row by row to the
    ``hm_days`` forward-return columns produced by
    ``process_data_for_labels``. Features are the row-over-row percentage
    changes of every ticker's price column.

    Returns
    -------
    tuple
        ``(X, y, df)``: ``X`` is the array of percentage changes, ``y`` the
        array of labels, and ``df`` the cleaned frame both were taken from.
    """
    tickers, frame = process_data_for_labels(ticker)
    target_col = '{}_target'.format(ticker)

    # One Series per look-ahead horizon; map() walks them in lockstep so each
    # call to buy_sell_hold receives one row's worth of forward returns.
    future_returns = [
        frame['{}_{}d'.format(ticker, day)] for day in range(1, hm_days + 1)
    ]
    frame[target_col] = list(map(buy_sell_hold, *future_returns))

    # Report the label distribution before any rows are discarded below.
    labels_as_text = [str(label) for label in frame[target_col].values]
    print('Data spread:', Counter(labels_as_text))

    # Zero-fill gaps, then drop every row that still contains an infinity.
    frame = (
        frame
        .fillna(0)
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    # Features: percentage change of each original price column, with
    # infinities and the leading NaN row neutralised to 0.
    features = (
        frame[list(tickers)]
        .pct_change()
        .replace([np.inf, -np.inf], 0)
        .fillna(0)
    )

    return features.values, frame[target_col].values, frame

# extract_featuresets('KO')

In [59]:
from sklearn import svm, neighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

def do_ml(ticker, test_size=0.25, random_state=None):
    """Train a voting classifier on ``ticker``'s feature set and report accuracy.

    The data from ``extract_featuresets`` is split into train and test parts,
    a ``VotingClassifier`` combining a linear SVC, k-nearest-neighbours and a
    random forest is fitted on the training part, and its accuracy and the
    distribution of its predictions on the test part are printed.

    Parameters
    ----------
    ticker : str
        Ticker whose labels are predicted.
    test_size : float, optional
        Fraction of the rows held out for testing. Defaults to ``0.25``.
    random_state : int or None, optional
        Seed forwarded to the train/test split and to the estimators that
        accept one. ``None`` (the default) keeps the unseeded behaviour, so
        repeated runs give different numbers; pass an integer for
        reproducible results.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out rows.
    """
    X, y, df = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state))
    ])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('Accuracy', confidence)
    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))

    return confidence

# Train and evaluate the ensemble for TSLA; the returned accuracy is echoed
# as the cell's output after the printed summaries.
do_ml('TSLA')

Data spread: Counter({'1': 1358, '-1': 1078, '0': 165})
Accuracy 0.5192604006163328
Predicted spread: Counter({1: 506, -1: 139, 0: 4})


0.5192604006163328