In [196]:
import numpy as np 
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import random
import os
import pickle
from sklearn import metrics
from datetime import date
from sklearn.metrics import classification_report

In [148]:
# Directory holding the prediction CSV files (machine-specific absolute path).
file_path = 'D:\\庫存健診開發\\data\\prediction\\full\\'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame reproducible from run to run.
file_list = sorted(os.listdir(file_path))

# Read every regular file in the directory. Sub-directories are skipped because
# pd.read_csv cannot open them. StockNo is read through `str` so the column
# holds strings.
df_list = [
    pd.read_csv(os.path.join(file_path, filename), converters={'StockNo': str})
    for filename in file_list
    if os.path.isfile(os.path.join(file_path, filename))
]

# Stack all files row-wise; each file keeps its own row index.
prediction = pd.concat(df_list, axis=0)

In [16]:
# Labelled rows: only the join keys (ts, StockNo) and the label Y are loaded.
original = pd.read_csv(
    'D:\\OneDrive - SinoPac\\training\\test_large.csv',
    usecols=["ts", "StockNo", "Y"],
    converters={'StockNo': str},
)

In [149]:
# Attach the two class scores and the cluster to every labelled (ts, StockNo)
# row. The inner join keeps only keys present in both frames.
combine = original.merge(
    prediction[['ts', 'StockNo', 'Y_0_score', 'Y_1_score', 'cluster']],
    how='inner',
    on=['ts', 'StockNo'],
)
combine

Unnamed: 0,Y,ts,StockNo,Y_0_score,Y_1_score,cluster
0,1,2017-10-05,1215,0.403472,0.596528,2
1,1,2017-10-06,1215,0.341521,0.658479,2
2,1,2017-10-11,1215,0.314727,0.685273,2
3,1,2017-10-12,1215,0.316733,0.683267,2
4,1,2017-10-13,1215,0.349956,0.650044,2
...,...,...,...,...,...,...
258852,1,2019-09-16,9958,0.228284,0.771716,4
258853,1,2019-09-17,9958,0.244692,0.755308,4
258854,1,2019-09-18,9958,0.296794,0.703206,4
258855,0,2019-09-19,9958,0.462566,0.537434,4


In [32]:
def predict_up(row, threshold):
    """Return 1 when ``row['Y_1_score']`` is strictly greater than ``threshold``, else 0."""
    return 1 if row['Y_1_score'] > threshold else 0

In [38]:
def predict_down(row, threshold):
    """Return 0 when ``row['Y_0_score']`` is strictly greater than ``threshold``, else 1."""
    return 0 if row['Y_0_score'] > threshold else 1

In [34]:
def Evaluation_up(data, threshold=0.5):
    """Label a row "up" (1) when its Y_1_score exceeds ``threshold`` and score the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Y' and 'Y_1_score'. A 'prediction' column is written
        into ``data`` in place, so the caller's frame carries it afterwards.
    threshold : float, default 0.5
        A row is predicted 1 only when ``Y_1_score > threshold``; otherwise 0.

    Returns
    -------
    list
        ``[auc, accuracy, report]`` where ``report`` is the text produced by
        ``classification_report`` with the class names 'down' and 'up'.
    """
    # Vectorised form of applying predict_up to every row (same 0/1 result,
    # without a Python-level call per row).
    data['prediction'] = (data['Y_1_score'] > threshold).astype('int64')

    # NOTE(review): the ROC curve is built from the hard 0/1 predictions, not
    # from the scores, so it has a single operating point and the AUC moves
    # with `threshold`. Pass the score column instead if a threshold-free AUC
    # is wanted -- confirm intent before changing.
    fpr, tpr, _ = metrics.roc_curve(data['Y'], data['prediction'])
    auc = metrics.auc(fpr, tpr)
    accuracy = metrics.accuracy_score(data['Y'], data['prediction'])

    target_names = ['down', 'up']
    report = classification_report(
        data['Y'].tolist(), data['prediction'].tolist(), target_names=target_names
    )

    return [auc, accuracy, report]

In [39]:
def Evaluation_down(data, threshold=0.5):
    """Label a row "down" (0) when its Y_0_score exceeds ``threshold`` and score the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Y' and 'Y_0_score'. A 'prediction' column is written
        into ``data`` in place, so the caller's frame carries it afterwards.
    threshold : float, default 0.5
        A row is predicted 0 only when ``Y_0_score > threshold``; otherwise 1.

    Returns
    -------
    list
        ``[auc, accuracy, report]`` where ``report`` is the text produced by
        ``classification_report`` with the class names 'down' and 'up'.
    """
    # Vectorised form of applying predict_down to every row. The comparison is
    # negated (rather than written as `<=`) so that a missing score still maps
    # to 1, exactly as the row-wise helper does.
    data['prediction'] = (~(data['Y_0_score'] > threshold)).astype('int64')

    # NOTE(review): the ROC curve is built from the hard 0/1 predictions, not
    # from the scores, so it has a single operating point and the AUC moves
    # with `threshold` -- confirm intent before changing.
    fpr, tpr, _ = metrics.roc_curve(data['Y'], data['prediction'])
    auc = metrics.auc(fpr, tpr)
    accuracy = metrics.accuracy_score(data['Y'], data['prediction'])

    target_names = ['down', 'up']
    report = classification_report(
        data['Y'].tolist(), data['prediction'].tolist(), target_names=target_names
    )

    return [auc, accuracy, report]

In [304]:
# Predict "down" (0) only where Y_0_score > 0.6. The call also (re)writes
# combine['prediction']; r is [auc, accuracy, report].
r = Evaluation_down(combine, threshold=0.6)

In [305]:
# r[2] is the classification report; print() renders its line breaks.
print(r[2])

              precision    recall  f1-score   support

        down       0.71      0.53      0.60    131848
          up       0.61      0.77      0.68    127009

    accuracy                           0.65    258857
   macro avg       0.66      0.65      0.64    258857
weighted avg       0.66      0.65      0.64    258857



In [306]:
# StockNo of every row whose "down" score is above 0.6.
combine.loc[combine.Y_0_score > 0.6, 'StockNo']

6         1215
7         1215
8         1215
9         1215
10        1215
          ... 
258841    9958
258842    9958
258843    9958
258844    9958
258845    9958
Name: StockNo, Length: 98554, dtype: object

In [69]:
def Separate_Evaluation(data, threshold=0.5, prediction=False, **kwargs):
    """Score the predictions on a subset of ``data`` selected by year and/or stock.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Y', 'ts' and 'StockNo', plus 'prediction' when
        ``prediction`` is False or 'Y_1_score' when it is True.
    threshold : float, default 0.5
        Only used when ``prediction`` is True: a row is predicted 1 when
        ``Y_1_score > threshold``.
    prediction : bool, default False
        When True, ``data['prediction']`` is recomputed and ``data['ts']`` is
        converted to datetime, both in place on the caller's frame. When False,
        the existing 'prediction' column is used and ``data`` is not modified.
    **kwargs
        ``year`` : keep rows whose ``ts`` falls in that calendar year.
        ``stock`` : keep rows whose ``StockNo`` equals this value.
        Either, both or neither may be given; with neither, every row is scored.

    Returns
    -------
    tuple
        ``(report, accuracy, auc)``.

    Raises
    ------
    TypeError
        If a keyword other than ``year`` or ``stock`` is passed.
    """
    unexpected = set(kwargs) - {'year', 'stock'}
    if unexpected:
        raise TypeError(
            'Separate_Evaluation() got unexpected keyword argument(s): '
            + ', '.join(sorted(unexpected))
        )

    if prediction:
        # Vectorised form of applying predict_up to every row.
        data['prediction'] = (data['Y_1_score'] > threshold).astype('int64')
        data['ts'] = pd.to_datetime(data['ts'])

    # Always bind `df` locally. Without this, a call with no (or several)
    # filters would fall through to whatever global `df` the notebook holds.
    df = data
    if 'year' in kwargs:
        # Parse on the fly so the filter also works while `ts` is still text.
        df = df[pd.to_datetime(df['ts']).dt.year == kwargs['year']]
    if 'stock' in kwargs:
        df = df[df.StockNo == kwargs['stock']]

    target_names = ['down', 'up']

    # NOTE(review): ROC/AUC are computed from the hard 0/1 predictions.
    fpr, tpr, _ = metrics.roc_curve(df['Y'], df['prediction'])
    auc = metrics.auc(fpr, tpr)

    report = classification_report(df['Y'], df['prediction'], target_names=target_names)
    accuracy = metrics.accuracy_score(df['Y'], df['prediction'])

    return report, accuracy, auc

In [100]:
def Overall_Evaluation(data, threshold=0.6, score_threshold=0.5):
    """Compute the prediction accuracy of every stock and collect the weak ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Y', 'Y_1_score', 'ts', 'StockNo' and 'cluster'.
        'prediction' is (re)written and 'ts' is converted to datetime, both in
        place on the caller's frame.
    threshold : float, default 0.6
        Accuracy cut-off: a stock is reported when its accuracy is strictly
        below this value. This is NOT the score cut-off.
    score_threshold : float, default 0.5
        A row is predicted 1 when ``Y_1_score > score_threshold``.

    Returns
    -------
    tuple
        ``(rate_acc, stock_list, cluster_list, acc_list)``:
        the share of stocks below the accuracy cut-off (0.0 when ``data`` has
        no stocks), their StockNo values, the cluster value of their first row,
        and the accuracy of every stock in group order.
    """
    # Vectorised form of applying predict_up to every row.
    data['prediction'] = (data['Y_1_score'] > score_threshold).astype('int64')
    data['ts'] = pd.to_datetime(data['ts'])

    # Materialise the groups so that tqdm knows the total and the count can be
    # reused for the rate below.
    per_stock = [frame for _, frame in data.groupby('StockNo')]

    stock_list = []
    cluster_list = []
    acc_list = []
    for d in tqdm(per_stock):
        acc = metrics.accuracy_score(d['Y'], d['prediction'])
        acc_list.append(acc)
        if acc < threshold:
            stock_list.append(d['StockNo'].iloc[0])
            cluster_list.append(d['cluster'].iloc[0])

    # Guard the division so an empty frame yields 0.0 instead of raising.
    rate_acc = len(stock_list) / len(per_stock) if per_stock else 0.0

    return rate_acc, stock_list, cluster_list, acc_list

In [160]:
# r: share of stocks whose accuracy is below the cut-off, s: their StockNo,
# c: their cluster values, a: accuracy of every stock. The call also writes
# combine['prediction'] and converts combine['ts'] to datetime in place.
r, s, c, a = Overall_Evaluation(combine)

HBox(children=(IntProgress(value=0, max=557), HTML(value='')))

In [309]:
# Score only the 2017 rows. With the default prediction=False the function
# reads an existing combine['prediction'] column, so one of the evaluation
# calls above must have run on `combine` first. Note that `r` is rebound here.
r, acc, auc = Separate_Evaluation(combine, year=2017)

In [310]:
# r is the classification report returned above; print() renders its line breaks.
print(r)

              precision    recall  f1-score   support

        down       0.67      0.54      0.60     16026
          up       0.60      0.72      0.65     15351

    accuracy                           0.63     31377
   macro avg       0.63      0.63      0.63     31377
weighted avg       0.63      0.63      0.63     31377



In [165]:
# For each cluster 0..4: number of entries of `c` (clusters of the stocks
# below the accuracy cut-off) divided by the number of distinct stocks that
# `combine` holds for that cluster.
count_list = [
    c.count(cluster_id)
    / len(combine.loc[combine.cluster == cluster_id, 'StockNo'].unique())
    for cluster_id in range(5)
]

In [173]:
# Spread (population standard deviation) of the per-stock accuracies.
np.std(a)

0.03649623029215041

In [188]:
# Per-stock accuracies below 0.55.
# NOTE(review): the output saved with this cell contains values up to ~0.598,
# so it appears to come from a run with a higher cut-off -- re-run to refresh.
[acc for acc in a if acc < 0.55]

[0.5958333333333333,
 0.5833333333333334,
 0.5916666666666667,
 0.5938375350140056,
 0.5854166666666667,
 0.5770833333333333,
 0.5958333333333333,
 0.5729166666666666,
 0.5479166666666667,
 0.5729166666666666,
 0.5625,
 0.5979166666666667,
 0.5958333333333333,
 0.5833333333333334,
 0.5921787709497207,
 0.55859375,
 0.5875,
 0.5958333333333333,
 0.5666666666666667,
 0.5666666666666667,
 0.5755395683453237,
 0.5945945945945946,
 0.5259259259259259,
 0.4918032786885246,
 0.4878048780487805,
 0.5895833333333333]

In [171]:
# Mean of industry_close_corr_day20 over the rows of the stocks listed in `s`.
prediction.loc[prediction.StockNo.isin(s), 'industry_close_corr_day20'].mean()

0.7549629550010196

In [172]:
# Same mean over every row of `prediction`, as the baseline for the cell above.
prediction['industry_close_corr_day20'].mean()

0.7193440485070444

In [187]:
# StockNo values returned by Overall_Evaluation: stocks below the accuracy cut-off.
s

['1476',
 '1590',
 '1730',
 '1760',
 '2227',
 '2360',
 '2362',
 '2454',
 '2474',
 '2723',
 '3008',
 '3443',
 '3535',
 '3665',
 '4961',
 '4989',
 '5269',
 '6005',
 '6415',
 '6452',
 '6668',
 '6669',
 '6670',
 '6672',
 '8462',
 '8464']

In [311]:
import json

# Newline-delimited JSON: one record per line (presumably a BigQuery export,
# judging by the file name). The encoding is given explicitly so that reading
# does not depend on the platform's default code page, and blank lines are
# skipped because json.loads('') raises.
with open('C:\\Users\\011553\\Downloads\\bq-results-20191108-185923-d66j3pbl4kdt.json', 'r', encoding='utf-8') as fp:
    d = [json.loads(line) for line in fp if line.strip()]
        
    


In [312]:
# Split the JSON records into one list per field, in a single pass.
# NOTE(review): assumes predicted_Y[0] always carries the class-0 ("down")
# score and predicted_Y[1] the class-1 ("up") score -- TODO confirm that the
# order is fixed in the export.
Y, ts, StockNo, cluster, Y_down, Y_up = [], [], [], [], [], []
for record in d:
    Y.append(int(record['Y']))
    ts.append(record['ts'])
    StockNo.append(record['StockNo'])
    cluster.append(record['cluster'])
    Y_down.append(float(record['predicted_Y'][0]['tables']['score']))
    Y_up.append(float(record['predicted_Y'][1]['tables']['score']))

In [313]:
# Build the frame column by column so that Y and the two scores keep their
# numeric dtypes; stacking the mixed lists into one NumPy array would coerce
# every value to text. ts, StockNo and cluster are kept as strings because a
# later cell compares `cluster` against str(i).
df = pd.DataFrame({
    'Y': Y,
    'ts': ts,
    'StockNo': StockNo,
    'cluster': cluster,
    'Y_0_score': Y_down,
    'Y_1_score': Y_up,
}).astype({'ts': str, 'StockNo': str, 'cluster': str})

In [314]:
# Make the scores float64 and the label an integer. The builtin `int` is used
# because `np.int` was only an alias for it and no longer exists in current
# NumPy releases, where it raises AttributeError.
df = df.astype({
    'Y_1_score': np.float64,
    'Y_0_score': np.float64,
    'Y': int,
})

In [315]:
# Same per-stock evaluation on the JSON-based frame; rebinds r, s, c and a, and
# writes df['prediction'] / converts df['ts'] to datetime in place.
r, s, c, a = Overall_Evaluation(df)

HBox(children=(IntProgress(value=0, max=528), HTML(value='')))

In [324]:
# NOTE(review): per the execution counts this display (In [324]) ran after the
# cell that follows it in the file (In [323]); on a top-to-bottom run it would
# still show the `count_list` computed earlier from `combine`.
count_list

[0.35789473684210527,
 0.34057971014492755,
 0.29464285714285715,
 0.3939393939393939,
 0.28205128205128205]

In [323]:
# For each cluster '0'..'4' (strings in `df`): number of entries of `c`
# divided by the number of distinct stocks that `df` holds for that cluster.
count_list = [
    c.count(str(cluster_id))
    / len(df.loc[df.cluster == str(cluster_id), 'StockNo'].unique())
    for cluster_id in range(5)
]

In [326]:
# Spread (population standard deviation) of the per-stock accuracies.
np.std(a)

0.051310027958286014

In [325]:
# Number of stocks whose accuracy is below 0.55.
sum(1 for acc in a if acc < 0.55)

39

In [243]:
# Highest per-stock accuracy.
max(a)

0.875

In [327]:
# Score only the 2017 rows of `df`. With the default prediction=False the
# function reads df['prediction'], which the Overall_Evaluation(df) call above
# wrote. Note that `r` is rebound here.
r, acc, auc = Separate_Evaluation(df, year=2017)

In [328]:
# r is the classification report returned above; print() renders its line breaks.
print(r)

              precision    recall  f1-score   support

        down       0.60      0.64      0.62     40635
          up       0.64      0.61      0.63     43443

    accuracy                           0.62     84078
   macro avg       0.62      0.62      0.62     84078
weighted avg       0.62      0.62      0.62     84078



In [331]:
# Predict "up" (1) only where Y_1_score > 0.6. The call overwrites
# df['prediction'] and rebinds r to [auc, accuracy, report].
r = Evaluation_up(df, threshold=0.6)

In [332]:
# r[2] is the classification report; print() renders its line breaks.
print(r[2])

              precision    recall  f1-score   support

        down       0.55      0.87      0.67     54663
          up       0.71      0.30      0.42     56027

    accuracy                           0.58    110690
   macro avg       0.63      0.59      0.55    110690
weighted avg       0.63      0.58      0.55    110690



In [262]:
# Feature table. ts, StockNo and StockName are read as plain strings first;
# ts is then parsed into datetimes.
data = pd.read_csv(
    'D:\\庫存健診開發\\data\\processed\\TWSE_Stock_feature.csv',
    converters=dict.fromkeys(['ts', 'StockNo', 'StockName'], str),
)
data['ts'] = pd.to_datetime(data['ts'])
data

Unnamed: 0,ts,StockNo,StockName,open,high,low,close,vol,total,capital,...,investment_buy_kurtosis,dealer_buy_skew,dealer_buy_kurtosis,pvt_current,pvt,TR,ATR,RSI_15,SO5,SO3
0,2007-07-02,1103,嘉泥,16.690909,17.075348,16.389434,16.696833,8922.148890,241197.752334,163.0,...,,,,,,0.685913,0.685913,0.000000,0.000000,0.000000
1,2007-07-03,1103,嘉泥,16.792673,17.181600,16.483122,16.793699,9528.445464,252811.658292,161.7,...,,,,55.278694,,0.698478,0.692614,0.000000,0.000000,0.000000
2,2007-07-04,1103,嘉泥,16.895752,17.289264,16.578034,16.891876,11023.590605,286518.072056,160.7,...,,,,64.444429,119.723123,0.711230,0.699664,0.000000,0.000000,0.000000
3,2007-07-05,1103,嘉泥,16.999760,17.397944,16.673817,16.991004,12925.887006,327957.933786,163.7,...,,,,75.854092,140.298521,0.724127,0.707054,0.000000,0.000000,0.000000
4,2007-07-06,1103,嘉泥,17.109628,17.512718,16.774992,17.095685,22855.487064,576534.951942,175.2,...,,,,140.811720,216.665812,0.737727,0.714925,0.000000,0.628737,0.324509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2369242,2019-09-17,9955,佳龍,19.540854,19.684652,19.311951,19.442533,809.447597,15833.513865,19.9,...,-3.0,-4.129483,15.052632,-6.151946,-13.724452,0.372701,0.503662,0.060364,0.119488,0.113594
2369243,2019-09-18,9955,佳龍,19.327445,19.472854,19.141697,19.287597,362.143798,6863.474058,19.8,...,-3.0,-4.129483,15.052632,-2.885898,-9.037843,0.331157,0.482099,0.049477,0.134196,0.123895
2369244,2019-09-19,9955,佳龍,19.170038,19.316798,19.013120,19.169869,840.337000,15650.591246,19.4,...,-3.0,-4.129483,15.052632,-5.129252,-8.015149,0.303678,0.459797,0.042812,0.150675,0.137285
2369245,2019-09-20,9955,佳龍,19.072044,19.220740,18.930397,19.093576,434.980049,8042.232935,19.8,...,-3.0,-4.129483,15.052632,-1.731143,-6.860395,0.290343,0.438615,0.038954,0.171759,0.154522


In [263]:
# Keep rows dated strictly after 2017-09-01. Comparing the normalised
# datetime64 column with a Timestamp stays vectorised, whereas `.dt.date`
# first builds one Python date object per row.
# NOTE: this rebinds `d`, which another cell uses for the list of JSON records.
d = data[data.ts.dt.normalize() > pd.Timestamp(2017, 9, 1)]

In [264]:
# Display the date-filtered feature rows.
d

Unnamed: 0,ts,StockNo,StockName,open,high,low,close,vol,total,capital,...,investment_buy_kurtosis,dealer_buy_skew,dealer_buy_kurtosis,pvt_current,pvt,TR,ATR,RSI_15,SO5,SO3
2521,2017-09-04,1103,嘉泥,10.920425,11.057998,10.838284,10.934541,921.181813,11217.222223,93.4,...,-3.0,-0.526775,1.996868,4.036192,74.154054,0.219714,0.293473,0.997760,0.881613,0.852314
2522,2017-09-05,1103,嘉泥,10.960410,11.107632,10.881336,10.986639,924.558778,11232.632421,93.8,...,-3.0,-0.092092,3.730104,4.405061,8.441252,0.226295,0.285076,0.997904,0.884583,0.868448
2523,2017-09-06,1103,嘉泥,11.003959,11.160571,10.927978,11.042632,1128.730577,13613.414757,91.8,...,-3.0,-0.129504,5.470242,5.752541,10.157602,0.232593,0.278516,0.998057,0.888721,0.878585
2524,2017-09-07,1103,嘉泥,11.058198,11.227687,10.986329,11.113473,976.787028,11716.815539,92.2,...,-3.0,-0.050406,6.886184,6.266258,12.018799,0.241358,0.273871,0.998242,0.733459,0.806022
2525,2017-09-08,1103,嘉泥,11.121816,11.306781,11.054852,11.197008,534.815238,6343.851278,93.8,...,-3.0,-0.050406,6.886184,4.019982,10.286240,0.251930,0.271128,0.998443,0.765690,0.785856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2369242,2019-09-17,9955,佳龍,19.540854,19.684652,19.311951,19.442533,809.447597,15833.513865,19.9,...,-3.0,-4.129483,15.052632,-6.151946,-13.724452,0.372701,0.503662,0.060364,0.119488,0.113594
2369243,2019-09-18,9955,佳龍,19.327445,19.472854,19.141697,19.287597,362.143798,6863.474058,19.8,...,-3.0,-4.129483,15.052632,-2.885898,-9.037843,0.331157,0.482099,0.049477,0.134196,0.123895
2369244,2019-09-19,9955,佳龍,19.170038,19.316798,19.013120,19.169869,840.337000,15650.591246,19.4,...,-3.0,-4.129483,15.052632,-5.129252,-8.015149,0.303678,0.459797,0.042812,0.150675,0.137285
2369245,2019-09-20,9955,佳龍,19.072044,19.220740,18.930397,19.093576,434.980049,8042.232935,19.8,...,-3.0,-4.129483,15.052632,-1.731143,-6.860395,0.290343,0.438615,0.038954,0.171759,0.154522


In [283]:
# Last 50 rows (in index order) where vol == 0, reduced to four columns.
data.loc[data.vol == 0, ['ts', 'origin_close', 'StockNo', 'vol']].tail(50)

Unnamed: 0,ts,origin_close,StockNo,vol
2332904,2011-12-26,4.51,9935,0.0
2332905,2011-12-27,4.51,9935,0.0
2332906,2011-12-28,4.51,9935,0.0
2332907,2011-12-29,4.51,9935,0.0
2332908,2011-12-30,4.51,9935,0.0
2337022,2016-06-04,30.4,9937,0.0
2337027,2016-06-14,30.52,9937,0.0
2337485,2018-04-26,36.95,9937,0.0
2337496,2018-05-14,36.27,9937,0.0
2337500,2018-05-18,36.5,9937,0.0


In [297]:
# Spot-check a single row by its index label.
data.loc[2353264, ['ts', 'StockNo', 'origin_close']]

ts              2008-09-22 00:00:00
StockNo                        9943
origin_close                  10.79
Name: 2353264, dtype: object