In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

#from datetime import date
import glob
import os

In [2]:
# Load inputs: cluster centers, per-cluster RMSE & expected profit, the most
# recent file of data to predict on, and the training data.
cluster_centers_path = "cluster_centers.csv"
RMSE_and_profit_path = "cluster_centers_profit_and_error.csv"
training_data_path = "../preprocessed_data.csv"

data_to_predict_folder = glob.glob("../prediction_data/compiled_per_date/*")
if not data_to_predict_folder:
    # max() on an empty list only says "max() arg is an empty sequence";
    # fail with a message that names the folder instead.
    raise FileNotFoundError(
        "No files found in ../prediction_data/compiled_per_date/ to predict on")
# Pick the entry with the largest os.path.getctime timestamp.
data_to_predict_path = max(data_to_predict_folder, key=os.path.getctime)

df_cluster_centers = pd.read_csv(cluster_centers_path)
df_RMSE_and_profit = pd.read_csv(RMSE_and_profit_path)
df_data_to_predict = pd.read_csv(data_to_predict_path)
df_training_data = pd.read_csv(training_data_path)

# Name of the last column of the cluster-center file (the take-profit day
# column, "tp_day_d+2" in the displayed output).
tp_day_col_name = df_cluster_centers.columns[-1]

# Feature columns are selected by position:
#   * cluster centers: everything except the last two columns;
#   * prediction / training data: skip the first two columns and the last three.
# NOTE(review): these offsets assume a fixed CSV layout - confirm whenever the
# input files change.
df_cluster_centers_features = df_cluster_centers[df_cluster_centers.columns[:-2]]
df_data_to_predict_features = df_data_to_predict[df_data_to_predict.columns[2:-3]]
df_training_data_clipped = df_training_data[df_training_data.columns[2:-3]]

# float32 arrays that are fed to TensorFlow.
df_cluster_centers_features_arr = df_cluster_centers_features.values.astype(np.float32)
df_data_to_predict_features_arr = df_data_to_predict_features.values.astype(np.float32)
df_training_arr = df_training_data_clipped.values.astype(np.float32)

# Per-cluster lookup table: the two target columns of each center next to the
# RMSE / expected-profit columns (rows are aligned on the index by concat).
df_cluster_centers_data = pd.concat([
    df_cluster_centers[["next_day_open", tp_day_col_name]],
    df_RMSE_and_profit
], axis=1)

df_cluster_centers_data

Unnamed: 0,next_day_open,tp_day_d+2,RMSE_label_1,RMSE_label_2,exp_profit_pct
0,-0.064373,-0.060091,0.005123,0.006622,0.428143
1,0.030284,0.031918,0.006103,0.007268,0.163492
2,0.076477,0.068339,0.007706,0.008215,-0.813828
3,-0.019984,-0.050248,0.008593,0.009754,-3.026377
4,-0.004842,-0.005534,0.009518,0.010101,-0.069194
5,0.071077,0.073228,0.01098,0.012209,0.215159
6,0.069884,0.053025,0.011156,0.011744,-1.685854
7,-0.042976,-0.018952,0.011961,0.012076,2.402374
8,-0.238042,-0.271089,0.016224,0.019482,-3.304765
9,0.023849,0.014074,0.013503,0.01416,-0.977507


In [3]:
def input_predict_fn():
    """Input function for prediction.

    Wraps the whole ``df_data_to_predict_features_arr`` matrix into a
    ``tf.data.Dataset`` built with ``from_tensors`` and a repeat count of 1.
    """
    features = tf.convert_to_tensor(df_data_to_predict_features_arr, dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensors(features)
    return dataset.repeat(1)

def input_training_fn():
    """Input function for training.

    Wraps the whole ``df_training_arr`` matrix into a ``tf.data.Dataset``
    built with ``from_tensors`` and a repeat count of 1.
    """
    features = tf.convert_to_tensor(df_training_arr, dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensors(features)
    return dataset.repeat(1)

In [4]:
# Build a k-means estimator with one cluster per row of the cluster-center
# file, starting from those centers and with mini-batching disabled, then
# train it on the training matrix and print the resulting centers.
kmeans = tf.contrib.factorization.KMeansClustering(
    num_clusters=len(df_cluster_centers),
    initial_clusters=df_cluster_centers_features_arr,
    use_mini_batch=False,
)
kmeans.train(input_training_fn)
print(kmeans.cluster_centers())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp3j0so7i3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbda91139e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

In [5]:
# Assign every row of the prediction frame to a cluster.
prediction_clusters = list(kmeans.predict_cluster_index(input_predict_fn))
cluster_idx = np.array(prediction_clusters)
df_data_to_predict["prediction_clusters"] = cluster_idx

# Copy the per-cluster values (predicted targets, RMSEs, expected profit) onto
# each row. Target and source columns are paired by position, so a length
# mismatch would silently mislabel or skip columns - fail loudly instead.
colpart_df_data_to_predict = ["predict_next_open_day", "predict_tp_day", "RMSE_next_open_day", "RMSE_tp_day", "exp_profit_pct"]
col_df_cluster_centers_data = df_cluster_centers_data.columns
if len(colpart_df_data_to_predict) != len(col_df_cluster_centers_data):
    raise ValueError(
        "Expected %d per-cluster columns, found %d: %s" % (
            len(colpart_df_data_to_predict),
            len(col_df_cluster_centers_data),
            list(col_df_cluster_centers_data)))

for target_col, source_col in zip(colpart_df_data_to_predict, col_df_cluster_centers_data):
    # Row k of the lookup table belongs to cluster k, so indexing the column
    # values with the cluster ids broadcasts them to every prediction row.
    df_data_to_predict[target_col] = df_cluster_centers_data[source_col].values[cluster_idx]

# Sample standard deviation (ddof=1) of the element-wise difference between
# each row's features and the center of its assigned cluster.
# Features are selected by name rather than by a positional slice, so this does
# not depend on how many columns have been appended to the frame or on their
# order; the centers are reindexed to the same column order before subtracting.
feature_cols = df_data_to_predict_features.columns
row_features = df_data_to_predict[feature_cols].values.astype(np.float64)
assigned_centers = (
    df_cluster_centers_features
    .reindex(columns=feature_cols)
    .values.astype(np.float64)[cluster_idx]
)
df_data_to_predict["diff_with_cluster_centers"] = (
    row_features - assigned_centers).std(axis=1, ddof=1)

df_data_to_predict

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp3j0so7i3/model.ckpt-2
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,ticker_name,Date,O0,H0,L0,C0,O1,H1,L1,C1,...,next_day_open,tp_day_d+2,Volume,prediction_clusters,predict_next_open_day,predict_tp_day,RMSE_next_open_day,RMSE_tp_day,exp_profit_pct,diff_with_cluster_centers
0,BULL,2019-01-11,0.0,0.032258,-0.008130,0.008065,0.016129,0.024194,-0.024793,-0.016393,...,,,65750600.0,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.014848
1,BBTN,2019-01-11,0.0,0.000000,-0.029851,-0.029851,-0.029851,-0.018450,-0.041509,-0.029851,...,,,23600800.0,10,-0.023658,-0.033207,0.014202,0.014466,-0.954898,0.008792
2,RIMO,2019-01-11,0.0,0.014706,0.000000,0.000000,0.007353,0.014706,-0.007407,0.000000,...,,,180109500.0,16,0.008648,0.012052,0.017784,0.018076,0.340359,0.004828
3,HKMU,2019-01-11,0.0,0.028571,0.000000,0.011429,0.017143,0.017143,0.000000,0.005714,...,,,16399700.0,16,0.008648,0.012052,0.017784,0.018076,0.340359,0.005854
4,MGRO,2019-01-11,0.0,0.056497,-0.023121,0.056497,0.062147,0.107345,0.056497,0.079096,...,,,8066800.0,9,0.023849,0.014074,0.013503,0.014160,-0.977507,0.021633
5,BBNI,2019-01-11,0.0,0.000000,-0.014164,-0.011299,-0.002801,-0.002801,-0.017045,-0.011299,...,,,23608500.0,4,-0.004842,-0.005534,0.009518,0.010101,-0.069194,0.006137
6,ASRI,2019-01-11,0.0,0.036145,0.000000,0.030120,0.042169,0.054217,0.030120,0.042169,...,,,11086300.0,15,0.028901,0.007654,0.017227,0.017578,-2.124672,0.013007
7,BUMI,2019-01-11,0.0,0.064748,-0.007246,0.021583,0.028777,0.093525,0.021583,0.050360,...,,,367896200.0,5,0.071077,0.073228,0.010980,0.012209,0.215159,0.011959
8,RISE,2019-01-11,0.0,0.004032,-0.004049,0.004032,0.004032,0.004032,0.000000,0.000000,...,,,2123300.0,16,0.008648,0.012052,0.017784,0.018076,0.340359,0.007261
9,INDF,2019-01-11,0.0,0.003390,-0.013746,-0.006826,-0.013746,0.010169,-0.024306,0.010169,...,,,7313100.0,18,0.008928,-0.011490,0.018760,0.018846,-2.041806,0.009046


In [6]:
# Move the seven trailing prediction columns directly behind the first two
# columns. The check on "O0" skips the reorder when the third column is no
# longer the first feature column, i.e. when the frame was already rearranged.
# NOTE(review): the [2:-8] slice leaves out the column immediately before the
# prediction block (Volume in the displayed frame), so that column is dropped
# here - confirm this is intended and not an off-by-one.
cols = df_data_to_predict.columns.tolist()
if cols[2] == "O0":
    id_cols, prediction_cols, remaining_cols = cols[:2], cols[-7:], cols[2:-8]
    cols = id_cols + prediction_cols + remaining_cols
    df_data_to_predict = df_data_to_predict[cols]

In [7]:
# Rank rows: highest expected profit first; ties are broken by the smallest
# difference to the assigned cluster center.
sort_keys = ["exp_profit_pct", "diff_with_cluster_centers"]
sort_ascending = [False, True]
df_data_to_predict = df_data_to_predict.sort_values(by=sort_keys, ascending=sort_ascending)
df_data_to_predict

Unnamed: 0,ticker_name,Date,prediction_clusters,predict_next_open_day,predict_tp_day,RMSE_next_open_day,RMSE_tp_day,exp_profit_pct,diff_with_cluster_centers,O0,...,O1,H1,L1,C1,O2,H2,L2,C2,next_day_open,tp_day_d+2
35,PNLF,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.006790,0.0,...,0.000000,0.022556,-0.007576,0.007519,0.007519,0.037594,-0.007576,0.015038,,
46,WOOD,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.009446,0.0,...,-0.008772,0.017391,-0.017699,0.000000,0.017391,0.034783,0.008696,0.017391,,
121,ELSA,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.009827,0.0,...,0.005556,0.022222,-0.011236,0.000000,0.000000,0.055556,-0.005587,0.038889,,
133,MAPI,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.010320,0.0,...,0.000000,0.022599,-0.011429,-0.005682,0.005650,0.033898,-0.005682,0.028249,,
36,CPIN,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.010416,0.0,...,0.003300,0.016502,0.000000,0.016502,0.016502,0.052805,0.013201,0.052805,,
85,MCAS,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.010748,0.0,...,-0.006645,0.000000,-0.010000,-0.003311,0.000000,0.049505,-0.003311,0.016502,,
99,APLN,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.011512,0.0,...,0.006173,0.030864,-0.012500,-0.012500,-0.012500,0.037037,-0.012500,0.030864,,
158,CLEO,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.012666,0.0,...,0.000000,0.007194,-0.007246,-0.007246,0.000000,0.057554,-0.007246,0.043165,,
10,ADMF,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.012918,0.0,...,0.005731,0.008596,0.002865,0.005731,0.008596,0.063037,0.008596,0.040115,,
71,VOKS,2019-01-11,13,0.050120,0.119750,0.016607,0.017633,6.963055,0.013117,0.0,...,0.000000,0.038462,-0.007752,0.000000,-0.007752,0.053846,-0.007752,0.030769,,


In [8]:
# Write the ranked predictions to "<Date>-predictions.csv" in the history folder.
save_path = "../prediction_data/prediction_history/"
# Create the folder if needed so the export does not fail on a fresh checkout.
os.makedirs(save_path, exist_ok=True)
# .loc[0] is a label lookup: the frame was sorted without resetting its index,
# so this reads the Date of the row labelled 0, not of the top-ranked row.
date = df_data_to_predict.loc[0, "Date"]
df_data_to_predict.to_csv(os.path.join(save_path, date + "-predictions.csv"), index=False)