In [None]:
! pip install catboost
! pip install --upgrade scikit-learn
! pip install openpyxl
import pandas as pd
import numpy as np

In [None]:
def ml_pipeline(fil, n_lags=1, cutoff_date="2016-06-30",
                validation_start="2016-09-30", validation_end="2018-06-30"):
  """Load the forecast data, engineer features, fit a baseline CatBoost model
  and return the data window used for cross-validation.

  Steps:
    1. Read ``fil`` with ``pd.read_csv`` and index it by ``Forecasttidspunkt``.
    2. Drop identifier / analyst columns, add ``month`` and ``year`` from the
       index, cast ``GVKEY`` and ``industry_fama`` to ``category`` and sort by
       date.
    3. Add ``Div_yield`` and drop the price/volume columns that are not used
       as features.
    4. Add per-``GVKEY`` lag columns ``<column>_<i>`` for ``i`` in
       ``range(n_lags)``. Lag 0 is ``shift(0)``, i.e. a copy of the column.
    5. Replace missing values in every numeric column with 0.
    6. Fit a CatBoost regressor (MAE loss) on rows up to ``cutoff_date`` and
       evaluate it on ``validation_start``..``validation_end``; the best
       iteration and best score are printed.

  Parameters
  ----------
  fil : path or buffer accepted by ``pd.read_csv``.
  n_lags : int, default 1
      Number of lag columns per lagged feature (shifts 0 .. n_lags-1).
  cutoff_date : str, default "2016-06-30"
      Last date (inclusive) of the training window.
  validation_start, validation_end : str
      Inclusive bounds of the validation window.

  Returns
  -------
  (training_df_cv, training_x_cv, training_y_cv)
      All rows up to ``validation_end`` (inclusive): the full frame, the
      frame without ``EPS_actual``, and the ``EPS_actual`` column. Rows after
      ``validation_end`` are neither used nor returned.

  Raises
  ------
  KeyError
      If a column the pipeline relies on is missing from the input.
  """
  # Imported locally so catboost is only required when the pipeline runs.
  from catboost import CatBoostRegressor, Pool

  cat_features = ["GVKEY", "industry_fama"]
  target = "EPS_actual"

  # --- Load and basic cleaning ---------------------------------------------
  df_ml = pd.read_csv(fil)
  df_ml["Forecasttidspunkt"] = pd.to_datetime(df_ml["Forecasttidspunkt"])
  df_ml["periode_regnskabstal"] = pd.to_datetime(df_ml["periode_regnskabstal"])
  df_ml = df_ml.set_index("Forecasttidspunkt")

  # --- Drop columns that are not relevant as features ------------------------
  # Columns missing from the input are skipped (errors="ignore").
  df_ml = df_ml.rename(columns={"CUSIP": "cusip"})
  list_drop = ["SHROUT", "company_name", "cusip", "analyst_EPS_mean", "analyst_high",
               "analyst_low", "analyst_std_mean", "periode_regnskabstal", "PERMNO",
               "date", "slutmåned", "Instrument"]
  df_ml = df_ml.drop(columns=list_drop, errors="ignore")

  # --- Calendar features and categorical dtypes ------------------------------
  df_ml["month"] = df_ml.index.month
  df_ml["year"] = df_ml.index.year
  df_ml = df_ml.astype({"GVKEY": "category", "industry_fama": "category"})
  # Chronological order is required by the per-company shifts and by the
  # date-based slicing further down.
  df_ml = df_ml.sort_index()

  # --- New features and removal of duplicate / unused columns ----------------
  df_ml["volume_usd"] = df_ml["ALTPRC"] * df_ml["VOL"]
  df_ml["Div_yield"] = df_ml["DPS_ex_date(dvpsxq)"] / df_ml["ALTPRC"]
  df_ml = df_ml.drop(columns=["PRC", "adj_close"])

  # Feature-selection knob: columns listed here are removed before lagging.
  columns_drop_list = ["Assets_total(Atq)", "industry_return", "volume_usd", "VOL"]
  df_ml = df_ml.drop(columns=columns_drop_list)

  # --- Lagged variables -------------------------------------------------------
  # Candidate columns to lag; only those still present in the frame are used.
  columns_to_choose = ['Assets_total(Atq)', 'Accounts_payable(Apq)', 'COGS(Cogsq)',
       'Common_equity(Ceqq)', 'DPS_ex_date(dvpsxq)', 'Deprect_and_ammor(Dpq)',
       'EPS_lagged', 'Income taxes_total(Txtq)',
       'Non_operating_income(Nopiq)', 'Operating_income_after_deprec(Oiadpq)',
       'Opex_total(Xoprq)', 'PP&E_total_net(Ppentq)',
       'Pretax_income(Piq)', 'Receviables(Rectq)', 'Revenue_total(Revtq)',
       'market_cap', 'niq', "Div_yield", "ALTPRC", 'adj_return', "industry_return",
       "merafkast", 'VOL', 'volume_usd', 'PRC', 'adj_close']
  wanted = set(columns_to_choose)
  # Keep the frame's own column order so the lag columns are appended in a
  # deterministic order.
  lag_columns = [column for column in df_ml.columns if column in wanted]

  # Group a snapshot of the un-lagged columns; the lag columns are written
  # straight onto df_ml (row for row), so no self-merge is needed and rows
  # cannot be duplicated when a (GVKEY, date) pair occurs more than once.
  grouped = df_ml[["GVKEY"] + lag_columns].groupby("GVKEY")
  for column in lag_columns:
    for i in range(n_lags):
      df_ml[f'{column}_{i}'] = grouped[column].shift(i).values

  # --- Replace missing values (including those created by lagging) with 0 ----
  # NOTE(review): this also zero-fills missing EPS_actual targets - confirm
  # that is intended.
  numeric_columns = df_ml.select_dtypes(include=['number']).columns
  df_ml[numeric_columns] = df_ml[numeric_columns].fillna(0)

  # --- Chronological split ----------------------------------------------------
  training_df = df_ml.loc[:cutoff_date, :]
  validation_df = df_ml.loc[validation_start:validation_end, :]
  # Training + validation window, handed back for cross-validation.
  training_df_cv = df_ml.loc[:validation_end, :]

  # --- X and y ----------------------------------------------------------------
  training_y = training_df[target]
  validation_y = validation_df[target]
  training_y_cv = training_df_cv[target]

  training_x = training_df.drop(columns=target)
  validation_x = validation_df.drop(columns=target)
  training_x_cv = training_df_cv.drop(columns=target)

  # --- Baseline model ---------------------------------------------------------
  # NOTE(review): early_stopping_rounds (250) exceeds iterations (50), so the
  # overfitting detector cannot stop training early with these settings.
  cat_model = CatBoostRegressor(cat_features=cat_features, random_seed=2021,
                                loss_function="MAE", early_stopping_rounds=250,
                                iterations=50, max_ctr_complexity=0,
                                one_hot_max_size=5000)

  pool_val = Pool(validation_x, validation_y, cat_features=cat_features)

  cat_model.fit(training_x, training_y, eval_set=pool_val, verbose=False)

  print(cat_model.best_iteration_)
  print(cat_model.get_best_score())

  return training_df_cv, training_x_cv, training_y_cv


In [None]:
# Path of the (zipped) CSV file fed to ml_pipeline.
fil_sti = "input.csv.zip"

# Run the pipeline once and unpack what it returns: the full frame, the
# feature matrix and the EPS_actual target.
pipeline_outputs = ml_pipeline(fil_sti)
training_df_cv, training_x_cv, training_y_cv = pipeline_outputs

In [None]:
# Write the column names of the returned frame to an Excel sheet for inspection.
feature_names = pd.Series(training_df_cv.columns)
feature_names.to_excel("excel_features.xlsx")

In [None]:
# Row-by-row check that the lag-0 column matches the column it was built from.
training_df_cv["EPS_lagged"].eq(training_df_cv["EPS_lagged_0"])

In [None]:
# Show the raw column next to its lag-0 counterpart.
training_df_cv.loc[:, ["EPS_lagged", "EPS_lagged_0"]]

# Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Time-ordered splitter: 5 folds, each with a test block of 10000 rows.
# NOTE(review): the folds are cut by row position, not by date, so rows that
# share a date may end up on both sides of a fold boundary - confirm this is
# acceptable.
ts_cv = TimeSeriesSplit(n_splits=5,test_size=10000)

# Print the row positions of every fold as a sanity check.
for train_index, test_index in ts_cv.split(training_df_cv):
  print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# Baseline CatBoost parameters: MAE loss, fixed seed, both categorical columns declared.
# NOTE(review): one_hot_max_size is 0 here but 5000 in ml_pipeline and in the
# Optuna objective - confirm which value is intended.
paramsxxx = {"cat_features": ["GVKEY","industry_fama"], "random_seed": 2021, "loss_function": "MAE", "max_ctr_complexity" : 0,"one_hot_max_size":0}

In [None]:
from catboost import cv,Pool

# CatBoost pool over the frame returned by ml_pipeline, with both categorical
# columns declared.
pool_data = Pool(data=training_x_cv,label=training_y_cv, cat_features = ["GVKEY","industry_fama"])

# Short cross-validation run (50 iterations, early stopping after 3 rounds) on
# the time-ordered folds. `paramsxxx` is the parameter dict defined in the
# preceding cell; no variable called `params` exists on a fresh kernel.
# With as_pandas=False the result is a dict of per-iteration metric lists.
pandas_score = cv(pool=pool_data,params=paramsxxx, iterations=50, early_stopping_rounds=3,folds=ts_cv,verbose=False,as_pandas=False)

# Optuna hyperparameter tuning

In [None]:
from optuna.integration.tensorboard import TensorBoardCallback

from azureml.tensorboard import Tensorboard

# TensorBoard instance reading from the local "logs/" directory; the first
# positional argument is an empty list.
tb = Tensorboard([],local_root="logs/")

# stop() is called before start().
# NOTE(review): presumably this shuts down an instance left over from an
# earlier run of this cell - confirm.
# If successful, start() returns a string with the URI of the instance.
tb.stop()
tb.start()

# After your job completes, be sure to call stop(); otherwise the streaming will continue to run.

In [None]:
! pip install --quiet optuna
import optuna
import numpy as np

from optuna.samplers import TPESampler
sampler = TPESampler(seed=2021)
pool_data = Pool(data=training_x_cv,label=training_y_cv, cat_features = ["GVKEY","industry_fama"])
# Optuna objective: samples CatBoost hyperparameters and returns the metric to be minimized.
def objective(trial):
    """Score one hyperparameter sample with CatBoost cross-validation.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``colsample_bylevel``, ``depth``, ``boosting_type``,
        ``bootstrap_type`` and, depending on the bootstrap type,
        ``bagging_temperature`` (Bayesian) or ``subsample`` (Bernoulli).

    Returns
    -------
    float
        The lowest value of ``test-MAE-mean`` (the per-iteration MAE averaged
        over the folds of ``ts_cv``) reached during the run. Lower is better.

    Relies on the notebook globals ``cv``, ``pool_data`` and ``ts_cv``.
    """
    # Fixed settings shared by every trial, followed by the sampled ones.
    param = {
        "random_state": 2021,
        "verbose": False,
        "loss_function": "MAE",
        "max_ctr_complexity": 0,
        "one_hot_max_size": 5000,
        "cat_features": ["GVKEY","industry_fama"],
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        )
    }

    # Bootstrap-specific parameters; MVS gets no extra parameter here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Cross-validate on the time-ordered folds; with as_pandas=False the
    # result maps metric names to per-iteration lists.
    cv_results = cv(pool=pool_data,params=param, iterations=1000, early_stopping_rounds=250,folds=ts_cv,verbose=False,as_pandas=False)

    # Report the best (lowest) mean test MAE rather than the value of the last
    # iteration, so a trial is not penalised for iterations run after its
    # optimum. When the history already ends at the optimum the two coincide.
    return min(cv_results["test-MAE-mean"])

# Optuna callback that is passed to study.optimize below; it is configured with
# the "logs/" directory and the metric name "MAE".
tensorboard_callback = TensorBoardCallback("logs/", metric_name="MAE")
# Create the study (minimizing the objective, seeded sampler) and run the search.
# timeout=43200 seconds, i.e. the search stops after 12 hours.
if __name__ == "__main__":
    study = optuna.create_study(direction="minimize",sampler=sampler)
    study.optimize(objective,timeout=43200,callbacks=[tensorboard_callback])

    # Report: number of trials, then the value and parameters of the best trial.
    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


# Save the study object to a pickle file with joblib.
# Note: `study` is only defined when the guarded block above has run.

import joblib
joblib.dump(study,"hyperparametertuning_study.pkl")