# Funktioner

In [None]:
# Preprocessing data

def ml_pipeline (fil,industri):
  """Train a CatBoost EPS model for one industry and predict its test period.

  The function reads the input CSV, drops identifier/analyst columns, derives
  calendar and price features, adds lags 1-5 (per ``GVKEY``) of the accounting
  and market columns, restricts the data to ``industri`` and splits it by date:

  * training:   index up to and including 2016-06-30
  * validation: 2016-09-30 through 2018-06-30 (used as CatBoost ``eval_set``)
  * test:       2018-09-30 and later

  Args:
    fil: Path (or anything ``pd.read_csv`` accepts) of the input data. The
      file must contain ``Forecasttidspunkt``, ``periode_regnskabstal``,
      ``GVKEY``, ``industry_fama``, ``EPS_actual`` and the columns used below.
    industri: Value of ``industry_fama`` selecting the rows to model.

  Returns:
    The array returned by ``CatBoostRegressor.predict`` for the test rows,
    in the row order of the (date-sorted) test frame.

  Side effects:
    Prints the best iteration and best score of the fitted model.
  """
  # Imported here so that defining the function does not require catboost,
  # and a missing install fails before the (potentially slow) CSV load.
  from catboost import CatBoostRegressor
  from catboost import Pool

  # Load the data and index it by forecast date.
  df_ml = pd.read_csv(fil)
  df_ml["Forecasttidspunkt"] = pd.to_datetime(df_ml["Forecasttidspunkt"])
  df_ml["periode_regnskabstal"] = pd.to_datetime(df_ml["periode_regnskabstal"])
  df_ml.set_index("Forecasttidspunkt",inplace=True)

  # Drop columns that must not be used as features (missing ones are ignored).
  df_ml.rename(columns={"CUSIP":"cusip"},inplace=True)
  list_drop = ["SHROUT","company_name","cusip","analyst_EPS_mean","analyst_high","analyst_low","analyst_std_mean","periode_regnskabstal","PERMNO","date","slutmåned","Instrument"]
  df_ml.drop(list_drop,axis=1,inplace=True,errors="ignore")

  # Calendar features and categorical dtypes.
  df_ml["month"] = df_ml.index.month
  df_ml["year"] = df_ml.index.year
  df_ml = df_ml.astype({'GVKEY': 'category', 'industry_fama': 'category'})
  df_ml.sort_index(inplace=True)

  # Derived features, then removal of duplicate / unwanted columns.
  df_ml["volume_usd"] = df_ml["ALTPRC"] * df_ml["VOL"]
  df_ml["Div_yield"] = df_ml["DPS_ex_date(dvpsxq)"] / df_ml["ALTPRC"]
  df_ml.drop(["PRC","adj_close"],axis=1,inplace=True)
  df_ml.drop(["Assets_total(Atq)","merafkast"],axis=1,inplace=True)

  # --- Lagged variables -----------------------------------------------------
  # Candidate columns for the lag frame; names that are no longer present in
  # df_ml (e.g. the ones dropped above) are filtered out by the intersection.
  columns_lag = ['EPS_actual', 'Accounts_payable(Apq)',
       'Assets_total(Atq)', 'COGS(Cogsq)',
       'Common_equity(Ceqq)', 'DPS_ex_date(dvpsxq)', 'Deprect_and_ammor(Dpq)',
       'EPS_lagged', 'GVKEY', 'Income taxes_total(Txtq)',
       'Non_operating_income(Nopiq)', 'Operating_income_after_deprec(Oiadpq)',
       'Opex_total(Xoprq)', 'PP&E_total_net(Ppentq)','PRC',
       'Pretax_income(Piq)', 'Receviables(Rectq)', 'Revenue_total(Revtq)',
    'VOL', 'adj_return','adj_close','market_cap', 'niq', 'industry_return', 'merafkast', 'ALTPRC','volume_usd','Div_yield']

  # Columns whose lags 1-5 are kept as features.
  columns_to_choose = ['Assets_total(Atq)','Accounts_payable(Apq)', 'COGS(Cogsq)',
       'Common_equity(Ceqq)', 'DPS_ex_date(dvpsxq)', 'Deprect_and_ammor(Dpq)',
       'EPS_lagged', 'Income taxes_total(Txtq)',
       'Non_operating_income(Nopiq)', 'Operating_income_after_deprec(Oiadpq)',
       'Opex_total(Xoprq)', 'PP&E_total_net(Ppentq)',
       'Pretax_income(Piq)', 'Receviables(Rectq)', 'Revenue_total(Revtq)',
       'market_cap', 'niq',"Div_yield","ALTPRC",'adj_return',"industry_return","merafkast",'VOL','volume_usd','PRC','adj_close']

  df_lag = df_ml[df_ml.columns.intersection(columns_lag)]

  # Only columns listed in columns_to_choose are lagged: lags of any other
  # column (GVKEY, EPS_actual) would be discarded by the selection below.
  # The lag arrays are collected first and attached with a single concat,
  # which avoids fragmenting the frame with one insert per lag column.
  # `.values` strips the (duplicated) date index so rows align by position.
  wanted_lags = set(columns_to_choose)
  per_company = df_lag.groupby("GVKEY")
  lagged = {}
  for column in df_lag.columns:
    if column not in wanted_lags:
      continue
    for i in range(1,6):
      lagged[f'{column}_{i}'] = per_company[column].shift(i).values
  df_lag_new = pd.concat([df_lag, pd.DataFrame(lagged, index=df_lag.index)], axis=1)

  col_lag_choose = [f'{element}_{i}' for element in columns_to_choose for i in range(1,6)]
  col_lag_choose.append("GVKEY")

  # Keep only the final lag features plus the GVKEY merge key.
  df_lag_new = df_lag_new[df_lag_new.columns.intersection(col_lag_choose)]

  # Merge the lags back onto the original frame.
  df_ml = df_ml.reset_index().merge(df_lag_new.reset_index(),how="inner",on=["GVKEY","Forecasttidspunkt"])

  # Replace missing values with 0. NOTE: this applies to every numeric column
  # of the merged frame, not only the lag columns (so a numeric EPS_actual
  # with missing values is zero-filled as well).
  numeric_columns = df_ml.select_dtypes(include=['number']).columns
  df_ml[numeric_columns] = df_ml[numeric_columns].fillna(0)

  # Restore the date index and keep only the requested industry.
  df_ml.set_index("Forecasttidspunkt",inplace=True)
  df_ml = df_ml[df_ml.industry_fama==industri]

  # --- Date-based train / validation / test split ---------------------------
  cutoff_date = "2016-06-30"
  validation_start = "2016-09-30"
  validation_end = "2018-06-30"
  test_start = "2018-09-30"

  training_df = df_ml.loc[:cutoff_date,:]
  validation_df = df_ml.loc[validation_start:validation_end,:]
  test_df = df_ml.loc[test_start:,:]

  # Features (X) and target (y).
  training_y = training_df["EPS_actual"]
  validation_y = validation_df["EPS_actual"]

  training_x = training_df.drop("EPS_actual",axis=1)
  validation_x = validation_df.drop("EPS_actual",axis=1)
  test_x = test_df.drop("EPS_actual",axis=1)

  # --- Model ----------------------------------------------------------------
  cat_model = CatBoostRegressor(cat_features=["GVKEY","industry_fama"],random_seed=2021,boosting_type="Plain",bootstrap_type ="MVS",
                              colsample_bylevel =0.09625657571631001, depth=10, iterations=13000,one_hot_max_size = 5000,
                              max_ctr_complexity=0, early_stopping_rounds=250)

  # The validation period is used as eval_set for early stopping.
  pool_val = Pool(validation_x,validation_y,cat_features=["GVKEY","industry_fama"])

  cat_model.fit(training_x,training_y,eval_set=pool_val,verbose=False)

  print(cat_model.best_iteration_)
  print(cat_model.get_best_score())

  test_pred = cat_model.predict(test_x)
  return test_pred


In [None]:
# Mase function
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import numpy as np
def mase(actual_eps_training,lagged_eps,model_predicted,actual_eps_test):
  """Return the mean absolute scaled error (MASE, per Hyndman's definition).

  The test-set forecast errors are scaled by the mean absolute error of the
  naive forecast on the training data.

  Args:
    actual_eps_training: Actual EPS values from the training period.
    lagged_eps: Lagged EPS from the training period (the naive forecast).
    model_predicted: Forecasts for the test period.
    actual_eps_test: Actual EPS values from the test period.

  Returns:
    The mean of the absolute scaled errors.
  """
  # In-sample error of the naive forecast; this is the scaling factor.
  naive_mae = mean_absolute_error(actual_eps_training,lagged_eps)
  scaled_errors = (actual_eps_test - model_predicted) /naive_mae
  return np.mean(np.abs(scaled_errors))


  # Metric Dataframe
def metric_dataframe (test_predicted,industri, fil="/content/drive/MyDrive/Cand.fælles/Speciale/datarobot_input/input.csv.zip"):
  """Build the train and test frames used to score predictions for one industry.

  The file is read, indexed and sorted by ``Forecasttidspunkt`` and filtered
  to ``industri``. Rows up to and including 2018-06-30 form the training
  frame; rows from 2018-09-30 onwards form the test frame, which receives the
  predictions in a ``Model_predicted`` column. Test rows without an analyst
  estimate (``analyst_EPS_mean`` missing) are then removed.

  Args:
    test_predicted: Predictions for the test rows of the chosen industry,
      one value per test row, in date-sorted row order.
    industri: Value of ``industry_fama`` selecting the rows to evaluate.
    fil: Path to the data file as it looks before the ML cleaning step.

  Returns:
    Tuple ``(train_df, test_df)`` of independent DataFrames.
  """
  df = pd.read_csv(fil)
  df["Forecasttidspunkt"] = pd.to_datetime(df["Forecasttidspunkt"])
  df.set_index("Forecasttidspunkt",inplace=True)
  df.sort_index(inplace=True)
  df = df[df.industry_fama ==industri]

  train_end = "2018-06-30"
  test_start = "2018-09-30"

  # Take explicit copies: adding a column to (or dropping rows in place from)
  # a .loc slice is chained assignment and triggers SettingWithCopyWarning.
  train_df = df.loc[:train_end,:].copy()
  test_df = df.loc[test_start:,:].copy()

  test_df["Model_predicted"] = test_predicted
  # Only rows with an analyst estimate can be compared against the analysts.
  test_df = test_df.dropna(subset=["analyst_EPS_mean"])
  return train_df,test_df

  # Metric Calculations
def metric_calculations (metric_test_df,metric_train_df):
  """Compare analyst and model forecasts on MAE, RMSE and MASE.

  Args:
    metric_test_df: Test frame with the columns ``EPS_actual``,
      ``analyst_EPS_mean`` and ``Model_predicted``.
    metric_train_df: Training frame with the columns ``EPS_actual`` and
      ``EPS_lagged``; used only to scale the MASE.

  Returns:
    DataFrame with rows ``"Analytiker"`` and ``"Model"`` and columns
    ``"MAE"``, ``"RMSE"`` and ``"MASE"``.
  """
  actual = metric_test_df["EPS_actual"]
  analyst_forecast = metric_test_df["analyst_EPS_mean"]
  model_forecast = metric_test_df["Model_predicted"]

  # MAE
  analytiker_mae = mean_absolute_error(actual,analyst_forecast)
  model_mae = mean_absolute_error(actual,model_forecast)

  # RMSE: take the square root of the MSE explicitly instead of passing
  # squared=False, which newer scikit-learn releases no longer accept.
  analytiker_rmse = np.sqrt(mean_squared_error(actual,analyst_forecast))
  model_rmse = np.sqrt(mean_squared_error(actual,model_forecast))

  # MASE, scaled by the naive (lagged EPS) forecast error on the training data.
  train_actual = metric_train_df["EPS_actual"]
  train_naive = metric_train_df["EPS_lagged"]
  analytiker_mase = mase(train_actual,train_naive,analyst_forecast,actual)
  model_mase = mase(train_actual,train_naive,model_forecast,actual)

  metric_df = pd.DataFrame(
    {"MAE":[analytiker_mae,model_mae],"RMSE": [analytiker_rmse,model_rmse],"MASE": [analytiker_mase,model_mase]},
    index=["Analytiker","Model"],
  )
  return metric_df

  

In [None]:
import numpy as np
import pandas as pd

# Industri model

In [None]:
! pip install catboost
import joblib

## Q1

In [None]:
# Industries to model; one model is trained and scored per industry.
industrier_liste = [
    'Healthcare', 'Manufacturing', 'Business Equipment',
    'Wholesale/Retail', 'finance', 'Chemicals and Allied products',
    'Consumer Nondurables', 'Energy', 'telecom', 'Consumer durables',
]

# Result containers, keyed by industry name.
Q1_testpred = {}
Q1_train = {}
Q1_test = {}
Q1_performance = {}

# Input file for the Q1 run (the same file for every industry).
fil_sti = "input.csv.zip"

for industri__valg in industrier_liste:
  # Train the model and predict the test period.
  Q1_testpred[industri__valg] = ml_pipeline(fil_sti, industri=industri__valg)
  # Build the evaluation frames and attach the predictions.
  Q1_train[industri__valg], Q1_test[industri__valg] = metric_dataframe(
      test_predicted=Q1_testpred[industri__valg],
      fil=fil_sti,
      industri=industri__valg,
  )
  # Score model vs. analysts.
  Q1_performance[industri__valg] = metric_calculations(
      Q1_test[industri__valg], Q1_train[industri__valg]
  )

# Persist all outputs with joblib.
joblib.dump(Q1_testpred, "Q1_testpred_industri.pkl")
joblib.dump(Q1_train, "Q1_train_industri.pkl")
joblib.dump(Q1_test, "Q1_test_industri.pkl")
joblib.dump(Q1_performance, "Q1_performance_industri.pkl")

## Q2

In [None]:
# Industries to model; one model is trained and scored per industry.
industrier_liste = [
    'Healthcare', 'Manufacturing', 'Business Equipment',
    'Wholesale/Retail', 'finance', 'Chemicals and Allied products',
    'Consumer Nondurables', 'Energy', 'telecom', 'Consumer durables',
]

# Result containers, keyed by industry name.
Q2_testpred = {}
Q2_train = {}
Q2_test = {}
Q2_performance = {}

# Input file for the Q2 run (the same file for every industry).
fil_sti = "input_2Q.csv.zip"

for industri__valg in industrier_liste:
  # Train the model and predict the test period.
  Q2_testpred[industri__valg] = ml_pipeline(fil_sti, industri=industri__valg)
  # Build the evaluation frames and attach the predictions.
  Q2_train[industri__valg], Q2_test[industri__valg] = metric_dataframe(
      test_predicted=Q2_testpred[industri__valg],
      fil=fil_sti,
      industri=industri__valg,
  )
  # Score model vs. analysts.
  Q2_performance[industri__valg] = metric_calculations(
      Q2_test[industri__valg], Q2_train[industri__valg]
  )

# Persist all outputs with joblib.
joblib.dump(Q2_testpred, "Q2_testpred_industri.pkl")
joblib.dump(Q2_train, "Q2_train_industri.pkl")
joblib.dump(Q2_test, "Q2_test_industri.pkl")
joblib.dump(Q2_performance, "Q2_performance_industri.pkl")

## Q4

In [None]:
# Industries to model; one model is trained and scored per industry.
industrier_liste = [
    'Healthcare', 'Manufacturing', 'Business Equipment',
    'Wholesale/Retail', 'finance', 'Chemicals and Allied products',
    'Consumer Nondurables', 'Energy', 'telecom', 'Consumer durables',
]

# Result containers, keyed by industry name.
Q4_testpred = {}
Q4_train = {}
Q4_test = {}
Q4_performance = {}

# Input file for the Q4 run (the same file for every industry).
fil_sti = "input_4Q.csv.zip"

for industri__valg in industrier_liste:
  # Train the model and predict the test period.
  Q4_testpred[industri__valg] = ml_pipeline(fil_sti, industri=industri__valg)
  # Build the evaluation frames and attach the predictions.
  Q4_train[industri__valg], Q4_test[industri__valg] = metric_dataframe(
      test_predicted=Q4_testpred[industri__valg],
      fil=fil_sti,
      industri=industri__valg,
  )
  # Score model vs. analysts.
  Q4_performance[industri__valg] = metric_calculations(
      Q4_test[industri__valg], Q4_train[industri__valg]
  )

# Persist all outputs with joblib.
joblib.dump(Q4_testpred, "Q4_testpred_industri.pkl")
joblib.dump(Q4_train, "Q4_train_industri.pkl")
joblib.dump(Q4_test, "Q4_test_industri.pkl")
joblib.dump(Q4_performance, "Q4_performance_industri.pkl")

In [None]:
# Display the Q1 metric table (analyst vs. model) for the "Business Equipment" industry.
Q1_performance["Business Equipment"]