# Opsætning af figur layout


In [None]:
import matplotlib.pyplot as plt

# Reload matplotlib's style library, then activate the "science" and "ieee"
# style sheets for every figure created afterwards.
# NOTE(review): these are not built-in matplotlib styles — presumably they
# come from the SciencePlots package, whose newer releases need an explicit
# `import scienceplots` before the styles are registered — confirm.
plt.style.reload_library()
plt.style.use(["science", "ieee"])


In [None]:
# List every configurable rcParams key for reference.
plt.rcParams.keys()

In [None]:
from matplotlib import cycler
import seaborn as sns
import pandas as pd

# Shared figure defaults: no minor ticks, no ticks on the top/right sides,
# a colour cycle taken from seaborn's "Blues_r" palette and a default figure
# size of 4.2 x 2.1.
_blues_cycle = cycler("color", sns.color_palette("Blues_r").as_hex())
plt.rcParams.update(
    {
        "ytick.minor.visible": False,
        "xtick.minor.visible": False,
        "axes.prop_cycle": _blues_cycle,
        "figure.figsize": (4.2, 2.1),
        "xtick.top": False,
        "ytick.right": False,
    }
)


# Metode

## Oversigt over observationer over tid

In [None]:
data_metode = pd.read_csv("content/drive/MyDrive/Cand.fælles/Speciale/datarobot_input/input.csv.zip")

In [None]:
data_metode["year"] = pd.to_datetime(data_metode["Forecasttidspunkt"]).dt.year

In [None]:
year_obs = data_metode.groupby("year")["EPS_actual"].count()


In [None]:
year_obs.rename("Obervationer over tid",inplace=True)

In [None]:
year_obs.sum()

In [None]:
årstal = year_obs.index

In [None]:
year_obs

In [None]:
# Count plot: number of observations per year.
års_obs = sns.catplot(
    data=data_metode,
    x="year",
    kind="count",
    color="#084488",
    height=2.10,
    aspect=2,
)
års_obs.set_xticklabels(rotation=90)
års_obs.set_xlabels("Year")
års_obs.set_ylabels("Antal observationer")
# top=False / right=False keep the top and right spines in place.
års_obs.despine(top=False, right=False)
plt.savefig("nytsnsplot.pgf")


## Oversigt over fordelingen af observationer pr. industri


In [None]:
print(sns.color_palette("Blues_r",12).as_hex())

In [None]:
sns.color_palette("bone")

In [None]:
industri_opdeling = data_metode.groupby("industry_fama")["Forecasttidspunkt"].count()

In [None]:
data_metode.rename(columns={"industry_fama": "industri"},inplace=True)

In [None]:
# Capitalise the first letter of each industry name (rest lower-cased).
# The vectorised .str accessor passes missing values through as NaN, whereas
# the previous per-element lambda raised on any non-string entry.
data_metode["industri"] = data_metode["industri"].str.capitalize()

In [None]:
# Shorten the long industry names.
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection is deprecated and leaves the DataFrame unchanged when
# pandas Copy-on-Write is active.
_industri_forkortelser = {
    "Chemicals and allied products": "Chemicals and A.P",
    "Consumer nondurables": "Consumer N.D",
    "Wholesale \\& retail": "Wholesale and Retail",
}
data_metode["industri"] = data_metode["industri"].replace(_industri_forkortelser)

In [None]:
# Show the distinct industry names after the edits above.
data_metode.industri.unique()

In [None]:
# Count plot: number of observations per industry.
industri_obs = sns.catplot(
    data=data_metode,
    x="industri",
    kind="count",
    palette=sns.color_palette("Blues_r", 12),
    height=2.10,
    aspect=2,
)
industri_obs.set_xticklabels(rotation=90)
industri_obs.set_xlabels("Industri")
industri_obs.set_ylabels("Antal observationer")
# top=False / right=False keep the top and right spines in place.
industri_obs.despine(top=False, right=False)
plt.savefig("industriobs.pgf")





## EPS distribution

In [None]:
# Show both extremes of EPS_actual. A notebook cell only displays its last
# expression, so two separate min()/max() calls would hide the minimum.
data_metode.EPS_actual.agg(["min", "max"])


In [None]:
data_metode

In [None]:
data_metode.rename(columns={'EPS_actual': 'Realiseret EPS'},inplace=True)

In [None]:
# Histogram of realised EPS (75 bins).
sns.histplot(data=data_metode, x="Realiseret EPS", bins=75, color="#084488")
plt.ylabel("Antal observationer")

# Hard-coded min/max annotations, each placed at x = -10.
for _tekst, _y in ((r'Min: $-10,8$', 25000), (r'Max: $3,82$', 22000)):
    plt.annotate(_tekst, xy=(-10, _y))

for _filnavn in ("epshistogram.pdf", "epshistogram.pgf"):
    plt.savefig(_filnavn)

## Imputede værdier

In [None]:
imputede_værdier = pd.read_excel("content/drive/MyDrive/Cand.fælles/Speciale/Imputede værdier.xlsx")

In [None]:
imputede_værdier.drop(17,inplace=True)

In [None]:
imputede_værdier

In [None]:
# Escape "&" with a backslash in the variable names and shorten one label.
# Raw strings keep the backslash literally; "\&" in a normal string literal is
# an invalid escape sequence that Python warns about.
imputede_værdier.replace(
    {
        "SG&A": r"SG\&A",
        "PP&E": r"PP\&E",
        "Non Operating income": "N.O. income",
    },
    inplace=True,
)

In [None]:
imputede_værdier.rename(columns={"Inputvairable": "Inputvariable"},inplace=True)

In [None]:
# Sorterede værdier - Fjern denne
imputede_værdier.sort_values(by="Imputede værdier",ascending=False,inplace=True)

In [None]:
# Bar chart of imputed values per input variable.
imputede_graf = sns.catplot(
    data=imputede_værdier,
    x="Inputvariable",
    y="Imputede værdier",
    kind="bar",
    palette=sns.color_palette("Blues_r", 18),
    height=2.1,
    aspect=2,
)
imputede_graf.set_xticklabels(rotation=90)
# right=False / top=False keep the right and top spines in place.
imputede_graf.despine(right=False, top=False)
for _filnavn in ("imputing.pgf", "imputing.pdf"):
    imputede_graf.savefig(_filnavn)


**Alternativ graf uden brug af FacetGrid, hvor rotationen af akse-etiketterne dog er lidt mere drilsk. Fordelen er, at vi nu har en hel boks hele vejen rundt om figuren.**

In [None]:
# Same bar chart drawn directly on the current axes.
sns.barplot(
    data=imputede_værdier,
    x="Inputvariable",
    y="Imputede værdier",
    palette=sns.color_palette("Blues_r", 18),
)
axis_rot = plt.gca()
# Rotate every x tick label by 90 degrees.
for _etiket in axis_rot.get_xticklabels():
    _etiket.set_rotation(90)

# Modellering

## Hyperparameter historie

In [None]:
# NOTE(review): optuna is not referenced by name below — presumably it is
# imported so the pickled study object can be restored; confirm.
import optuna
import joblib
# Load the stored hyperparameter-tuning study.
study = joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/hyperparametertuning_study.pkl")

In [None]:
# Show all trials as a table.
study.trials_dataframe()

In [None]:
study.best_trial

In [None]:
# Keep the trials table for plotting.
trials_df = study.trials_dataframe()

In [None]:
# Objective value of every trial against its trial number.
plt.scatter(x=trials_df["number"], y=trials_df["value"], s=2)
plt.xlabel("Kørsels nr.")
plt.ylabel("MAE")
plt.ylim(0.30, 0.45)
# Hard-coded red star at (155, 0.3267).
# NOTE(review): presumably the best trial shown by study.best_trial — confirm.
plt.scatter(155, 0.3266674125188159, marker="*", color="red", s=6)

for _filnavn in ("hyperparamertuning.pdf", "hyperparamertuning.pgf"):
    plt.savefig(_filnavn)

In [None]:
#trials_df.rename(columns=dic_renamer_trials,inplace=True)
#dic_renamer_trials = {"params_bagging_temperature": "Bagging temperature",	"params_boosting_type": "Boosting type",	"params_bootstrap_type": "Bootstrap type",	"params_colsample_bylevel": "Colsample by level",	"params_depth" : "Depth",	"params_subsample": "Subsample"}

## Hyperparametre

In [None]:
# Hard-coded importance score per tuned hyperparameter.
hyper_features = {
    "Depth": 0.54,
    "Bootstrap type": 0.26,
    "Colsample by level": 0.19,
    "Boosting type": 0.01,
}

In [None]:
# Horizontal bar chart of the importance scores, sorted ascending.
_importance_df = pd.DataFrame.from_dict(
    hyper_features,
    orient="index",
    columns=["Parametre importance"],
)
_importance_df.sort_values("Parametre importance").plot.barh()
plt.legend(loc="lower right")

for _filnavn in (
    "hyperparamertuning_parametre.pdf",
    "hyperparamertuning_parametre.pgf",
):
    plt.savefig(_filnavn)

## Cross Validation

In [None]:
# Load the cross-validation results; decimal="," makes pandas read a comma
# as the decimal separator.
cross_val_df = pd.read_csv("content/drive/MyDrive/Cand.fælles/Speciale/Cross Validation -Datarobot vs Catboost - Værdier.csv",decimal=",")

In [None]:
cross_val_df

In [None]:
# Average MAE against run time, one colour per model.
sns.scatterplot(
    x=cross_val_df["Tid(sekunder)"],
    y=cross_val_df["Gennemsnitlig MAE"],
    hue=cross_val_df["Model Navn"],
    palette=sns.color_palette("Blues_r", 5),
    s=8,
)
for _filnavn in ("crossvaldatarobot.pdf", "crossvaldatarobot.pgf"):
    plt.savefig(_filnavn)

In [None]:
# Horizontal bars: average MAE per model, x-axis limited to 0.2-0.45.
sns.barplot(data=cross_val_df, x="Gennemsnitlig MAE", y="Model Navn")
plt.xlim((0.2, 0.45))
for _filnavn in ("crossvaldatarobot_bar.pdf", "crossvaldatarobot_bar.pgf"):
    plt.savefig(_filnavn)

# Analyse

## Feature Importance

In [None]:
feature_imp_q1 = pd.read_excel("content/drive/MyDrive/Cand.fælles/Speciale/Feature Importance.xlsx",sheet_name="Q1")
feature_imp_q2 = pd.read_excel("content/drive/MyDrive/Cand.fælles/Speciale/Feature Importance.xlsx",sheet_name="Q2")
feature_imp_q4 = pd.read_excel("content/drive/MyDrive/Cand.fælles/Speciale/Feature Importance.xlsx",sheet_name="Q4")

In [None]:
feature_imp_q1.replace({"Sum af 104 resterende inputvariable": "Sum af rest"},inplace=True)
feature_imp_q2.replace({"Sum af 104 resterende inputvariable": "Sum af rest"},inplace=True)
feature_imp_q4.replace({"Sum af 104 resterende inputvariable": "Sum af rest"},inplace=True)

In [None]:
# One stand-alone SHAP bar chart per forecast horizon, each saved to its own PDF.
for _tabel, _filnavn in (
    (feature_imp_q1, "featuresq1.pdf"),
    (feature_imp_q2, "featuresq2.pdf"),
    (feature_imp_q4, "featuresq4.pdf"),
):
    fig1, ax1 = plt.subplots(1, 1, figsize=(1, 3))
    sns.barplot(
        data=_tabel,
        x="SHAP",
        y="Inputvariable",
        palette=sns.color_palette("Blues_r", 21),
        ax=ax1,
    )
    fig1.savefig(_filnavn)

In [None]:
fig,ax =plt.subplots(1,3,figsize=(5,3))
sns.barplot(data=feature_imp_q1,x="SHAP",y="Inputvariable",palette=sns.color_palette("Blues_r",21),ax=ax[0])
sns.barplot(data=feature_imp_q2,x="SHAP",y="Inputvariable",palette=sns.color_palette("Blues_r",21),ax=ax[1])
sns.barplot(data=feature_imp_q4,x="SHAP",y="Inputvariable",palette=sns.color_palette("Blues_r",21),ax=ax[2])

In [None]:
ax[0].set_xlim(0,0.30)
ax[1].set_xlim(0,0.30)
ax[2].set_xlim(0,0.30)

In [None]:
fig.tight_layout()

In [None]:
fig

In [None]:
fig.savefig("feature.pdf")

## Market cap

In [None]:
# Load the pickled test-set collections per horizon and concatenate each
# into a single frame.
Q1 = pd.concat(joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/Q1_test_marketcap.pkl"))
Q2 = pd.concat(joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/Q2_test_marketcap.pkl"))
Q4 = pd.concat(joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/Q4_test_marketcap.pkl"))

# Drop index level 0 and tag every row with its forecast horizon.
for _horisont, _tabel in ((1, Q1), (2, Q2), (4, Q4)):
    _tabel.reset_index(0, drop=True, inplace=True)
    _tabel["horizon"] = _horisont

Q_samlet = pd.concat([Q1, Q2, Q4])

# Rescale market cap by a factor of one million.
Q_samlet["market_cap"] = Q_samlet["market_cap"] / 1000000

In [None]:
# One subset per market-cap category.
Q_small = Q_samlet[Q_samlet["cap_category"] == "small_cap"]
Q_mid = Q_samlet[Q_samlet["cap_category"] == "mid_cap"]
Q_large = Q_samlet[Q_samlet["cap_category"] == "large_cap"]

In [None]:
# Share of observations per market-cap category for each forecast horizon.
x1 = Q1.cap_category.value_counts(normalize=True)
x2 = Q2.cap_category.value_counts(normalize=True)
x4 = Q4.cap_category.value_counts(normalize=True)

# value_counts() orders the categories by frequency, while pie() attaches its
# labels purely by position. Put the shares in one fixed category order so a
# label can never end up on the wrong wedge, and so the three pies use the
# same order. A category absent from a horizon gets a share of 0.
_cap_labels = {"small_cap": "small", "mid_cap": "mid", "large_cap": "big"}

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for _axis, _title, _shares in zip(axes, ("Q1", "Q2", "Q4"), (x1, x2, x4)):
    _ordered = _shares.reindex(list(_cap_labels), fill_value=0)
    _, labels, _ = _axis.pie(
        _ordered.values,
        labels=list(_cap_labels.values()),
        autopct='%1.1f%%',
        shadow=False,
        startangle=90,
    )
    plt.setp(labels, fontsize=15)

    # Equal aspect ratio so each pie is drawn as a circle.
    _axis.axis('equal')
    _axis.set_title(_title, fontsize=20)




In [None]:
print("test")

In [None]:
fig.savefig("marketcap.pgf")
fig.savefig("marketcap.pdf")

## Længden af træningsdata

In [None]:
q1_længde = joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/robusthed/Q1_performance_trainchange")
q2_længde = joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/robusthed/Q2_performance_trainchange")
q4_længde = joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/robusthed/Q4_performance_trainchange")

In [None]:
q1_perf_samlet = pd.concat(q1_længde)
q2_perf_samlet = pd.concat(q2_længde)
q4_perf_samlet = pd.concat(q4_længde)

q1_perf_samlet["horizon"] = 1
q2_perf_samlet["horizon"] = 2
q4_perf_samlet["horizon"] = 4

In [None]:
df_samlet_perf = pd.concat([q1_perf_samlet, q2_perf_samlet, q4_perf_samlet])

# Flatten the index once (it was previously computed twice) and keep only the
# MAE outperformance rows. The explicit copy makes plot_df an independent
# frame, so assigning columns on it does not raise SettingWithCopyWarning.
_perf_flat = df_samlet_perf.reset_index()
plot_df = _perf_flat[_perf_flat["level_1"] == "Outperformance MAE(%)"].copy()

# Parse the first index level as dates.
plot_df["level_0"] = pd.to_datetime(plot_df["level_0"])

In [None]:
liste_ = []

cutoff = pd.Timestamp("2018-06-30")

# Whole calendar years between each date in level_0 and the cutoff year.
plot_df["year"] = plot_df["level_0"].apply(lambda dato: cutoff.year - dato.year)

# Index by that distance, largest value first.
plot_df.set_index("year", inplace=True)
plot_df.sort_index(ascending=False, inplace=True)


In [None]:
plot_df

In [None]:
# Give the value column (labelled 0) a descriptive name.
plot_df.rename(columns={0:"Outperformance"},inplace=True)

In [None]:
# Tick locator classes, used for the x-axis of the line plot.
import matplotlib.ticker as ticker


In [None]:
# Outperformance against the index of plot_df, one line per forecast horizon.
sns.lineplot(
    data=plot_df,
    x=plot_df.index,
    y="Outperformance",
    hue="horizon",
    palette=sns.color_palette("Blues_r", 3),
)
ax_længde = plt.gca()
ax_længde.invert_xaxis()
ax_længde.set_ylim((-0.03, 0.03))

# Create the legend once, with its title. Previously a second plt.legend()
# call replaced the titled legend, so the title never reached the figure.
leg = plt.legend(title="Forecast horisont")
for _tekst, _etiket in zip(
    leg.get_texts(), (r'$h=1$', r'$h=2$', r'$h=4$')
):
    _tekst.set_text(_etiket)

# Explicit descending limits: the x-axis runs from 15.5 down to 2.5.
plt.xlim((15.5, 2.5))
plt.xlabel("Længde af træningsdata (år)")
ax_længde.xaxis.set_major_locator(ticker.IndexLocator(base=2, offset=0))

plt.savefig("træningsdata_robusthed.pgf")
plt.savefig("træningsdata_robusthed.pdf")


In [None]:
# Show which horizon values are present in plot_df.
plot_df.horizon.unique()

## Rolling

In [None]:
import joblib
import pandas as pd

# Load the stored rolling results and add the year of each index entry as a
# column.
plot_rolling = joblib.load(
    "content/drive/MyDrive/Cand.fælles/Speciale/Python/robusthed/plot_df_rolling.pkl"
)
_rolling_datoer = pd.to_datetime(plot_rolling.index)
plot_rolling["year"] = _rolling_datoer.year



In [None]:
plot_rolling

In [None]:
plot_rolling_long = plot_rolling.melt(id_vars="year")
plot_rolling_long.rename(columns={"variable":"Forecast horisont","value":"Outperformance"},inplace=True)


In [None]:
plot_rolling_long

In [None]:
# Outperformance per year, one line per forecast horizon.
sns.lineplot(
    data=plot_rolling_long,
    x="year",
    y="Outperformance",
    hue="Forecast horisont",
)
plt.xlabel("År")

# Overwrite the first three legend entries with math-style labels.
leg = plt.legend()
_legend_tekster = leg.get_texts()
for _position, _etiket in enumerate((r'$h=1$', r'$h=2$', r'$h=4$')):
    _legend_tekster[_position].set_text(_etiket)

for _filnavn in ("rolling_robusthed.pgf", "rolling_robusthed.pdf"):
    plt.savefig(_filnavn)

## Corona

In [None]:
# Load the stored performance results for the corona comparison.
corona_df = joblib.load("content/drive/MyDrive/Cand.fælles/Speciale/Python/robusthed/performance_corona.pkl")

In [None]:
# Turn the index levels into ordinary columns.
corona_df.reset_index(inplace=True)

In [None]:
corona= corona_df[corona_df["level_1"]=="Outperformance MAE(%)"]

In [None]:
corona

In [None]:
corona.rename(columns={0:"MAE outperformance","level_0":"Forecast horisont"},inplace=True)

In [None]:
corona.replace({"true": "Under corona","false": "Før corona"},inplace=True)

In [None]:
corona.rename(columns={"corona": "Corona"},inplace=True)

In [None]:
sns.barplot(x="Forecast horisont",y="MAE outperformance",data=corona,hue="Corona",orient="v")
plt.axhline(0,0,1,color="black",lw=0.35)
plt.ylim((-0.05,0.38))


In [None]:
plt.savefig("corona.pgf")
plt.savefig("corona.pdf")