# Set Up

**Import libraries**

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

**Ignore warnings**

In [2]:
import warnings
warnings.filterwarnings("ignore")

**Load dataset**

In [3]:
JST_df = pd.read_excel("JSTdatasetR4.xlsx", sheet_name = "Data")

# Feature Engineering

**Precrisis indicator**

Crisis have to be predicted before they happen. If we focus on the years with crisis, model will fit the features when it's to late. In order to avoid this, is a good option to create a new target variable with positive value 2 years before a crisis.

In [4]:
def add_precrisis(df):
    df["precrisis"] = 0
    
    for i in range(1,df.shape[0]):
        if (df.loc[i, "crisisJST"] == 1):
            df.loc[(i-1), "precrisis"] = 1
            df.loc[(i-2), "precrisis"] = 1
    return df

JST_df = add_precrisis(JST_df)

**Features used in the model**

Obtain variables used in the model. This includes transformations and creation of new variables. Some variables are expressed as percentual growth since immediate value doesnt mean nothing, it's important to know how they change before a crisis. This way also scales a little bit the values between diferent countries.

In [5]:
def define_variables(df):
    # Slope of the yield curve
    df["slope"] = df["ltrate"] - df["stir"]
    
    # Global credit
    df["global_credit"] = 0
    for i in range(df.shape[0]):
        year_it = df.loc[(i), "year"] 
        df["global_credit"][i] = (df[df["year"] == year_it].tloans.sum() - df.loc[(i), "tloans"])/(df[df["year"] == year_it].tloans.shape[0] - 1)
    df["gr_global_credit"] = df["global_credit"].pct_change(periods = 2) / df["gdp"]
    
    # Log CPI
    df["log_cpi"] = np.log(df["cpi"])
    df["gr_log_cpi"] = df["log_cpi"].pct_change(periods = 2)
    
    # CA/GDP
    df["ca_gdp"] = df["ca"]/df["gdp"]
    df["gr_ca_gdp"] = df["ca_gdp"].pct_change(periods = 2)
    
    # Log RGDP
    df["log_real_gdp"] = np.log(df["rgdpmad"])
    df["gr_log_real_gdp"] = df["log_real_gdp"].pct_change(periods = 2)
    
    # Log money
    df["log_money"] = np.log(df["money"])
    df["gr_log_money"] = df["log_money"].pct_change(periods = 2)
    
    # Log Domestic Credit
    df["log_credit"] = np.log(df["tloans"])
    df["gr_log_credit"] = df["log_credit"].pct_change(periods = 2)
    
    # Inversion
    df["gr_inv"] = df["iy"].pct_change(periods = 2)
    
    # Acumulated Variance
    df["roll_credit"] = df.tloans.rolling(window=2 ,center=False).std()
    df["roll_money"] = df.money.rolling(window=2 ,center=False).std()
    df["roll_ltrate"] = df.ltrate.rolling(window=2 ,center=False).std()
    
    df["gr_gdp"] = df["gdp"].pct_change(periods = 2)
    
    return df

pre_df = define_variables(JST_df)

**Drop Bias PostCrisis**

Posterior period from a crisis is a recuperation time where economies are not healthy. Since they don't follow it's normal cycle, is better to drop the 5 years after a crisis and the year of the crisis itself. With this it's easier to avoid the bias produced and avoid confusing the model.

In [6]:
def drop_crisis_bias(df):
    df["drops"] = 0
    
    for i in range(1,(df.shape[0])):
        if (df.loc[i, "crisisJST"] == 1):
            df.loc[i, "drops"] = 1
            df.loc[(i+1), "drops"] = 1
            df.loc[(i+2), "drops"] = 1
            df.loc[(i+3), "drops"] = 1
            df.loc[(i+4), "drops"] = 1
            df.loc[(i+5), "drops"] = 1  
            
    df = df[df["drops"] == 0]
    df = df.drop(["drops"], axis=1)
    return df

pre_df = drop_crisis_bias(pre_df)

**Final features**

In [7]:
df = pre_df[["precrisis","slope","gr_global_credit","gr_log_cpi","ca_gdp","gr_log_real_gdp","gr_log_money",
            "gr_log_credit","gr_inv","year","country","stir","gr_ca_gdp","roll_credit","roll_money",
            "roll_ltrate","gr_gdp"]]

**Delete worst years**

Years of Great Depression and World Wars may be excluded from data since they are periods where economy doesnt follow a normal trend.

In [8]:
def drop_years(df):
    excluded_years = [1914,1915,1916,1917,1918,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945]
    df = df[~df["year"].isin(excluded_years)]
    return df

df = drop_years(df)

**Delete missing data**

In [9]:
df = df.replace(np.inf, np.NaN)
df.dropna(inplace=True)

**Simplification**

Simplify model to 6 nordic countries.

In [11]:
def select_country(df, paises):
    df = df[df["country"].isin(paises)]
    return df
df = select_country(df,["Norway","Switzerland","Finland","Germany","Denmark","Netherlands"])

Simplify to modern years (Post WWII).

In [12]:
df = df[df["year"] > 1949]

# Model implementatión

**Classification function**

In [13]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix,roc_auc_score


def clasificacion(classifier, X, y):
    y_train_pred = cross_val_predict(classifier, X, y ,cv=5)
    print("Matriz de confusion:")
    print(confusion_matrix(y, y_train_pred))
    print("AUC score:",roc_auc_score(y.values, y_train_pred))

Target and variables.

In [14]:
X = df[["slope","gr_global_credit","gr_log_cpi","ca_gdp","gr_log_real_gdp","gr_log_money",
            "gr_log_credit","gr_inv"]]

y = df["precrisis"]

**Logistic Regresion**

In [15]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42, penalty=None, loss="log")

clasificacion(sgd_clf, X, y)

Matriz de confusion:
[[317  17]
 [ 15   1]]
AUC score: 0.5058008982035929


**Random Forest**

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42, n_estimators=1000)

pred = clasificacion(rf_clf, X, y)

Matriz de confusion:
[[334   0]
 [ 14   2]]
AUC score: 0.5625


**Extremely randomised trees**

In [17]:
from sklearn.ensemble import ExtraTreesClassifier

etc_clf = ExtraTreesClassifier(n_estimators=1000, random_state=22)

clasificacion(etc_clf, X, y)

Matriz de confusion:
[[333   1]
 [ 16   0]]
AUC score: 0.49850299401197606


**Neural network**

In [18]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(max_iter=1000, solver="lbfgs", random_state=42)

clasificacion(mlp_clf,X,y)

Matriz de confusion:
[[314  20]
 [ 14   2]]
AUC score: 0.5325598802395209


**Extreme gradient boosting**

In [19]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(random_state=42,n_estimators = 1000, learning_rate=0.1,min_child_weight=5)

clasificacion(xgb_clf, X ,y)

Matriz de confusion:
[[332   2]
 [ 16   0]]
AUC score: 0.49700598802395207


# Optimal hyperparameters GridSearch

### Neural Network

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = {"solver":["lbfgs","adam"],"learning_rate":["constant","invscaling","adaptative"],
              "random_state":[42],"activation":["identity","logistic","tanh","relu"],
             "alpha":[0.00001,0.0005,]}

mlp_clf = MLPClassifier()

grid_search = GridSearchCV(mlp_clf, param_grid, scoring="roc_auc", cv=5, return_train_score = True)

grid_search.fit(X,y)

print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'activation': 'identity', 'alpha': 1e-05, 'learning_rate': 'constant', 'random_state': 42, 'solver': 'adam'}
Score: 0.7948778833107191


In [22]:
print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'activation': 'identity', 'alpha': 1e-05, 'learning_rate': 'constant', 'random_state': 42, 'solver': 'adam'}
Score: 0.7948778833107191


### Logitstic Regresion

In [23]:
param_grid = {"loss":["hinge","log","squared_loss","huber","epsilon_insensitive"],
             "penalty":["l2","l1","elasticnet"],"max_iter":[500,1000,1500],
             "learning_rate":["optimal","constant","adaptive","invscaling"],
             "eta0":[0.00001,0.0001,0.005,0.1]}

clf = SGDClassifier()

grid_search = GridSearchCV(clf, param_grid, scoring="roc_auc", cv=5, return_train_score = True)

grid_search.fit(X,y)

print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'eta0': 1e-05, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'elasticnet'}
Score: 0.8587290818634102


### Random Forest

In [24]:
param_grid = {"n_estimators":[50,100,300,500,1000,1500],"criterion":["gini","entropy"],
             "max_features":["auto","sqrt","log2"],"random_state":[42],
              "class_weight":["balanced","balanced_subsample",None]}

clf = RandomForestClassifier()

grid_search = GridSearchCV(clf, param_grid, scoring="roc_auc", cv=5, return_train_score = True)

grid_search.fit(X,y)

print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 50, 'random_state': 42}
Score: 0.8734678878335593


###  Extremely randomised trees

In [25]:
param_grid = {"n_estimators":[50,100,300,500,1000,1500],"criterion":["gini","entropy"],
             "max_features":["auto","sqrt","log2"],"random_state":[42],
              "class_weight":["balanced","balanced_subsample",None]}

clf = ExtraTreesClassifier()

grid_search = GridSearchCV(clf, param_grid, scoring="roc_auc", cv=5, return_train_score = True)

grid_search.fit(X,y)

print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'class_weight': None, 'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 300, 'random_state': 42}
Score: 0.8760119855269111


### XGB

In [26]:
param_grid = {"random_state":[42],"sampling_method":["uniform","gradient_based"],
             "tree_method":["auto","exact","approx","hist","gpu_hist"],
             "grow_policy":['depthwise', 'lossguide'],
             "predictor":["auto","cpu_predictor","gpu_predictor"]}

clf = XGBClassifier()

grid_search = GridSearchCV(clf, param_grid, scoring="roc_auc", cv=5, return_train_score = True)

grid_search.fit(X,y)

print("Best params:",grid_search.best_params_)
print("Score:",grid_search.best_score_)

Best params: {'grow_policy': 'depthwise', 'predictor': 'cpu_predictor', 'random_state': 42, 'sampling_method': 'uniform', 'tree_method': 'approx'}
Score: 0.8565355042966983
