In [1]:
import pandas as pd
import numpy as np

In [2]:
CIR_white = pd.read_excel('CIR.xlsx', 'CIR_white')
CIR_white['date'] = pd.to_datetime(CIR_white['date'], dayfirst=True)
CIR_pink = pd.read_excel('CIR.xlsx', 'CIR_pink')
CIR_pink['date'] = pd.to_datetime(CIR_pink['date'], dayfirst=True)
CIR_red = pd.read_excel('CIR.xlsx', 'CIR_red')
CIR_red['date'] = pd.to_datetime(CIR_red['date'], dayfirst=True)

### Need to split all data into sequences. ###

In [3]:
def create_sequences(X, Y, train_batch=100):
    X_train, Y_train = [], []
    for i in range(train_batch, len(X)):
        X_train.append(X[(i-train_batch):(i-1)])
        Y_train.append(Y[i-1])
    return np.array(X_train), np.array(Y_train)

#### White noise ####

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

scaled = scaler.fit_transform(np.array(CIR_white['interest_rate']).reshape(-1, 1))
scaled = scaled.reshape(len(CIR_white['interest_rate']))

In [5]:
from sklearn.model_selection import train_test_split

X, Y = create_sequences(scaled, CIR_white['breakpoint'])

X_train, X_test = train_test_split(X, test_size=0.2, shuffle=False)
X_train, X_val = train_test_split(X_train, test_size=0.1, shuffle=False)

Y_train, Y_test = train_test_split(Y, test_size=0.2, shuffle=False)
Y_train, Y_val = train_test_split(Y_train, test_size=0.1, shuffle=False)

train_dates, test_dates = train_test_split(CIR_white['date'], test_size=0.2, shuffle=False)
train_dates, val_dates = train_test_split(train_dates, test_size=0.1, shuffle=False)

sample_weight = [1] * len(Y)
for i in range (len(Y)):
    if Y[i] == 1:
        sample_weight[i] *= 10

In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

max_depth_values = range(2, 5)
max_leaves_values = range(9)
learning_rate_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
best_logloss = np.inf
best_max_depth = 0
best_max_leaves = 0
best_learning_rate = 0
best_accuracy = 0
best_precision = 0
best_recall = 0

for max_depth in max_depth_values:
    for max_leaves in max_leaves_values:
        for learning_rate in learning_rate_values:
            model = XGBClassifier(n_estimators=100, max_depth=max_depth, max_leaves=max_leaves, learning_rate=learning_rate, objective='binary:logistic')
            model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
            val_pred = model.predict(X_val)
            val_logloss = log_loss(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])

            if val_logloss < best_logloss:
                best_logloss = val_logloss
                best_accuracy = accuracy_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_precision = precision_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_recall = recall_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_max_depth = max_depth
                best_learning_rate = learning_rate

print("Best logloss = %.3f" % best_logloss, sep="")
print("Best max depth = ", best_max_depth, sep="")
print("Best max leaves = ", best_max_leaves, sep="")
print("Best learning rate = ", best_learning_rate, sep="")
print("Best accuracy score = ", best_accuracy, sep="")
print("Best precision score = ", best_precision, sep="")
print("Best recall score = ", best_recall, sep="")

Best logloss = 9.046
Best max depth = 2
Best max leaves = 0
Best learning rate = 1e-05
Best accuracy score = 0.7380952380952381
Best precision score = 0.7692307692307693
Best recall score = 0.25


In [27]:
best_model = XGBClassifier(n_estimators=1000, max_depth=best_max_depth, max_leaves=best_max_leaves, learning_rate=best_learning_rate, objective='binary:logistic')
best_model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
test_pred = best_model.predict(X_test)
test_logloss = log_loss(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_accuracy = accuracy_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_precision = precision_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_recall = recall_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])

print("Test logloss = %.3f" % test_logloss, sep="")
print("Test accuracy score = ", test_accuracy, sep="")
print("Test precision score = ", test_precision, sep="")
print("Test recall score = ", test_recall, sep="")

Test logloss = 16.097
Test accuracy score = 0.5339506172839507
Test precision score = 0.297029702970297
Test recall score = 0.2727272727272727


#### Pink noise ####

In [36]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

scaled = scaler.fit_transform(np.array(CIR_pink['interest_rate']).reshape(-1, 1))
scaled = scaled.reshape(len(CIR_pink['interest_rate']))

In [40]:
from sklearn.model_selection import train_test_split

X, Y = create_sequences(scaled, CIR_pink['breakpoint'])

X_train, X_test = train_test_split(X, test_size=0.2, shuffle=False)
X_train, X_val = train_test_split(X_train, test_size=0.1, shuffle=False)

Y_train, Y_test = train_test_split(Y, test_size=0.2, shuffle=False)
Y_train, Y_val = train_test_split(Y_train, test_size=0.1, shuffle=False)

train_dates, test_dates = train_test_split(CIR_pink['date'], test_size=0.2, shuffle=False)
train_dates, val_dates = train_test_split(train_dates, test_size=0.1, shuffle=False)

sample_weight = [1] * len(Y)
for i in range (len(Y)):
    if Y[i] == 1:
        sample_weight[i] *= 30

In [41]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

max_depth_values = range(2, 5)
max_leaves_values = range(9)
learning_rate_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
best_logloss = np.inf
best_max_depth = 0
best_max_leaves = 0
best_learning_rate = 0
best_accuracy = 0
best_precision = 0
best_recall = 0

for max_depth in max_depth_values:
    for max_leaves in max_leaves_values:
        for learning_rate in learning_rate_values:
            model = XGBClassifier(n_estimators=100, max_depth=max_depth, max_leaves=max_leaves, learning_rate=learning_rate, objective='binary:logistic')
            model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
            val_pred = model.predict(X_val)
            val_logloss = log_loss(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])

            if val_logloss < best_logloss:
                best_logloss = val_logloss
                best_accuracy = accuracy_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_precision = precision_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_recall = recall_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_max_depth = max_depth
                best_learning_rate = learning_rate

print("Best logloss = %.3f" % best_logloss, sep="")
print("Best max depth = ", best_max_depth, sep="")
print("Best max leaves = ", best_max_leaves, sep="")
print("Best learning rate = ", best_learning_rate, sep="")
print("Best accuracy score = ", best_accuracy, sep="")
print("Best precision score = ", best_precision, sep="")
print("Best recall score = ", best_recall, sep="")

  _warn_prf(average, modifier, msg_start, len(result))


Best logloss = 8.707
Best max depth = 2
Best max leaves = 0
Best learning rate = 1e-05
Best accuracy score = 0.7478991596638656
Best precision score = 0.0
Best recall score = 0.0


In [42]:
best_model = XGBClassifier(n_estimators=1000, max_depth=best_max_depth, max_leaves=best_max_leaves, learning_rate=best_learning_rate, objective='binary:logistic')
best_model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
test_pred = best_model.predict(X_test)
test_logloss = log_loss(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_accuracy = accuracy_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_precision = precision_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_recall = recall_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])

print("Test logloss = %.3f" % test_logloss, sep="")
print("Test accuracy score = ", test_accuracy, sep="")
print("Test precision score = ", test_precision, sep="")
print("Test recall score = ", test_recall, sep="")

Test logloss = 16.947
Test accuracy score = 0.5093457943925234
Test precision score = 0.0
Test recall score = 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### Red noise ###

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

scaled = scaler.fit_transform(np.array(CIR_red['interest_rate']).reshape(-1, 1))
scaled = scaled.reshape(len(CIR_red['interest_rate']))

In [None]:
from sklearn.model_selection import train_test_split

X, Y = create_sequences(scaled, CIR_red['breakpoint'])

X_train, X_test = train_test_split(X, test_size=0.2, shuffle=False)
X_train, X_val = train_test_split(X_train, test_size=0.1, shuffle=False)

Y_train, Y_test = train_test_split(Y, test_size=0.2, shuffle=False)
Y_train, Y_val = train_test_split(Y_train, test_size=0.1, shuffle=False)

train_dates, test_dates = train_test_split(CIR_red['date'], test_size=0.2, shuffle=False)
train_dates, val_dates = train_test_split(train_dates, test_size=0.1, shuffle=False)

sample_weight = [1] * len(Y)
for i in range (len(Y)):
    if Y[i] == 1:
        sample_weight[i] *= 50

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

max_depth_values = range(2, 5)
max_leaves_values = range(9)
learning_rate_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
best_logloss = np.inf
best_max_depth = 0
best_max_leaves = 0
best_learning_rate = 0
best_accuracy = 0
best_precision = 0
best_recall = 0

for max_depth in max_depth_values:
    for max_leaves in max_leaves_values:
        for learning_rate in learning_rate_values:
            model = XGBClassifier(n_estimators=100, max_depth=max_depth, max_leaves=max_leaves, learning_rate=learning_rate, objective='binary:logistic')
            model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
            val_pred = model.predict(X_val)
            val_logloss = log_loss(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])

            if val_logloss < best_logloss:
                best_logloss = val_logloss
                best_accuracy = accuracy_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_precision = precision_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_recall = recall_score(Y_val, val_pred, sample_weight=sample_weight[len(Y_train):(len(Y_train)+len(Y_val))])
                best_max_depth = max_depth
                best_learning_rate = learning_rate

print("Best logloss = %.3f" % best_logloss, sep="")
print("Best max depth = ", best_max_depth, sep="")
print("Best max leaves = ", best_max_leaves, sep="")
print("Best learning rate = ", best_learning_rate, sep="")
print("Best accuracy score = ", best_accuracy, sep="")
print("Best precision score = ", best_precision, sep="")
print("Best recall score = ", best_recall, sep="")

In [None]:
best_model = XGBClassifier(n_estimators=1000, max_depth=best_max_depth, max_leaves=best_max_leaves, learning_rate=best_learning_rate, objective='binary:logistic')
best_model.fit(X_train, Y_train, sample_weight=sample_weight[:len(Y_train)])
test_pred = best_model.predict(X_test)
test_logloss = log_loss(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_accuracy = accuracy_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_precision = precision_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])
test_recall = recall_score(Y_test, test_pred, sample_weight=sample_weight[len(Y_train)+len(Y_val):])

print("Test logloss = %.3f" % test_logloss, sep="")
print("Test accuracy score = ", test_accuracy, sep="")
print("Test precision score = ", test_precision, sep="")
print("Test recall score = ", test_recall, sep="")