In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import tensorflow as tf
import keras as keras
import optuna
from sklearn.model_selection import cross_val_score
plt.style.use('ggplot')


In [15]:
codes = {}
codes['US'] = 'USGG10YR'
codes['Germany'] = 'GDBR10'
codes['UK'] = 'GUKG10'
codes['France'] = 'GFRN10'
codes['Australia'] = 'GACGB10'
codes['Canada'] = 'GCAN10YR'
codes['New Zealand'] = 'GNZGB10'
codes['Japan'] = 'JGBS10'
codes['Switzerland'] = 'GSWISS10'
codes['Norway'] = 'GNOR10YR'
codes['Italy'] = 'GBTPGR10'

sheet_names = pd.ExcelFile('G10_RV.xlsx').sheet_names[:11]
dfs = {x: pd.read_excel('G10_RV.xlsx', sheet_name=x)[['Date', 'Last Price']].rename(columns={'Last Price': x}) for x in sheet_names}
df = pd.DataFrame({'Date': dfs[sheet_names[0]]['Date']})  

for key in dfs:
    df = pd.merge(df, dfs[key], on='Date', how='outer')

df = df.set_index('Date').resample('D').asfreq().ffill().dropna()
df.iloc[[0,-1],:]

Unnamed: 0_level_0,USGG10YR,GDBR10,GUKG10,GFRN10,GACGB10,GCAN10YR,GNZGB10,JGBS10,GSWISS10,GNOR10YR,GBTPGR10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-05,3.7608,3.373,4.016,3.592,5.621,3.565,5.843,1.329,2.009,4.147,4.101
2024-04-12,4.5216,2.359,4.137,2.865,4.266,3.649,4.839,0.864,0.739,3.707,3.762


In [16]:
training_X = df.copy()
training_y = df.copy()
testing_X = df.copy()
testing_y = df.copy()

def predictor(method, target, t):
    data = df.copy()
    target = codes[target]
    target_t = f'{target}_{t}'
    ts = [1,5,10,25,50,100]

    for x in data:
        for z in ts:
            data[f'{x}_{z}'] = df[x].diff(z)

    data = data.dropna()
    cutoff = '2020-1-1'
    training = data[data.index < cutoff]
    testing = data[data.index > cutoff]
    training_X = training[[x for x in training if '_' in x and x != target_t]]
    training_y = training[target_t]
    testing_X = testing[[x for x in testing if '_' in x and x != target_t]]
    testing_y = testing[target_t]
    # scaler = StandardScaler()
    # training_X = scaler.fit_transform(training_X)
    # testing_X = scaler.transform(testing_X)
    # model = tf.keras.models.Sequential([
    #         tf.keras.layers.Dense(64, activation='relu', input_shape=[training_X.shape[1]]),
    #         tf.keras.layers.Dropout(0.3),
    #         tf.keras.layers.Dense(32, activation='relu'),
    #         tf.keras.layers.Dropout(0.3),
    #         tf.keras.layers.Dense(1)
    # ])
    # model.compile(optimizer='adam', loss='mean_squared_error')
    # early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
    rfr = RandomForestRegressor(random_state=34)
    rfr.fit(training_X, training_y)
    training_prediction = rfr.predict(training_X)
    testing_prediction = rfr.predict(testing_X)
    mae = mean_absolute_error(testing_y, testing_prediction)
    mse = mean_squared_error(testing_y, testing_prediction)
    training_accuracy = round(r2_score(training_y, training_prediction), 2)
    testing_accuracy = round(r2_score(testing_y, testing_prediction), 2)
    prediction = pd.DataFrame(testing[target].copy())
    prediction['c_prediction'] = testing_prediction
    prediction['prediction'] = prediction[target].shift(t) + prediction['c_prediction']
    prediction = prediction[[target, 'prediction']].dropna()
    return prediction, training_accuracy, testing_accuracy, mae, mse

def performance(t):
    performances = pd.DataFrame()
    performances['Country'] = codes.keys()
    results = performances['Country'].apply(lambda x: predictor('neural', x, t))  # get all results
    performances['r2: Training'] = results.apply(lambda x: x[1])  # training_accuracy
    performances['r2: Testing'] = results.apply(lambda x: x[2])  # testing_accuracy
    performances['MAE'] = results.apply(lambda x: x[3])  # mean_absolute_error
    performances['MSE'] = results.apply(lambda x: x[4])  # mean_square_error
    performances = performances.set_index('Country')
    nnte_mean = round(performances['r2: Training'].mean(), 2)
    nntr_mean = round(performances['r2: Testing'].mean(), 2)
    mae_mean = round(performances['MAE'].mean(), 3)
    mse_mean = round(performances['MSE'].mean(), 3)
    performances.loc['Mean'] = {'r2: Training': nnte_mean, 'r2: Testing': nntr_mean, 'MAE': mae_mean, 'MSE': mse_mean}
    return performances

performance(10)

Unnamed: 0_level_0,r2: Training,r2: Testing,MAE,MSE
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
US,0.99,0.8,0.050682,0.004994
Germany,0.99,0.88,0.033285,0.002568
UK,0.98,0.68,0.06275,0.01091
France,0.98,0.88,0.030771,0.002765
Australia,0.98,0.76,0.060806,0.006913
Canada,0.98,0.8,0.05049,0.005132
New Zealand,0.98,0.73,0.066748,0.008112
Japan,0.97,0.61,0.018565,0.000772
Switzerland,0.98,0.71,0.044952,0.004762
Norway,0.97,0.72,0.056568,0.00588


In [17]:
def objective(trial, training_X, training_y):
    n_estimators = trial.suggest_int('n_estimators', 2, 150)
    max_depth = trial.suggest_int('max_depth', 1, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf =  trial.suggest_int('min_samples_leaf', 1, 20)

    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

    score = cross_val_score(model, training_X, training_y, cv=5, scoring='neg_mean_squared',n_jobs=-1).mean()

    return score

In [18]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())

[I 2024-04-24 19:43:53,036] A new study created in memory with name: no-name-3c162625-1120-4c77-abc9-106eda4ae145


In [19]:
study.optimize(lambda trial: objective(trial, training_X, training_y), n_trials=200)

[W 2024-04-24 19:43:53,041] Trial 0 failed with parameters: {'n_estimators': 76, 'max_depth': 17, 'min_samples_split': 14, 'min_samples_leaf': 3} because of the following error: InvalidParameterError("The 'scoring' parameter of cross_val_score must be a str among {'precision_weighted', 'explained_variance', 'neg_log_loss', 'f1_macro', 'r2', 'top_k_accuracy', 'v_measure_score', 'max_error', 'roc_auc_ovr_weighted', 'neg_mean_squared_error', 'neg_brier_score', 'positive_likelihood_ratio', 'adjusted_mutual_info_score', 'neg_mean_absolute_error', 'normalized_mutual_info_score', 'f1_weighted', 'average_precision', 'jaccard_samples', 'jaccard_micro', 'precision_micro', 'precision_samples', 'recall_samples', 'rand_score', 'neg_mean_gamma_deviance', 'neg_root_mean_squared_error', 'adjusted_rand_score', 'jaccard_weighted', 'fowlkes_mallows_score', 'roc_auc_ovr', 'neg_mean_squared_log_error', 'roc_auc_ovo', 'neg_median_absolute_error', 'balanced_accuracy', 'precision', 'jaccard_macro', 'mutual_in

InvalidParameterError: The 'scoring' parameter of cross_val_score must be a str among {'precision_weighted', 'explained_variance', 'neg_log_loss', 'f1_macro', 'r2', 'top_k_accuracy', 'v_measure_score', 'max_error', 'roc_auc_ovr_weighted', 'neg_mean_squared_error', 'neg_brier_score', 'positive_likelihood_ratio', 'adjusted_mutual_info_score', 'neg_mean_absolute_error', 'normalized_mutual_info_score', 'f1_weighted', 'average_precision', 'jaccard_samples', 'jaccard_micro', 'precision_micro', 'precision_samples', 'recall_samples', 'rand_score', 'neg_mean_gamma_deviance', 'neg_root_mean_squared_error', 'adjusted_rand_score', 'jaccard_weighted', 'fowlkes_mallows_score', 'roc_auc_ovr', 'neg_mean_squared_log_error', 'roc_auc_ovo', 'neg_median_absolute_error', 'balanced_accuracy', 'precision', 'jaccard_macro', 'mutual_info_score', 'neg_root_mean_squared_log_error', 'roc_auc', 'completeness_score', 'recall_micro', 'recall', 'neg_mean_poisson_deviance', 'precision_macro', 'homogeneity_score', 'neg_mean_absolute_percentage_error', 'f1', 'f1_micro', 'jaccard', 'recall_macro', 'roc_auc_ovo_weighted', 'matthews_corrcoef', 'neg_negative_likelihood_ratio', 'f1_samples', 'accuracy', 'recall_weighted'}, a callable or None. Got 'neg_mean_squared' instead.