In [1]:
### Import libraries ----
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
### Generate data ----
#!Python 1_Get_UCDP.py

In [3]:
### Load data ----
df_ucdp = pd.read_csv('ucdp_month.csv')
df_ucdp = df_ucdp.set_index("date")
df_ucdp

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-01,1298.788462,0,0.0,300.273810,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,3.0,0,0.0,0.000000,0.000000,0.0,0.0
1989-02,198.715385,0,0.0,427.809524,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,5.0,0,0.0,21.897727,0.000000,0.0,0.0
1989-03,2211.050962,0,0.0,415.261905,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,5.0,0,0.0,22.505682,0.000000,0.0,10.5
1989-04,510.730769,0,0.0,177.857143,0,0.0,0,0,0.000000,0,...,97.333333,0.000000,0,0.0,0,0.0,0.000000,0.000000,0.0,0.0
1989-05,821.833544,0,0.0,278.592437,0,0.0,0,0,0.000000,0,...,593.126667,0.000000,0,1.0,0,0.0,0.000000,0.000000,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09,813.026316,0,0.0,0.000000,0,1.0,0,0,0.857143,0,...,0.000000,15.082873,0,0.0,0,0.0,1.000000,3212.639990,0.0,0.0
2021-10,243.317982,0,0.0,0.000000,0,0.0,0,0,1.000000,0,...,0.000000,15.310635,0,0.0,0,0.0,0.000000,5431.433057,0.0,0.0
2021-11,80.500000,0,0.0,1.000000,0,1.0,0,0,14.000000,0,...,0.000000,13.151838,0,0.0,0,0.0,0.000000,5790.900966,0.0,0.0
2021-12,28.000000,0,0.0,0.000000,0,1.0,0,0,4.000000,0,...,0.000000,9.613444,0,0.0,0,0.0,0.000000,2273.000000,0.0,0.0


In [4]:
### Prepare data -----

# Use 10 months sequences
number_s=11          

# Normalization
scaler = MinMaxScaler(feature_range=(0,1))
df = pd.DataFrame(scaler.fit_transform(df_ucdp))

# Create time sequences
ts_seq=[]
for col in range(len(df.columns)):
    for i in range(number_s-1,len(df)):
        ts_seq.append(df.iloc[i-number_s+1:i+1,col])
        
# Create training, validation and test partitions with 80-20-20 split
ts_seq=np.array(ts_seq)
ts_seq_l= ts_seq.reshape(len(df.columns),len(df.index)-number_s+1,number_s)

ts_seq_learn=ts_seq_l[:,:int(0.6*len(df)),:]
ts_seq_learn=ts_seq_learn.reshape(ts_seq_learn.shape[0]*ts_seq_learn.shape[1],number_s)

ts_seq_val=ts_seq_l[:,int(0.6*len(df)):int(0.8*len(df)),:]
ts_seq_val=ts_seq_val.reshape(ts_seq_val.shape[0]*ts_seq_val.shape[1],number_s)

ts_seq_test=ts_seq_l[:,int(0.8*len(df)):,:]
ts_seq_test=ts_seq_test.reshape(ts_seq_test.shape[0]*ts_seq_test.shape[1],number_s)

# Obtain input and output training, validation and test partitions
train_x = ts_seq_learn[:,:-1]
train_y = ts_seq_learn[:,-1]
val_x = ts_seq_val[:,:-1]
val_y = ts_seq_val[:,-1]
test_x = ts_seq_test[:,:-1]
test_y = ts_seq_test[:,-1]

In [5]:
### Hyperparameter tuning using validation data -----

# Empty df
tuning = pd.DataFrame(columns=["n_trees", "min_split", "mse", "wmse"])

# Loop through different values of n_trees and min_samples_split
for n in [50, 100, 200, 500, 1000]:
    print(n)
    for min_split in [20, 30, 40, 50]:
        print(min_split)
        
        # fit model, make predictions and calculate mse, wmse
        model = RandomForestRegressor(n_estimators=n, 
                                      min_samples_split=min_split, 
                                      random_state=1).fit(train_x, train_y)
        pred = model.predict(val_x)
        mse = mean_squared_error(val_y, pred)
        wmse =  mean_squared_error(val_y,pred,
                                   sample_weight=val_y+1)
    
        # Save results
        para = pd.DataFrame({"n_trees": n,
                             "min_split": min_split,
                             "mse": mse, 
                             "wmse": wmse},
                             index=[n])
        tuning = tuning.append(para)
        tuning.reset_index(drop=True, inplace=True)

tuning.to_latex(f'tuning_random_forest.tex', index=False)
tuning

50
20
30
40
50
100
20
30
40
50
200
20
30
40
50
500
20
30
40
50
1000
20
30
40
50


Unnamed: 0,n_trees,min_split,mse,wmse
0,50,20,0.003651,0.005737
1,50,30,0.003584,0.005664
2,50,40,0.00356,0.005647
3,50,50,0.003556,0.005656
4,100,20,0.003613,0.005687
5,100,30,0.003554,0.005625
6,100,40,0.003538,0.005616
7,100,50,0.003542,0.005641
8,200,20,0.003585,0.005649
9,200,30,0.003541,0.00561


In [6]:
### Sort by wmse ----
tuning.sort_values(by="wmse")

Unnamed: 0,n_trees,min_split,mse,wmse
18,1000,40,0.00352,0.005601
14,500,40,0.003522,0.005602
17,1000,30,0.003534,0.005602
13,500,30,0.003536,0.005603
10,200,40,0.00353,0.005609
9,200,30,0.003541,0.00561
6,100,40,0.003538,0.005616
19,1000,50,0.003526,0.005624
5,100,30,0.003554,0.005625
15,500,50,0.00353,0.005628


In [7]:
### Make prediction in test data for final model -----
model = RandomForestRegressor(n_estimators=50, 
                              min_samples_split=50, 
                              random_state=1).fit(train_x, 
                                                  train_y)
pred = model.predict(test_x)

# mse
mse = mean_squared_error(test_y,
                         pred)
print(mse)

# wmse
wmse =  mean_squared_error(test_y,
                           pred,
                           sample_weight=test_y+1)
print(wmse)

0.004485809630112067
0.007135581135980753


In [8]:
### Convert predictions back to original dataframe format ----
df_rf = pred.reshape((len(df.iloc[0,:]),int(len(ts_seq_test[:,:-1])/len(df.iloc[0,:]))))
df_rf = df_rf.T
df_nn = pd.DataFrame(df_rf)
df_rf = pd.DataFrame(scaler.inverse_transform(df_rf))
df_rf.columns = df_ucdp.columns
df_rf.index=df_ucdp.index[-70:]

# Save
df_rf.to_csv('preds_rf.csv', index=False)
df_rf.head(10)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04,1986.933332,0.012335,13.009005,2.810563,0.176805,0.8432,0.004112,0.008223,9.290326,0.016447,...,19.248918,47.04441,0.002056,0.106905,5.976403,0.712016,7.678064,593.185744,0.041117,0.328939
2016-05,2199.136478,0.012335,39.257007,3.043565,0.176805,0.548783,0.004112,0.008223,336.405198,0.016447,...,12.936093,39.702841,0.002056,0.106905,43.250889,0.712016,8.176426,649.584435,0.041117,0.328939
2016-06,2737.535787,0.012335,23.922124,4.753215,0.176805,0.11006,0.004112,0.008223,35.945428,0.016447,...,3.805657,53.417071,0.002056,0.106905,12.34719,0.712016,6.839831,507.49584,0.041117,0.328939
2016-07,2290.122956,0.012335,34.790446,8.272386,0.176805,0.048834,0.004112,0.008223,16.264272,0.016447,...,3.557535,54.448123,0.002056,0.106905,1.945251,0.712016,8.177032,477.142199,0.041117,0.328939
2016-08,2309.804356,0.012335,25.470952,6.844361,0.176805,0.219978,0.004112,0.008223,152.394208,0.016447,...,5.02148,68.584459,0.002056,0.106905,11.175849,0.712016,4.790108,507.146739,0.041117,0.328939
2016-09,2486.197169,0.012335,8.4102,4.113017,0.176805,0.219978,0.004112,0.008223,30.875814,0.016447,...,2.668858,51.664038,0.002056,0.106905,1.245878,0.712016,6.288933,403.544258,0.041117,0.328939
2016-10,2319.506711,0.012335,7.838959,18.559919,0.176805,0.219978,0.004112,0.008223,348.572151,0.016447,...,4.438735,62.604562,0.002056,0.106905,1.473821,0.712016,8.923621,386.251794,0.041117,0.328939
2016-11,2549.238978,0.012335,14.167575,9.621388,0.176805,0.219978,0.004112,0.008223,4.965392,0.016447,...,11.017777,43.973933,0.002056,0.106905,6.138827,0.712016,6.192692,463.174169,0.041117,0.328939
2016-12,2014.343875,0.012335,9.037607,6.811235,0.176805,0.219978,0.004112,0.008223,23.416833,0.016447,...,2.878217,41.2636,0.002056,0.106905,6.138827,0.712016,6.895843,564.751598,0.041117,0.328939
2017-01,1758.551027,0.012335,14.300991,5.174069,0.176805,1.190455,0.004112,0.008223,5.870205,0.016447,...,0.670352,29.659795,0.002056,0.106905,6.138827,0.712016,3.973674,386.310679,0.041117,0.328939


In [18]:
### Get feature importance scores ---
feat_imp = pd.DataFrame(model.feature_importances_)
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

# Create df
feat_imp["X"] = "1"
feat_imp["X"].iloc[1] = "2"
feat_imp["X"].iloc[2] = "3"
feat_imp["X"].iloc[3] = "4"
feat_imp["X"].iloc[3] = "4"
feat_imp["X"].iloc[4] = "5"
feat_imp["X"].iloc[5] = "6"
feat_imp["X"].iloc[6] = "7"
feat_imp["X"].iloc[7] = "8"
feat_imp["X"].iloc[8] = "9"
feat_imp["X"].iloc[9] = "10"
feat_imp = feat_imp[["X", 0]]

# Rename column
# Source: https://www.statology.org/pandas-rename-columns/
feat_imp.rename(columns = {0: 'feat_imp'}, inplace = True)

# Sort
feat_imp = feat_imp.sort_values(by='feat_imp', ascending=False)

# Save
feat_imp.T.to_latex("feat_imp.tex", index=False)
feat_imp.T

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,9,8,7,5,4,1,6,2,3,0
X,10.0,9.0,8.0,6.0,5.0,2.0,7.0,3.0,4.0,1.0
feat_imp,0.622691,0.09436,0.066672,0.036868,0.03367,0.031976,0.030782,0.030446,0.029348,0.023187
