In [1]:
### Import libraries ----
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


In [2]:
### Generate data ----
#!Python 1_Get_UCDP.py

In [3]:
### Load data ----
df_ucdp = pd.read_csv('ucdp_month.csv')
df_ucdp = df_ucdp.set_index("date")
df_ucdp

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-01,1298.788462,0,0.0,300.273810,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,3.0,0,0.0,0.000000,0.000000,0.0,0.0
1989-02,198.715385,0,0.0,427.809524,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,5.0,0,0.0,21.897727,0.000000,0.0,0.0
1989-03,2211.050962,0,0.0,415.261905,0,0.0,0,0,0.000000,0,...,0.000000,0.000000,0,5.0,0,0.0,22.505682,0.000000,0.0,10.5
1989-04,510.730769,0,0.0,177.857143,0,0.0,0,0,0.000000,0,...,97.333333,0.000000,0,0.0,0,0.0,0.000000,0.000000,0.0,0.0
1989-05,821.833544,0,0.0,278.592437,0,0.0,0,0,0.000000,0,...,593.126667,0.000000,0,1.0,0,0.0,0.000000,0.000000,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09,813.026316,0,0.0,0.000000,0,1.0,0,0,0.857143,0,...,0.000000,15.082873,0,0.0,0,0.0,1.000000,3212.639990,0.0,0.0
2021-10,243.317982,0,0.0,0.000000,0,0.0,0,0,1.000000,0,...,0.000000,15.310635,0,0.0,0,0.0,0.000000,5431.433057,0.0,0.0
2021-11,80.500000,0,0.0,1.000000,0,1.0,0,0,14.000000,0,...,0.000000,13.151838,0,0.0,0,0.0,0.000000,5790.900966,0.0,0.0
2021-12,28.000000,0,0.0,0.000000,0,1.0,0,0,4.000000,0,...,0.000000,9.613444,0,0.0,0,0.0,0.000000,2273.000000,0.0,0.0


In [4]:
### Prepare data -----
number_s=11          # 10 months sequences

# Normalization
scaler = MinMaxScaler(feature_range=(0,1))
df = pd.DataFrame(scaler.fit_transform(df_ucdp))

In [5]:
# Creation of the sequences
ts_seq=[]
for col in range(len(df.columns)):
    for i in range(number_s-1,len(df)):
        ts_seq.append(df.iloc[i-number_s+1:i+1,col])

In [6]:
# Creation of trainset, validation and testet
ts_seq=np.array(ts_seq)
ts_seq_l= ts_seq.reshape(len(df.columns),len(df.index)-number_s+1,number_s)

ts_seq_learn=ts_seq_l[:,:int(0.6*len(df)),:]
ts_seq_learn=ts_seq_learn.reshape(ts_seq_learn.shape[0]*ts_seq_learn.shape[1],number_s)

ts_seq_val=ts_seq_l[:,int(0.6*len(df)):int(0.8*len(df)),:]
ts_seq_val=ts_seq_val.reshape(ts_seq_val.shape[0]*ts_seq_val.shape[1],number_s)

ts_seq_test=ts_seq_l[:,int(0.8*len(df)):,:]
ts_seq_test=ts_seq_test.reshape(ts_seq_test.shape[0]*ts_seq_test.shape[1],number_s)

In [7]:
# Creation of input/output for each set
train_x = ts_seq_learn[:,:-1]
train_y = ts_seq_learn[:,-1]
val_x = ts_seq_val[:,:-1]
val_y = ts_seq_val[:,-1]
test_x = ts_seq_test[:,:-1]
test_y = ts_seq_test[:,-1]

In [8]:
# Hyperparameter tuning -----

tuning = pd.DataFrame(columns=["n_trees", "min_split", "mse", "wmse"])

for n in [50, 100, 200, 500]:
    print(n)
    for min_split in [20, 30, 40]:
        print(min_split)
        model = RandomForestRegressor(n_estimators=n, min_samples_split=min_split).fit(train_x, train_y)
        pred = model.predict(val_x)
        mse = mean_squared_error(val_y, pred)
        wmse =  mean_squared_error(val_y,pred,sample_weight=val_y+1)
    
        # Save results
        para = pd.DataFrame({"n_trees": n,
                             "min_split": min_split,
                             "mse": mse, 
                             "wmse": wmse},
                             index=[n])
        tuning = tuning.append(para)
        tuning.reset_index(drop=True, inplace=True)

tuning.to_latex(f'tuning_random_forest.tex', index=False)
tuning

50
20
30
40
100
20
30
40
200
20
30
40
500
20
30
40


Unnamed: 0,n_trees,min_split,mse,wmse
0,50,20,0.003605,0.005674
1,50,30,0.003582,0.005667
2,50,40,0.003503,0.005567
3,100,20,0.003642,0.005728
4,100,30,0.003557,0.005632
5,100,40,0.003509,0.005585
6,200,20,0.003584,0.005652
7,200,30,0.003553,0.005621
8,200,40,0.003541,0.005632
9,500,20,0.003585,0.005653


In [9]:
### Sort ----
tuning.sort_values(by="wmse")

Unnamed: 0,n_trees,min_split,mse,wmse
2,50,40,0.003503,0.005567
5,100,40,0.003509,0.005585
10,500,30,0.003546,0.005617
7,200,30,0.003553,0.005621
11,500,40,0.003534,0.005622
4,100,30,0.003557,0.005632
8,200,40,0.003541,0.005632
6,200,20,0.003584,0.005652
9,500,20,0.003585,0.005653
1,50,30,0.003582,0.005667


In [10]:
### Final model ----
model = RandomForestRegressor(n_estimators=500, min_samples_split=20).fit(train_x, train_y)
pred = model.predict(test_x)
mse = mean_squared_error(test_y,pred)
print(mse)
weighted_mse =  mean_squared_error(test_y,pred,sample_weight=test_y+1)
print(weighted_mse)

0.004503703314879411
0.007137012913948011


In [11]:
### Feature importance 
model.feature_importances_

array([0.03360047, 0.03517086, 0.03970189, 0.03879465, 0.04149096,
       0.04390485, 0.03940656, 0.06782452, 0.10897801, 0.55112724])

In [12]:
### Convert back to original dataframe ----
df_rf = pred.reshape((len(df.iloc[0,:]),int(len(ts_seq_test[:,:-1])/len(df.iloc[0,:]))))
df_rf = df_rf.T
df_nn = pd.DataFrame(df_rf)
df_rf = pd.DataFrame(scaler.inverse_transform(df_rf))
df_rf.columns = df_rf.columns
df_rf.index=df_ucdp.index[-70:]
df_rf.to_csv('preds_rf.csv', index=False)
df_rf.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04,2189.938643,0.012468,26.964733,10.058388,0.178707,0.454071,0.004156,0.008312,8.681868,0.016624,...,25.039257,49.407544,0.002078,0.108055,1.051222,0.719677,11.738482,517.571341,0.04156,0.332478
2016-05,2258.209115,0.012468,31.704129,4.594114,0.178707,0.718201,0.004156,0.008312,156.102851,0.016624,...,10.77716,54.063195,0.002078,0.108055,32.763832,0.719677,8.596083,710.113365,0.04156,0.332478
2016-06,2591.377681,0.012468,22.856854,4.609231,0.178707,0.005288,0.004156,0.008312,32.833449,0.016624,...,4.495262,60.315563,0.002078,0.108055,12.714142,0.719677,8.648112,453.252794,0.04156,0.332478
2016-07,2212.605114,0.012468,29.061568,8.320847,0.178707,0.019364,0.004156,0.008312,16.627253,0.016624,...,5.985324,56.780034,0.002078,0.108055,2.526046,0.719677,7.110197,539.923911,0.04156,0.332478
2016-08,2233.547525,0.012468,12.679456,10.65605,0.178707,0.222345,0.004156,0.008312,73.110143,0.016624,...,5.930823,62.502375,0.002078,0.108055,6.487694,0.719677,5.040419,493.620201,0.04156,0.332478
2016-09,2413.150222,0.012468,14.997183,6.039785,0.178707,0.222345,0.004156,0.008312,30.962039,0.016624,...,2.781601,48.426351,0.002078,0.108055,1.884428,0.719677,8.134733,376.578797,0.04156,0.332478
2016-10,2398.404407,0.012468,10.3143,19.349351,0.178707,0.222345,0.004156,0.008312,573.828266,0.016624,...,3.097629,54.857916,0.002078,0.108055,4.613168,0.719677,8.404586,362.17543,0.04156,0.332478
2016-11,2352.37686,0.012468,14.326227,9.547748,0.178707,0.222345,0.004156,0.008312,11.029377,0.016624,...,6.390503,51.144561,0.002078,0.108055,6.204876,0.719677,7.805493,524.988131,0.04156,0.332478
2016-12,2135.243431,0.012468,12.421203,8.872331,0.178707,0.222345,0.004156,0.008312,27.654984,0.016624,...,0.959047,44.529329,0.002078,0.108055,6.204876,0.719677,7.063216,597.54174,0.04156,0.332478
2017-01,1782.35861,0.012468,28.84557,5.177369,0.178707,0.470528,0.004156,0.008312,6.957828,0.016624,...,0.21626,33.444298,0.002078,0.108055,6.204876,0.719677,5.553624,381.671513,0.04156,0.332478
