In [1]:
### Import libraries ----
import pandas as pd
import numpy as np
import sklearn.linear_model as linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
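### Generate data ----
# Optional step, left disabled: uncomment the line below to run the
# data-generation script (presumably it produces the CSV loaded in the
# next cell -- confirm).
#!Python 1_Get_UCDP.py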

In [3]:
### Load data ----
# Read the UCDP table from disk and use its "date" column as the row index.
df_ucdp = pd.read_csv('ucdp_month.csv').set_index("date")
df_ucdp.head(5)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-01,1298.788462,0,0.0,300.27381,0,0.0,0,0,0.0,0,...,0.0,0.0,0,3.0,0,0.0,0.0,0.0,0.0,0.0
1989-02,198.715385,0,0.0,427.809524,0,0.0,0,0,0.0,0,...,0.0,0.0,0,5.0,0,0.0,21.897727,0.0,0.0,0.0
1989-03,2211.050962,0,0.0,415.261905,0,0.0,0,0,0.0,0,...,0.0,0.0,0,5.0,0,0.0,22.505682,0.0,0.0,10.5
1989-04,510.730769,0,0.0,177.857143,0,0.0,0,0,0.0,0,...,97.333333,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
1989-05,821.833544,0,0.0,278.592437,0,0.0,0,0,0.0,0,...,593.126667,0.0,0,1.0,0,0.0,0.0,0.0,4.0,0.0


In [4]:
### Prepare data -----
# Window length: every sequence holds number_s consecutive rows. Later cells
# use the first number_s-1 values (10) as inputs and the last one as target.
number_s = 11

# Normalization: rescale every column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/validation/test split, so the min/max of later rows also shape the
# training data -- confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(df_ucdp)
df = pd.DataFrame(scaled_values)

In [5]:
# Creation of the sequences
# For each column in turn, slide a window of number_s consecutive rows over
# the scaled table, one row at a time. Windows are collected column by
# column, and in row order within a column.
ts_seq = [
    df.iloc[start:start + number_s, col_idx]
    for col_idx in range(len(df.columns))
    for start in range(len(df) - number_s + 1)
]

In [6]:
# Creation of the training, validation and test sets
# Arrange the windows as (column, window position, step inside the window).
ts_seq = np.array(ts_seq)
n_series = len(df.columns)
n_windows = len(df.index) - number_s + 1
ts_seq_l = ts_seq.reshape(n_series, n_windows, number_s)

# Cut points along the window axis: first 60% / next 20% / remainder.
# NOTE(review): the cut points are derived from the number of rows (len(df)),
# not from the number of windows (n_windows), so the effective proportions
# differ slightly from 60/20/20 -- confirm this is intended.
train_end = int(0.6 * len(df))
val_end = int(0.8 * len(df))

# Slice each subset along the window axis, then stack the windows of all
# columns into one 2-D array with one window per row.
ts_seq_learn = ts_seq_l[:, :train_end, :].reshape(-1, number_s)

ts_seq_val = ts_seq_l[:, train_end:val_end, :].reshape(-1, number_s)

ts_seq_test = ts_seq_l[:, val_end:, :].reshape(-1, number_s)

In [7]:
# Creation of input/output for each set
# In every window, all values but the last are the inputs (x) and the last
# value is the target (y).
train_x, train_y = ts_seq_learn[:, :-1], ts_seq_learn[:, -1]
val_x, val_y = ts_seq_val[:, :-1], ts_seq_val[:, -1]
test_x, test_y = ts_seq_test[:, :-1], ts_seq_test[:, -1]

In [8]:
# Hyperparameter tuning -----
# Fit one Lasso model per candidate value on the training set and score it
# on the validation set. The penalty handed to Lasso is 1/(2*a); the "alpha"
# column of the results table stores the candidate a itself, not 1/(2*a).

tuning_rows = []

for a in [0.1, 2, 3, 5, 10, 50, 200, 1000, 2000]:
    model = linear_model.Lasso(max_iter=2000,alpha=1/(2*a)).fit(train_x, train_y)
    pred = model.predict(val_x)
    mse = mean_squared_error(val_y, pred)
    # Weighted MSE: each sample is weighted by its (scaled) target value + 1,
    # so errors on larger targets count more.
    wmse = mean_squared_error(val_y, pred, sample_weight=val_y+1)

    # Save results
    tuning_rows.append({"alpha": a, "mse": mse, "wmse": wmse})

# Build the table once from the collected rows. DataFrame.append (used here
# previously) is deprecated since pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row is quadratic.
tuning = pd.DataFrame(tuning_rows, columns=["alpha", "mse", "wmse"])

tuning.to_latex('tuning_lasso.tex', index=False)
tuning

Unnamed: 0,alpha,mse,wmse
0,0.1,0.008102,0.012932
1,2.0,0.008102,0.012932
2,3.0,0.008102,0.012932
3,5.0,0.008102,0.012932
4,10.0,0.008102,0.012932
5,50.0,0.008102,0.012932
6,200.0,0.005524,0.008928
7,1000.0,0.003727,0.006017
8,2000.0,0.003635,0.005842


In [9]:
### Final model -----
# Refit Lasso on the training set with a fixed penalty and report the plain
# and the target-weighted MSE on the test set.
# NOTE(review): the penalty corresponds to the tuning candidate a=1000; the
# saved tuning output shows a slightly lower validation error for a=2000 --
# confirm the choice is deliberate.
final_alpha = 1/(2*1000)
model = linear_model.Lasso(alpha=final_alpha, max_iter=2000)
model.fit(train_x, train_y)
pred = model.predict(test_x)

mse = mean_squared_error(test_y, pred)
weighted_mse = mean_squared_error(test_y, pred, sample_weight=test_y+1)
print(mse)
print(weighted_mse)

0.010545126013465054
0.01670663728149058


In [10]:
### Convert back to original dataframe ----
# pred is a flat vector of test predictions. It is assumed to be ordered
# column by column (all test windows of the first column, then the second,
# ...), matching how ts_seq_test was flattened. Reshape it to
# (n_columns, n_test_rows) and transpose to get one row per date and one
# column per original column.
n_columns = len(df.columns)
pred_matrix = pred.reshape(n_columns, -1).T

# Undo the min-max scaling so the values are back in the original units.
df_lasso = pd.DataFrame(scaler.inverse_transform(pred_matrix),
                        columns=df_ucdp.columns)

# Label the rows with the last dates of the original table. The number of
# rows is taken from the predictions themselves rather than hard-coded (it
# was 70), so this keeps working if the data length or the split changes.
n_test_rows = len(df_lasso)
df_lasso.index = df_ucdp.index[len(df_ucdp) - n_test_rows:]

# NOTE(review): index=False means the date labels are not written to the
# CSV -- confirm downstream readers do not need them.
df_lasso.to_csv('preds_lasso.csv', index=False)
df_lasso.head(10)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-05,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-06,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-07,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-08,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-09,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-10,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-11,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2016-12,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
2017-01,249.299577,0.150303,30.56697,120.489954,2.154349,2.680411,0.050101,0.100202,144.76361,0.200405,...,16.208553,35.041541,0.025051,1.30263,74.800997,8.675847,3.331726,145.06536,0.501011,4.008091
