In [1]:
### Import libraries ----
import pandas as pd
import numpy as np
import sklearn.linear_model as linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor

In [2]:
### Generate data ----
# The shell command below is commented out, so running this cell does nothing.
# NOTE(review): presumably this script produces the 'ucdp_month.csv' file that
# the next cell loads -- confirm before relying on it to refresh the data.
#!Python 1_Get_UCDP.py

In [3]:
### Load data ----
# Read the monthly table and use its "date" column as the row index,
# leaving one column per country (see the preview below).
df_ucdp = pd.read_csv('ucdp_month.csv').set_index("date")

# Preview the first five months
df_ucdp.head()

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989-01,1298.788462,0,0.0,300.27381,0,0.0,0,0,0.0,0,...,0.0,0.0,0,3.0,0,0.0,0.0,0.0,0.0,0.0
1989-02,198.715385,0,0.0,427.809524,0,0.0,0,0,0.0,0,...,0.0,0.0,0,5.0,0,0.0,21.897727,0.0,0.0,0.0
1989-03,2211.050962,0,0.0,415.261905,0,0.0,0,0,0.0,0,...,0.0,0.0,0,5.0,0,0.0,22.505682,0.0,0.0,10.5
1989-04,510.730769,0,0.0,177.857143,0,0.0,0,0,0.0,0,...,97.333333,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
1989-05,821.833544,0,0.0,278.592437,0,0.0,0,0,0.0,0,...,593.126667,0.0,0,1.0,0,0.0,0.0,0.0,4.0,0.0


In [4]:
### Prepare data -----

# Window length: 10 months of inputs plus the 1 month to be predicted
number_s=11

# Normalization: rescale every column to the [0, 1] range
# NOTE(review): the scaler is fitted on all rows, i.e. including the months
# that later end up in the validation and test partitions -- confirm that this
# is intended, since it lets held-out data influence the scaling.
scaler = MinMaxScaler(feature_range=(0,1))
df = pd.DataFrame(scaler.fit_transform(df_ucdp))

# Create time sequences.
# Row k of `window_idx` holds the row positions k, k+1, ..., k+number_s-1, so a
# single fancy-indexing lookup yields every sliding window of every column at
# once: ts_seq_l has shape (n_columns, n_windows, number_s).
n_windows = len(df.index)-number_s+1
window_idx = np.arange(n_windows)[:, None] + np.arange(number_s)[None, :]
ts_seq_l = df.to_numpy().T[:, window_idx]

# Flat view of all windows, ordered column by column
ts_seq = ts_seq_l.reshape(ts_seq_l.shape[0]*ts_seq_l.shape[1], number_s)

# Create training, validation and test partitions with a chronological
# 60-20-20 split. The cut points are computed from the number of months
# (len(df)), not from the number of windows, so the test partition receives
# whatever windows remain after the 80% mark.
train_end = int(0.6*len(df))
val_end = int(0.8*len(df))

ts_seq_learn = ts_seq_l[:, :train_end, :].reshape(-1, number_s)
ts_seq_val = ts_seq_l[:, train_end:val_end, :].reshape(-1, number_s)
ts_seq_test = ts_seq_l[:, val_end:, :].reshape(-1, number_s)

# Obtain input and output training, validation and test partitions:
# the first number_s-1 values of a window are the inputs, the last is the target
train_x, train_y = ts_seq_learn[:, :-1], ts_seq_learn[:, -1]
val_x, val_y = ts_seq_val[:, :-1], ts_seq_val[:, -1]
test_x, test_y = ts_seq_test[:, :-1], ts_seq_test[:, -1]

In [5]:
### Check length of training, validation and test partitions ----
# One line per partition, in the order train / validation / test
for partition in (train_x, val_x, test_x):
    print(len(partition))

28560
9480
8400


In [6]:
### Hyperparameter tuning using validation data -----

# Candidate regularisation strengths, evaluated in this order
alphas = [0.0001, 0.0025, 0.0005, 0.00025, 0.01, 0.05, 0.1]

# One result record per alpha; the table is built once after the loop.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.)
records = []
for a in alphas:

    # Fit on the training partition, predict the validation partition
    model = linear_model.Lasso(max_iter=2000,
                               alpha=a).fit(train_x,
                                            train_y)
    pred = model.predict(val_x)

    # Plain MSE, and MSE weighted by (target + 1) so that observations with
    # larger target values count more
    mse = mean_squared_error(val_y,
                             pred)
    wmse =  mean_squared_error(val_y,
                               pred,
                               sample_weight=val_y+1)

    # Save results
    records.append({"alpha": a,
                    "mse": mse,
                    "wmse": wmse})

# Rows keep the order of `alphas` and get a fresh 0..n-1 index
tuning = pd.DataFrame(records, columns=["alpha", "mse", "wmse"])

tuning.to_latex('tuning_lasso.tex', index=False)
tuning

Unnamed: 0,alpha,mse,wmse
0,0.0001,0.003595,0.005758
1,0.0025,0.005524,0.008928
2,0.0005,0.003727,0.006017
3,0.00025,0.003635,0.005842
4,0.01,0.008102,0.012932
5,0.05,0.008102,0.012932
6,0.1,0.008102,0.012932


In [7]:
### Sort by wmse ----
# Ascending order, so the alpha with the lowest weighted MSE comes first;
# the sorted copy is only displayed, `tuning` itself is left unchanged
tuning.sort_values("wmse", ascending=True)

Unnamed: 0,alpha,mse,wmse
0,0.0001,0.003595,0.005758
3,0.00025,0.003635,0.005842
2,0.0005,0.003727,0.006017
1,0.0025,0.005524,0.008928
4,0.01,0.008102,0.012932
5,0.05,0.008102,0.012932
6,0.1,0.008102,0.012932


In [8]:
### Make prediction in test data for final model -----
# Refit the Lasso on the training partition with alpha=0.0001 and predict the
# held-out test partition
model = linear_model.Lasso(alpha=0.0001, max_iter=2000)
model = model.fit(train_x, train_y)
pred = model.predict(test_x)

# mse
mse = mean_squared_error(test_y, pred)
print(mse)

# wmse: each observation is weighted by (target + 1)
test_weights = test_y+1
weighted_mse = mean_squared_error(test_y, pred, sample_weight=test_weights)
print(weighted_mse)

0.00463286394463583
0.007351346420351095


In [9]:
### Convert predictions back to original dataframe format ----

# `pred` is flat and ordered column by column (all test months of the first
# column, then all of the second, ...), so reshape to (columns, months) and
# transpose to get months as rows again.
n_cols = len(df.columns)
pred_wide = pred.reshape(n_cols, -1).T

# Undo the min-max scaling and restore the original column labels
df_lasso = pd.DataFrame(scaler.inverse_transform(pred_wide))
df_lasso.columns = df_ucdp.columns

# The test windows are the last ones in time, so their targets line up with
# the last len(df_lasso) dates; derive that count instead of hard-coding it.
df_lasso.index = df_ucdp.index[len(df_ucdp)-len(df_lasso):]

# Save
# NOTE(review): index=False drops the dates from this file, whereas
# 'df_test.csv' is written with index=True -- confirm this is intended.
df_lasso.to_csv('preds_lasso.csv', index=False)
df_lasso.head(10)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04,1574.264077,0.028583,14.948756,23.576439,0.409684,0.66144,0.009528,0.019055,29.8219,0.03811,...,7.167087,43.75142,0.004764,0.247716,14.917745,1.64985,4.152088,501.264889,0.095275,0.762202
2016-05,1765.395282,0.028583,15.070348,23.633674,0.409684,0.705059,0.009528,0.019055,78.192917,0.03811,...,7.485765,42.922479,0.004764,0.247716,14.859943,1.64985,4.45265,418.972922,0.095275,0.762202
2016-06,1979.815441,0.028583,18.092478,23.706093,0.409684,0.688153,0.009528,0.019055,49.253568,0.03811,...,5.323857,44.596394,0.004764,0.247716,14.763914,1.64985,3.681524,395.008517,0.095275,0.762202
2016-07,1841.100558,0.028583,16.371913,24.95377,0.409684,0.683045,0.009528,0.019055,42.105266,0.03811,...,4.747997,33.982909,0.004764,0.247716,14.755607,1.64985,3.865895,414.799168,0.095275,0.762202
2016-08,1795.191446,0.028583,12.123848,25.475744,0.409684,0.509723,0.009528,0.019055,37.190407,0.03811,...,4.388088,40.861305,0.004764,0.247716,14.908273,1.64985,4.211999,366.846265,0.095275,0.762202
2016-09,1886.568309,0.028583,10.478419,24.954351,0.409684,0.509723,0.009528,0.019055,36.10842,0.03811,...,4.292528,34.814666,0.004764,0.247716,14.849102,1.64985,3.838196,370.925493,0.095275,0.762202
2016-10,1848.474208,0.028583,11.162154,25.910919,0.409684,0.509723,0.009528,0.019055,34.791714,0.03811,...,4.392653,29.580713,0.004764,0.247716,14.831225,1.64985,4.939546,312.566048,0.095275,0.762202
2016-11,2360.012929,0.028583,11.699525,24.968948,0.409684,0.509723,0.009528,0.019055,34.003964,0.03811,...,4.414153,39.629797,0.004764,0.247716,14.224596,1.64985,3.812212,357.354278,0.095275,0.762202
2016-12,1804.52921,0.028583,10.140361,25.324148,0.409684,0.509723,0.009528,0.019055,36.053899,0.03811,...,4.446328,31.103949,0.004764,0.247716,14.224596,1.64985,6.327627,435.131715,0.095275,0.762202
2017-01,1627.168205,0.028583,9.810094,24.798568,0.409684,1.716324,0.009528,0.019055,35.22882,0.03811,...,3.77528,26.375442,0.004764,0.247716,14.224596,1.64985,3.425361,352.246231,0.095275,0.762202


In [10]:
### Convert test data back to original dataframe format ----

# `test_y` is flat and ordered column by column (all test months of the first
# column, then all of the second, ...), so reshape to (columns, months) and
# transpose to get months as rows again.
n_cols = len(df.columns)
test_wide = test_y.reshape(n_cols, -1).T

# Undo the min-max scaling and restore the original column labels
df_test = pd.DataFrame(scaler.inverse_transform(test_wide))
df_test.columns = df_ucdp.columns

# The test windows are the last ones in time, so their targets line up with
# the last len(df_test) dates; derive that count instead of hard-coding it.
df_test.index = df_ucdp.index[len(df_ucdp)-len(df_test):]

# Save (dates are kept as the first column of the file)
df_test.to_csv('df_test.csv', index=True)
df_test.head(10)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States of America,Uzbekistan,Venezuela,Yemen (North Yemen),Zambia,Zimbabwe (Rhodesia)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04,2389.0,0.0,15.0,1.315068,0.0,0.0,0.0,0.0,161.75,0.0,...,7.752809,47.666667,0.0,0.0,0.0,0.0,5.0,296.476923,0.0,0.0
2016-05,2801.833333,0.0,24.0,1.358904,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,62.333333,0.0,0.0,0.0,0.0,2.0,359.974359,0.0,0.0
2016-06,2201.9,0.0,15.0,5.315068,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30.166667,0.0,0.0,0.0,0.0,4.0,500.166667,0.0,0.0
2016-07,2206.77381,0.0,0.0,5.358904,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,58.342857,0.0,0.0,0.0,0.0,5.0,367.5,0.0,0.0
2016-08,2728.502564,0.0,0.0,2.358904,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30.0,0.0,0.0,0.0,0.0,3.0,451.0,0.0,0.0
2016-09,2271.791667,0.0,5.0,5.815068,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,16.0,0.0,0.0,0.0,0.0,6.5,258.0,0.0,0.0
2016-10,3964.097009,0.0,6.0,1.358904,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54.5,0.0,0.0,0.0,0.0,2.0,486.0,0.0,0.0
2016-11,1382.9,0.0,0.0,3.315068,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,12.428571,0.0,0.0,0.0,0.0,12.0,690.534483,0.0,0.0
2016-12,1273.9,0.0,0.0,1.358904,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,0.0,0.0,0.0,0.0,272.0,0.0,0.0
2017-01,1897.4,0.0,2.0,0.766484,0.0,0.0,0.0,0.0,7.325758,0.0,...,0.0,52.007962,0.0,0.0,0.0,0.0,3.0,1083.166667,0.0,0.0


In [13]:
### Make prediction in test data for a simple baseline -----

# Baseline that ignores the inputs and always predicts the mean of train_y
# https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html
model = DummyRegressor(strategy="mean")
model = model.fit(train_x, train_y)

pred = model.predict(test_x)

# mse
mse = mean_squared_error(test_y, pred)
print(mse)

# wmse: each observation is weighted by (target + 1)
test_weights = test_y+1
weighted_mse = mean_squared_error(test_y, pred, sample_weight=test_weights)
print(weighted_mse)

0.010545126013465054
0.01670663728149058


In [15]:
# Display the current contents of `pred`. Among the cells shown here it was
# last assigned by the DummyRegressor baseline, whose predictions are constant.
pred

array([0.02505057, 0.02505057, 0.02505057, ..., 0.02505057, 0.02505057,
       0.02505057])