In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, f1_score, mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv('data/flam dynamics.csv')

list_of_df = []
for i in range(len(df)):
    new_df = df.iloc[i,:].to_frame(name="flammability").reset_index()
    new_df['region'] = new_df.iloc[1,1]
    new_df['OSM_ID'] = new_df.iloc[0,1]
    new_df = new_df.iloc[3:,:]
    list_of_df.append(new_df)
    
output1 = pd.concat(list_of_df).reset_index()
output1 = output1.rename(columns={"index": "year"})
output1 = output1.drop(['level_0'], axis=1)

In [3]:
df = pd.read_csv('data/NDVI dynamics.csv')

list_of_df = []
for i in range(len(df)):
    new_df = df.iloc[i,:].to_frame(name="NDVI").reset_index()
    new_df['OSM_ID'] = new_df.iloc[0,1]
    new_df = new_df.iloc[1:,:]
    list_of_df.append(new_df)
    
output2 = pd.concat(list_of_df).reset_index()
output2 = output2.rename(columns={"index": "year"})
output2 = output2.drop(['level_0'], axis=1)

In [4]:
df = pd.read_csv('data/precip dynamics.csv')

list_of_df = []
for i in range(len(df)):
    new_df = df.iloc[i,:].to_frame(name="precipitation").reset_index()
    new_df['OSM_ID'] = new_df.iloc[0,1]
    new_df = new_df.iloc[1:,:]
    list_of_df.append(new_df)
    
output3 = pd.concat(list_of_df).reset_index()
output3 = output3.rename(columns={"index": "year"})
output3 = output3.drop(['level_0'], axis=1)

In [5]:
df = pd.read_csv('data/stock dynamics.csv')

list_of_df = []
for i in range(len(df)):
    new_df = df.iloc[i,:].to_frame(name="stock").reset_index()
    new_df['OSM_ID'] = new_df.iloc[0,1]
    new_df = new_df.iloc[1:,:]
    list_of_df.append(new_df)
    
output4 = pd.concat(list_of_df).reset_index()
output4 = output4.rename(columns={"index": "year"})
output4 = output4.drop(['level_0'], axis=1)

In [6]:
df = pd.read_csv('data/temper dynamics.csv')

list_of_df = []
for i in range(len(df)):
    new_df = df.iloc[i,:].to_frame(name="t").reset_index()
    new_df['OSM_ID'] = new_df.iloc[0,1]
    new_df = new_df.iloc[1:,:]
    list_of_df.append(new_df)
    
output5 = pd.concat(list_of_df).reset_index()
output5 = output5.rename(columns={"index": "year"})
output5 = output5.drop(['level_0'], axis=1)

In [7]:
df = output1.merge(output2, on=['OSM_ID','year'])
df = df.merge(output3, on=['OSM_ID','year'])
df = df.merge(output4, on=['OSM_ID','year'])
df = df.merge(output5, on=['OSM_ID','year'])

df.to_csv('out.csv', index=False)
df.head()

Unnamed: 0,year,flammability,region,OSM_ID,NDVI,precipitation,stock,t
0,2001,2862,Ахтубинский район,-1850408,3443553826,3053000031,,2286428595
1,2002,1594,Ахтубинский район,-1850408,3283658314,3279750061,,2246428585
2,2003,66,Ахтубинский район,-1850408,2986687553,3180750046,,2098928547
3,2004,252,Ахтубинский район,-1850408,3269350491,3197250061,,2293571472
4,2005,1146,Ахтубинский район,-1850408,3305540993,3122750092,,2280000067


In [8]:
df = df.fillna(-9999)
df = df.replace(to_replace=r',', value='.', regex=True)
df = df.astype({'flammability':'float64', 'stock':'float64','t':'float64', 'NDVI':'float64', 'precipitation':'float64'})

In [9]:
df['flammability'] =  df['flammability']
df['flammability'] = np.log1p(df['flammability'])

df['previous_precipitation'] = df['precipitation'].shift(+1)
df.loc[df['year'] == '2001', 'previous_precipitation'] = np.nan
df['previous_precipitation'] = df.groupby(['region']).previous_precipitation.transform(lambda x: x.fillna(x.mean()))

df['previous_NDVI'] = df['precipitation'].shift(+2)
df.loc[df['year'] == '2001', 'previous_NDVI'] = np.nan
df.loc[df['year'] == '2002', 'previous_NDVI'] = np.nan
df['previous_NDVI'] = df.groupby(['region']).previous_NDVI.transform(lambda x: x.fillna(x.mean()))

In [10]:
df = df.drop(['OSM_ID', 'year'], axis=1)
df.corr()

Unnamed: 0,flammability,NDVI,precipitation,stock,t,previous_precipitation,previous_NDVI
flammability,1.0,-0.126839,-0.016776,-0.071312,-0.160536,0.164865,0.065425
NDVI,-0.126839,1.0,0.524621,-0.167132,-0.360951,0.386499,0.327169
precipitation,-0.016776,0.524621,1.0,-0.230533,-0.741837,0.628083,0.622656
stock,-0.071312,-0.167132,-0.230533,1.0,0.257455,-0.231002,-0.234444
t,-0.160536,-0.360951,-0.741837,0.257455,1.0,-0.675664,-0.684048
previous_precipitation,0.164865,0.386499,0.628083,-0.231002,-0.675664,1.0,0.642569
previous_NDVI,0.065425,0.327169,0.622656,-0.234444,-0.684048,0.642569,1.0


In [11]:
enc = OneHotEncoder()
df = df.drop(['region'], axis = 1).join(enc.fit_transform(df[['region']], axis = 0))
df = df.drop(['precipitation'], axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['flammability'], axis=1), 
                                                    df['flammability'], test_size=0.20, 
                                                    random_state=1)

cv = 10
scalers = (RobustScaler(), StandardScaler(), MinMaxScaler())

regressors = (
    RidgeCV(cv=cv, alphas=[0.0001, 0.001,0.01, 0.1, 1]), 
    LassoCV(cv=cv, alphas=[0.0001, 0.001,0.01, 0.1, 1], n_alphas=[100, 200, 300, 400]), 
    ElasticNetCV(cv=cv, alphas=[0.0001, 0.001,0.01, 0.1, 1, 10], n_alphas=[100, 200, 300, 400])
)

for scaler in scalers:
    for regressor in regressors:
        
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        regressor.fit(X_train_scaled, y_train)
        y_pred = regressor.predict(X_test_scaled)
        print('R2: ' + str(regressor.score(X_test_scaled, y_test)))
        print('RMSE: ' + str(mean_squared_error(y_test, y_pred, squared=False)))
        print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
        print('Scaler: ' + scaler.__class__.__name__)
        print('Regressor\'s type:' + regressor.__class__.__name__)
        print('Best Alpha: ' + str(regressor.alpha_))
#         for col, coef in zip(df.columns, regressor.coef_):
#             print(col, coef)
        print('==' * 20)

R2: 0.45638297329576916
RMSE: 0.7635273862910139
MAE: 0.5719469809737439
Scaler: RobustScaler
Regressor's type:RidgeCV
Best Alpha: 0.1
R2: 0.4551778009381886
RMSE: 0.764373269164342
MAE: 0.572331234741802
Scaler: RobustScaler
Regressor's type:LassoCV
Best Alpha: 0.0001
R2: 0.45584185815837286
RMSE: 0.7639072984752331
MAE: 0.5720998477547296
Scaler: RobustScaler
Regressor's type:ElasticNetCV
Best Alpha: 0.0001
R2: 0.4583936274800742
RMSE: 0.7621140643882415
MAE: 0.570633061603417
Scaler: StandardScaler
Regressor's type:RidgeCV
Best Alpha: 1.0
R2: 0.45839139934599005
RMSE: 0.7621156320309637
MAE: 0.5706579959520373
Scaler: StandardScaler
Regressor's type:LassoCV
Best Alpha: 0.0001
R2: 0.45568308993226037
RMSE: 0.7640187324058587
MAE: 0.5719100740574349
Scaler: StandardScaler
Regressor's type:ElasticNetCV
Best Alpha: 0.001


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


R2: 0.45589277986389265
RMSE: 0.763871554847057
MAE: 0.5720250743109294
Scaler: MinMaxScaler
Regressor's type:RidgeCV
Best Alpha: 0.1
R2: 0.4545457787890764
RMSE: 0.7648164970768618
MAE: 0.572566153199987
Scaler: MinMaxScaler
Regressor's type:LassoCV
Best Alpha: 0.0001
R2: 0.45538592288645174
RMSE: 0.7642272600171761
MAE: 0.5722026869566043
Scaler: MinMaxScaler
Regressor's type:ElasticNetCV
Best Alpha: 0.0001


In [14]:
#Test Default CatBoost
model = CatBoostRegressor(logging_level='Silent')
model.fit(X_train, y_train)

pred = model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)

print("Testing default Catboost")
print('RMSE: {:.2f}'.format(rmse))
print('R2: {:.4f}'.format(r2))
print()
print(model.get_all_params())

Testing default Catboost
RMSE: 0.77
R2: 0.4482

{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': True, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'random_seed': 0, 'depth': 6, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'min_data_in_leaf': 1, 'loss_function': 'RMSE', 'learning_rate': 0.03851500153541565, 'score_function': 'Cosine', 'task_type': 'CPU', 'leaf_estimation

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

param_grid = {
    'C': [1, 5, 100, 200, 1000],
    'epsilon': [0.01, 0.1, 0.05, 0.0003, 1, 0.2, 5, 10],
    'gamma': [0.001, 0.1, 1, 5, 10, 100]
}
grid_search = GridSearchCV(estimator = SVR(), param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2)
regr = make_pipeline(RobustScaler(), grid_search)


regr.fit(X_train, y_train)
print(regr.score(X_test, y_test))
print(regr[1].best_params_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
0.5153056632405655
{'C': 5, 'epsilon': 0.2, 'gamma': 0.1}
