In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, f1_score, mean_squared_error, mean_absolute_error

In [2]:
# Reshape the wide flammability table into long format: every source row is
# unpivoted so that its column labels land in "year" and its cell values in
# "flammability", with the row's region and OSM_ID repeated on each line.
df = pd.read_csv('Statistika_gorimost_po_rayonam - Динамика горимости.csv')

district_frames = []
for row_pos in range(df.shape[0]):
    # After reset_index the frame has two columns: "index" (the source column
    # labels) and "flammability" (the values of this source row).
    district = df.iloc[row_pos, :].to_frame(name="flammability").reset_index()
    district = district.assign(
        region=district.iloc[1, 1],   # value held in the 2nd source column
        OSM_ID=district.iloc[0, 1],   # value held in the 1st source column
    )
    # Skip the first three source columns.
    # NOTE(review): presumably identifiers/metadata rather than yearly values -
    # confirm against the CSV header.
    district_frames.append(district.iloc[3:, :])

output1 = (
    pd.concat(district_frames)
    .reset_index()                        # "index" already exists, so the old row index becomes "level_0"
    .rename(columns={"index": "year"})
    .drop(['level_0'], axis=1)
)

In [3]:
# Reshape the wide NDVI table into long format: one output line per
# (source row, source column) pair, keyed by OSM_ID and "year".
df = pd.read_csv('Statistika_gorimost_po_rayonam - Динамика NDVI.csv')

ndvi_frames = []
for row_pos in range(df.shape[0]):
    # Two columns after reset_index: "index" (source column labels) and
    # "NDVI" (the values of this source row).
    district = df.iloc[row_pos, :].to_frame(name="NDVI").reset_index()
    # The value of the first source column is stored as the row's OSM_ID.
    district = district.assign(OSM_ID=district.iloc[0, 1])
    # Drop the line that came from the first source column itself.
    ndvi_frames.append(district.iloc[1:, :])

output2 = (
    pd.concat(ndvi_frames)
    .reset_index()                        # "index" already exists, so the old row index becomes "level_0"
    .rename(columns={"index": "year"})
    .drop(['level_0'], axis=1)
)

In [4]:
# Reshape the wide precipitation table into long format: one output line per
# (source row, source column) pair, keyed by OSM_ID and "year".
df = pd.read_csv('Statistika_gorimost_po_rayonam - Динамика Осадки.csv')

precipitation_frames = []
for row_pos in range(df.shape[0]):
    # Two columns after reset_index: "index" (source column labels) and
    # "precipitation" (the values of this source row).
    district = df.iloc[row_pos, :].to_frame(name="precipitation").reset_index()
    # The value of the first source column is stored as the row's OSM_ID.
    district = district.assign(OSM_ID=district.iloc[0, 1])
    # Drop the line that came from the first source column itself.
    precipitation_frames.append(district.iloc[1:, :])

output3 = (
    pd.concat(precipitation_frames)
    .reset_index()                        # "index" already exists, so the old row index becomes "level_0"
    .rename(columns={"index": "year"})
    .drop(['level_0'], axis=1)
)

In [5]:
# Reshape the wide livestock table into long format: one output line per
# (source row, source column) pair, keyed by OSM_ID and "year".
df = pd.read_csv('Statistika_gorimost_po_rayonam - Динамика Скот.csv')

stock_frames = []
for row_pos in range(df.shape[0]):
    # Two columns after reset_index: "index" (source column labels) and
    # "stock" (the values of this source row).
    district = df.iloc[row_pos, :].to_frame(name="stock").reset_index()
    # The value of the first source column is stored as the row's OSM_ID.
    district = district.assign(OSM_ID=district.iloc[0, 1])
    # Drop the line that came from the first source column itself.
    stock_frames.append(district.iloc[1:, :])

output4 = (
    pd.concat(stock_frames)
    .reset_index()                        # "index" already exists, so the old row index becomes "level_0"
    .rename(columns={"index": "year"})
    .drop(['level_0'], axis=1)
)

In [6]:
# Reshape the wide maximum-temperature table into long format: one output line
# per (source row, source column) pair, keyed by OSM_ID and "year".
df = pd.read_csv('Statistika_gorimost_po_rayonam - Динамика Т макс.csv')

temperature_frames = []
for row_pos in range(df.shape[0]):
    # Two columns after reset_index: "index" (source column labels) and
    # "t" (the values of this source row).
    district = df.iloc[row_pos, :].to_frame(name="t").reset_index()
    # The value of the first source column is stored as the row's OSM_ID.
    district = district.assign(OSM_ID=district.iloc[0, 1])
    # Drop the line that came from the first source column itself.
    temperature_frames.append(district.iloc[1:, :])

output5 = (
    pd.concat(temperature_frames)
    .reset_index()                        # "index" already exists, so the old row index becomes "level_0"
    .rename(columns={"index": "year"})
    .drop(['level_0'], axis=1)
)

In [7]:
# Join the five long tables on (OSM_ID, year). `merge` is used with its
# default join type, so only keys present in every table survive.
df = output1
for yearly_table in (output2, output3, output4, output5):
    df = df.merge(yearly_table, on=['OSM_ID', 'year'])

# Keep a copy of the raw merged data (before any cleaning) on disk.
df.to_csv('out.csv', index=False)

df = (
    df
    # Missing values are replaced by the sentinel -9999.
    # NOTE(review): the sentinel is later fed to the models as a real number
    # (see `stock` in the displayed frame) - confirm this is intended.
    .fillna(-9999)
    # The join keys are not used as model features.
    .drop(['OSM_ID', 'year'], axis=1)
    # Decimal commas -> decimal points so the text values can be parsed as
    # floats; the regex is applied to every string cell of the frame.
    .replace(to_replace=r',', value='.', regex=True)
    .astype({
        'flammability': 'float64',
        'NDVI': 'float64',
        'precipitation': 'float64',
        'stock': 'float64',
        't': 'float64',
    })
    # Target transform: small offset, then log(1 + x).
    .assign(flammability=lambda frame: np.log1p(frame['flammability'] + 0.0001))
)
df

Unnamed: 0,flammability,region,NDVI,precipitation,stock,t
0,3.388453,Ахтубинский район,3443.553826,305.300003,-9999.0,22.864286
1,2.829684,Ахтубинский район,3283.658314,327.975006,-9999.0,22.464286
2,0.506878,Ахтубинский район,2986.687553,318.075005,-9999.0,20.989285
3,1.258489,Ахтубинский район,3269.350491,319.725006,-9999.0,22.935715
4,2.522532,Ахтубинский район,3305.540993,312.275009,-9999.0,22.800001
...,...,...,...,...,...,...
1155,0.000100,Уральск Г.А.,5255.108136,503.277455,4.2,22.314868
1156,0.810975,Уральск Г.А.,4987.325018,350.355044,4.4,20.957724
1157,1.757875,Уральск Г.А.,4460.976181,281.203069,4.0,21.128864
1158,0.000100,Уральск Г.А.,4650.686799,325.610196,4.4,21.443149


In [8]:
# One-hot encode the district name and swap the encoded columns in for the
# original text column. `OneHotEncoder` here is the category_encoders class
# imported at the top of the notebook.
enc = OneHotEncoder()
# The original call also passed `axis = 0` to fit_transform; that is not an
# encoder parameter (it was only swallowed as an unused fit keyword), so it is
# dropped here.
region_encoded = enc.fit_transform(df[['region']])
# Index-aligned join: the encoder output keeps the row index of its input.
df = df.drop(['region'], axis=1).join(region_encoded)

In [9]:
# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['flammability'], axis=1),
    df['flammability'],
    test_size=0.20,
    random_state=123,
)

cv = 10
# Candidate regularisation strengths shared by all three linear models.
alphas = [0.001, 0.01, 0.1, 1, 10]
scalers = (RobustScaler(), StandardScaler(), MinMaxScaler())

# `n_alphas` is intentionally not passed: it only matters when `alphas` is left
# unset, must be an int (the previous list value is rejected by scikit-learn's
# parameter validation in newer releases), and has no effect on the explicit grid.
regressors = (
    RidgeCV(cv=cv, alphas=alphas),
    LassoCV(cv=cv, alphas=alphas),
    ElasticNetCV(cv=cv, alphas=alphas),
)

for scaler in scalers:
    # The scaler is fitted on the training split only and then applied to the
    # test split. This does not depend on the regressor, so it is done once per
    # scaler instead of once per (scaler, regressor) pair.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for regressor in regressors:
        # fit() re-trains the estimator from scratch, so reusing the same
        # instances across scalers is safe.
        regressor.fit(X_train_scaled, y_train)
        y_pred = regressor.predict(X_test_scaled)

        print('R2: ' + str(regressor.score(X_test_scaled, y_test)))
        # RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated
        # and removed in newer scikit-learn releases.
        print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
        print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
        print('Scaler: ' + scaler.__class__.__name__)
        print('Regressor\'s type:' + regressor.__class__.__name__)
        print('==' * 20)

R2: 0.45239455701072373
RMSE: 0.7429671645256937
MAE: 0.5644833338493997
Scaler: RobustScaler
Regressor's type:RidgeCV
R2: 0.44089716847555516
RMSE: 0.7507262267227033
MAE: 0.5724614023416614
Scaler: RobustScaler
Regressor's type:LassoCV
R2: 0.4412403067437155
RMSE: 0.7504958197031235
MAE: 0.5731939070428114
Scaler: RobustScaler
Regressor's type:ElasticNetCV
R2: 0.45296754922569726
RMSE: 0.7425783573766827
MAE: 0.5636896790420493
Scaler: StandardScaler
Regressor's type:RidgeCV
R2: 0.4519829180130457
RMSE: 0.7432463589803506
MAE: 0.5646224655720142
Scaler: StandardScaler
Regressor's type:LassoCV
R2: 0.4524683769684843
RMSE: 0.7429170849895039
MAE: 0.5641776989983591
Scaler: StandardScaler
Regressor's type:ElasticNetCV
R2: 0.4512900926760629
RMSE: 0.7437160316209309
MAE: 0.5646312187701636
Scaler: MinMaxScaler
Regressor's type:RidgeCV
R2: 0.4352187246335206
RMSE: 0.7545289147789178
MAE: 0.5752237350383417
Scaler: MinMaxScaler
Regressor's type:LassoCV
R2: 0.43490652822012166
RMSE: 0.75473

In [10]:
from catboost import CatBoostRegressor, Pool

# Repeat a CatBoost hyper-parameter grid search once per feature scaler and
# report hold-out RMSE / R2 for each.
# NOTE(review): the original note here read "transform to best" - presumably a
# reminder to keep only the best-performing scaler; confirm with the author.
scalers = (RobustScaler(), StandardScaler(), MinMaxScaler())

for scaler in scalers:
    scaler_name = scaler.__class__.__name__
    print('Scaler: ' + scaler_name)

    # Scaler statistics come from the training split only.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    train_pool = Pool(X_train_scaled, y_train)

    # The model is trained with an MAE objective; the metrics printed below
    # are RMSE and R2.
    model = CatBoostRegressor(loss_function='MAE', logging_level='Silent')
    param_grid = {
        'iterations': [250, 500, 1000],
        'learning_rate': [0.04, 0.05, 0.1, 0.2, 0.5],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3],
    }
    search_result = model.grid_search(param_grid, train_pool, verbose=False)
    print(search_result['params'])

    # predict() is called straight after grid_search, i.e. this relies on
    # grid_search leaving `model` fitted with the selected parameters
    # (presumably CatBoost's default refit behaviour - confirm for the
    # installed version).
    pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred))

    print("Testing performance")
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.4f}'.format(r2))
    print('==' * 20)

Scaler: RobustScaler
{'depth': 4, 'iterations': 1000, 'learning_rate': 0.2, 'l2_leaf_reg': 0.5}
Testing performance
RMSE: 0.79
R2: 0.3828
Scaler: StandardScaler
{'depth': 4, 'iterations': 1000, 'learning_rate': 0.2, 'l2_leaf_reg': 0.5}
Testing performance
RMSE: 0.79
R2: 0.3825
Scaler: MinMaxScaler
{'depth': 4, 'iterations': 1000, 'learning_rate': 0.2, 'l2_leaf_reg': 0.2}
Testing performance
RMSE: 0.83
R2: 0.3202


In [11]:
#Test Default CatBoost
model = CatBoostRegressor(logging_level='Silent')
model.fit(X_train, y_train)

pred = model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)

print("Testing default Catboost")
print('RMSE: {:.2f}'.format(rmse))
print('R2: {:.4f}'.format(r2))
print()
print(model.get_all_params())

Testing default Catboost
RMSE: 0.76
R2: 0.4264

{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': True, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'random_seed': 0, 'depth': 6, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'min_data_in_leaf': 1, 'loss_function': 'RMSE', 'learning_rate': 0.04046199843287468, 'score_function': 'Cosine