In [2]:
import csv
import math
import pickle
import datetime as dt
import pandas as pd
import numpy as np
import sklearn

from scipy import stats
from dateutil.parser import parse
from datetime import datetime
from IPython.display import display_html
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupKFold

import matplotlib.pyplot as plt
import seaborn as sns
import time

In [3]:
# Load the master outage dataset; the first CSV column becomes the index.
# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the leftover output under this cell looks like the tail of a
# pandas DtypeWarning (mixed-type columns) -- confirm; an explicit dtype= mapping
# for the affected columns would be the stricter fix.
load_ads = pd.read_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Repredictions/Master_Dataset/OMS_IPL_V1.csv', index_col=0, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Cast the circuit and substation identifiers to plain integers so they can be
# passed to the model as categorical columns.
for id_column in ('CIRCT_ID', 'SUBST_ID'):
    load_ads[id_column] = load_ads[id_column].astype(int)

In [5]:
# Parse the creation timestamp (values that cannot be parsed become NaT because
# of errors='coerce') and derive the calendar year from it.
creation_ts = pd.to_datetime(load_ads['CREATION_DATETIME'], errors='coerce')
load_ads['CREATION_DATETIME'] = creation_ts
load_ads['Year'] = creation_ts.dt.year

In [6]:
# Distinct years present in the data, in order of first appearance.
pd.unique(load_ads['Year'])

array([2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019])

In [7]:
# Keep only rows whose Year is above 2014 and below 2020, then renumber the index.
year_in_scope = load_ads['Year'].gt(2014) & load_ads['Year'].lt(2020)
load_ads_v1 = load_ads.loc[year_in_scope].reset_index(drop=True)
print(load_ads_v1.shape)

(240482, 162)


In [8]:
# Read the saved feature table and keep its 'Features' column as a plain Python list.
catboost = pd.read_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Repredictions/Model/IPL_TTR_Catboost_12092020.csv')
final_features = catboost['Features'].tolist()

In [9]:
# Grouped 3-fold cross-validation setup, with OUTAGE_ID as the grouping key.
# X is the whole filtered frame here; the feature columns are selected later,
# where the model is fitted.

groups = load_ads_v1['OUTAGE_ID']
y = load_ads_v1['TR']
X = load_ads_v1

group_kfold = GroupKFold(n_splits=3)
group_kfold.get_n_splits(X=X, y=y, groups=groups)

3

In [10]:
# Positions (within final_features) of the columns to treat as categorical:
# every column whose dtype is neither the default integer nor the default float.
# The builtins int/float replace np.int/np.float, which were deprecated in
# NumPy 1.20 and removed in 1.24; those names were plain aliases of the builtins,
# so the selection itself is unchanged.
feature_dtypes = X[final_features].dtypes
categorical_features_indices = np.where((feature_dtypes != int) & (feature_dtypes != float))[0]
print(categorical_features_indices)

[12 13 15 27 32 33 35 37]


In [11]:
# Adding Substation id and Circuit id as categorical feature.
# Both columns are cast to int earlier in the notebook, so the dtype-based
# detection does not pick them up.
# NOTE(review): 6 and 11 are assumed to be the positions of those two columns in
# final_features -- confirm against the feature list.
# np.union1d (rather than np.append) returns a sorted, duplicate-free array, so
# re-running this cell does not add 6 and 11 a second time.
arr1 = np.array([6, 11])
categorical_features_indices = np.union1d(arr1, categorical_features_indices)
print(categorical_features_indices)

[ 6 11 12 13 15 27 32 33 35 37]


In [12]:
# parameter grid for CatBoost Randomized Search Model
# depth stops at 16, the largest tree depth CatBoost documents as supported;
# the previous upper bound of 20 produced a candidate that cannot be trained.
# Resulting depth candidates: [3, 6, 9, 12, 16].

grid_params = {'n_estimators': [int(x) for x in np.linspace(start=1000, stop=5000, num=5)],
               'depth': [int(x) for x in np.linspace(3, 16, num=5)],
               'l2_leaf_reg': [3,4,5,6],
               'leaf_estimation_iterations': [1,2,3,4]}

In [15]:
%%time

cat_ran = CatBoostRegressor(silent=True)

cat_random = RandomizedSearchCV(estimator=cat_ran, param_distributions=grid_params, cv=group_kfold.split(X, y, groups), n_iter=50, verbose=True, n_jobs=12)
results = cat_random.fit(X[final_features], y, early_stopping_rounds=50, cat_features=categorical_features_indices)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Save the full cross-validation results table of the search to cloud storage.
results_path = 'gs://aes-analytics-0002-curated/Outage_Restoration/Repredictions/Model/IPL_CatBoost_GridSearch.csv'
df_results = pd.DataFrame(cat_random.cv_results_)
df_results.to_csv(results_path, index=False)

In [None]:
# Print the best parameter combination first, then the raw cross-validation results.
for search_summary in (cat_random.best_params_, cat_random.cv_results_):
    print(search_summary)

In [None]:
# Two-column table (Parameter, Value) of the best parameters found by the search,
# saved to cloud storage.
best_param_rows = list(cat_random.best_params_.items())
df_parameters = pd.DataFrame(best_param_rows, columns=['Parameter', 'Value'])

df_parameters.to_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Repredictions/Model/IPL_CatBoost_BestParameters.csv', index=False)