# <center>Train Steel Data</center>

In [17]:
import pandas as pd
import numpy  as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics  import r2_score, mean_squared_error
import warnings
warnings.filterwarnings("ignore")
import statistics
import pickle
from sklearn.model_selection import cross_val_score

In [18]:
# Load the clustered training set. Per the preview below, the last two columns
# are the regression target (Usage_kWh) and the cluster label (Cluster).
train_csv_path = 'train_cluster.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Load_Type,hour,Usage_kWh,Cluster
0,-0.607869,-0.522712,-0.712161,-0.287349,0.514084,-0.845782,0,0,6,3.53,0
1,-0.466496,-0.522712,-0.712161,-1.723965,0.514084,-1.351516,1,0,2,2.95,0
2,-0.459735,-0.522712,-0.712161,-1.666394,0.514084,-1.351516,1,0,2,3.1,0
3,-0.466496,-0.522712,-0.712161,-1.866042,0.514084,-1.170897,1,0,3,2.74,0
4,2.045033,-0.522712,2.389134,0.589937,0.514084,0.274057,1,2,13,106.7,2


In [33]:
class Model_Finder:
    """Tune a random forest and an XGBoost regressor and keep the better one.

    Each ``get_best_params_for_*`` method runs a 5-fold ``GridSearchCV`` over a
    fixed grid and returns the winning estimator fitted on the data it was
    given. ``get_best_model`` runs both searches and compares the tuned models
    by the median of their 10-fold cross-validated R^2 scores.

    Parameters
    ----------
    random_state : int or None, optional
        Seed forwarded to both base estimators. The default ``None`` leaves
        them unseeded, which matches the behaviour before this parameter
        existed.
    """

    def __init__(self, random_state=None):
        self.clf = RandomForestRegressor(random_state=random_state)
        self.xgb = XGBRegressor(random_state=random_state)

    def get_best_params_for_random_forest(self, train_x, train_y):
        """Grid-search the random forest and return the best fitted model.

        Parameters
        ----------
        train_x : feature matrix passed straight to ``GridSearchCV.fit``.
        train_y : target values passed straight to ``GridSearchCV.fit``.

        Returns
        -------
        The best estimator found by the search, also stored on ``self.clf``.
        The winning ``max_depth``, ``max_features`` and ``n_estimators`` are
        stored on ``self`` as well.
        """
        self.param_grid = {
            "n_estimators": [10, 50, 100, 130],
            "max_depth": range(2, 4, 1),
            # 'auto' was removed from scikit-learn (deprecated 1.1, removed 1.3);
            # for regressors it meant "use every feature", which is 1.0.
            "max_features": [1.0, 'log2'],
        }
        self.grid = GridSearchCV(estimator=self.clf, param_grid=self.param_grid, cv=5, verbose=0)
        self.grid.fit(train_x, train_y)
        self.max_depth = self.grid.best_params_['max_depth']
        self.max_features = self.grid.best_params_['max_features']
        self.n_estimators = self.grid.best_params_['n_estimators']
        # GridSearchCV (default refit=True) has already refit the winner on the
        # full data, so reuse it instead of building and fitting a copy.
        self.clf = self.grid.best_estimator_
        return self.clf

    def get_best_params_for_xgboost(self, train_x, train_y):
        """Grid-search the XGBoost regressor and return the best fitted model.

        Parameters
        ----------
        train_x : feature matrix passed straight to ``GridSearchCV.fit``.
        train_y : target values passed straight to ``GridSearchCV.fit``.

        Returns
        -------
        The best estimator found by the search, also stored on ``self.xgb``.
        The winning ``learning_rate``, ``max_depth`` and ``n_estimators`` are
        stored on ``self`` (overwriting values left by the forest search).
        """
        self.param_grid_xgboost = {
            'learning_rate': [0.5, 0.1, 0.01, 0.001],
            'max_depth': [3, 5, 10, 20],
            'n_estimators': [10, 50, 100, 200],
        }
        self.grid = GridSearchCV(estimator=self.xgb, param_grid=self.param_grid_xgboost, cv=5, verbose=0)
        self.grid.fit(train_x, train_y)
        self.learning_rate = self.grid.best_params_['learning_rate']
        self.max_depth = self.grid.best_params_['max_depth']
        self.n_estimators = self.grid.best_params_['n_estimators']
        # Same as for the forest: the refit winner is already available.
        self.xgb = self.grid.best_estimator_
        return self.xgb

    def _report_cv_scores(self, label, model, train_x, train_y):
        """Print min/max/median 10-fold R^2 for ``model`` and return the median."""
        print(label)
        scores = cross_val_score(model, train_x, train_y, cv=10, scoring="r2")
        print("Minimum Score: ", min(scores))
        print("Maximum Score: ", max(scores))
        median_score = statistics.median(scores)
        print("Median Score: ", median_score)
        return median_score

    def get_best_model(self, train_x, train_y):
        """Tune both regressors and return the one with the higher median R^2.

        Parameters
        ----------
        train_x : feature matrix used for tuning and for cross-validation.
        train_y : target values used for tuning and for cross-validation.

        Returns
        -------
        tuple
            ``(model, name)`` where ``name`` is ``"random_forest"`` or
            ``"xgboost"``. Ties go to the random forest.
        """
        self.forest = self.get_best_params_for_random_forest(train_x, train_y)
        self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
        median_forest = self._report_cv_scores("Random Forest: ", self.forest, train_x, train_y)
        median_boost = self._report_cv_scores("XGBoost: ", self.xgboost, train_x, train_y)
        if median_forest >= median_boost:
            return self.forest, "random_forest"
        return self.xgboost, "xgboost"

In [34]:
finder = Model_Finder()
# Train and save one model per cluster label present in the data
# (0, 1 and 2 for this dataset, matching the previous hard-coded range(3)).
for cluster in sorted(df['Cluster'].unique()):
    cluster = int(cluster)  # plain int keeps the printed label and file name unchanged
    print("Cluster No: ", cluster)
    cluster_rows = df[df['Cluster'] == cluster]
    # Features are every column except the last two (Usage_kWh target, Cluster label).
    # NOTE(review): this slice also keeps the 'Unnamed: 0' column visible in the
    # df preview -- presumably a saved row index. Confirm it is an intended
    # feature; dropping it changes the feature count the saved models expect.
    x_train = cluster_rows.iloc[:, :-2].values
    y_train = cluster_rows.iloc[:, -2].values
    model, name = finder.get_best_model(x_train, y_train)
    filename = name + "_for_cluster_" + str(cluster)
    # Context manager guarantees the file is flushed and closed after the dump.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

Cluster No:  0
Random Forest: 
Minimum Score:  0.9787215411349837
Maximum Score:  0.9862306656047835
Median Score:  0.984088697923263
XGBoost: 
Minimum Score:  0.9975817978272691
Maximum Score:  0.9995400871353354
Median Score:  0.9992037814240122
Cluster No:  1
Random Forest: 
Minimum Score:  0.972729560161112
Maximum Score:  0.9814627913818073
Median Score:  0.9793704315143191
XGBoost: 
Minimum Score:  0.9972008637520964
Maximum Score:  0.998526416698624
Median Score:  0.998022059557711
Cluster No:  2
Random Forest: 
Minimum Score:  0.9477734030797174
Maximum Score:  0.965525207270519
Median Score:  0.9624821394540015
XGBoost: 
Minimum Score:  0.997751893124255
Maximum Score:  0.9992046715404318
Median Score:  0.9986984745870088
