In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

In [2]:
data = pd.read_csv('file_02.csv')
data

Unnamed: 0,index,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU)
0,0,2017-09-01,Northern,624.23,484.21,30.36,35.57,273.27,320.81
1,1,2017-09-01,Western,1106.89,1024.33,25.17,3.81,72.00,21.53
2,2,2017-09-01,Southern,576.66,578.55,62.73,49.80,111.57,64.78
3,3,2017-09-01,Eastern,441.02,429.39,,,85.94,69.36
4,4,2017-09-01,NorthEastern,29.11,15.91,,,24.64,21.21
...,...,...,...,...,...,...,...,...,...
4940,305,2020-08-01,Northern,669.47,602.96,26.88,23.41,348.72,351.98
4941,306,2020-08-01,Western,1116.00,1262.10,42.37,36.63,54.67,20.28
4942,307,2020-08-01,Southern,494.66,415.53,61.83,26.28,93.49,77.25
4943,308,2020-08-01,Eastern,482.86,547.03,,,87.22,93.78


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4945 entries, 0 to 4944
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 4945 non-null   int64  
 1   Date                                  4945 non-null   object 
 2   Region                                4945 non-null   object 
 3   Thermal Generation Actual (in MU)     4945 non-null   object 
 4   Thermal Generation Estimated (in MU)  4945 non-null   object 
 5   Nuclear Generation Actual (in MU)     2967 non-null   float64
 6   Nuclear Generation Estimated (in MU)  2967 non-null   float64
 7   Hydro Generation Actual (in MU)       4945 non-null   float64
 8   Hydro Generation Estimated (in MU)    4945 non-null   float64
dtypes: float64(4), int64(1), object(4)
memory usage: 347.8+ KB


In [4]:
data.shape

(4945, 9)

In [5]:
data.describe()

Unnamed: 0,index,Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU)
count,4945.0,2967.0,2967.0,4945.0,4945.0
mean,2181.433771,37.242208,36.987877,73.305921,76.842965
std,1397.754755,15.883968,11.491292,74.482145,82.043952
min,0.0,0.0,0.0,0.0,0.0
25%,926.0,26.14,30.19,26.91,23.31
50%,2162.0,30.72,34.84,52.96,50.27
75%,3398.0,46.83,43.075,85.94,95.8
max,4634.0,68.74,76.64,348.72,397.38


In [6]:
data = data.drop('index', axis=1)

In [7]:
data.isna().mean()

Date                                    0.0
Region                                  0.0
Thermal Generation Actual (in MU)       0.0
Thermal Generation Estimated (in MU)    0.0
Nuclear Generation Actual (in MU)       0.4
Nuclear Generation Estimated (in MU)    0.4
Hydro Generation Actual (in MU)         0.0
Hydro Generation Estimated (in MU)      0.0
dtype: float64

In [8]:
for column in ['Nuclear Generation Actual (in MU)', 'Nuclear Generation Estimated (in MU)']:
    data[column] = data[column].fillna(data[column].mean())

In [9]:
print("Total missing values:", data.isna().sum().sum())

Total missing values: 0


In [10]:
data

Unnamed: 0,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU)
0,2017-09-01,Northern,624.23,484.21,30.360000,35.570000,273.27,320.81
1,2017-09-01,Western,1106.89,1024.33,25.170000,3.810000,72.00,21.53
2,2017-09-01,Southern,576.66,578.55,62.730000,49.800000,111.57,64.78
3,2017-09-01,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36
4,2017-09-01,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21
...,...,...,...,...,...,...,...,...
4940,2020-08-01,Northern,669.47,602.96,26.880000,23.410000,348.72,351.98
4941,2020-08-01,Western,1116.00,1262.10,42.370000,36.630000,54.67,20.28
4942,2020-08-01,Southern,494.66,415.53,61.830000,26.280000,93.49,77.25
4943,2020-08-01,Eastern,482.86,547.03,37.242208,36.987877,87.22,93.78


In [11]:
data['Year'] = data['Date'].apply(lambda x: np.int(x[0:4]))
data['Month'] = data['Date'].apply(lambda x: np.int(x[5:7]))

data = data.drop('Date', axis=1)

In [12]:
for column in ['Thermal Generation Actual (in MU)', 'Thermal Generation Estimated (in MU)']:
    data[column] = data[column].apply(lambda x: np.float(x.replace(',', '')))

Encoding Labels

In [13]:
label_encoder = LabelEncoder()

data['Region'] = label_encoder.fit_transform(data['Region'])

In [14]:
data.Region

0       2
1       4
2       3
3       0
4       1
       ..
4940    2
4941    4
4942    3
4943    0
4944    1
Name: Region, Length: 4945, dtype: int64

In [15]:
data.head(3)

Unnamed: 0,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Year,Month
0,2,624.23,484.21,30.36,35.57,273.27,320.81,2017,9
1,4,1106.89,1024.33,25.17,3.81,72.0,21.53,2017,9
2,3,576.66,578.55,62.73,49.8,111.57,64.78,2017,9


In [16]:
data.dtypes

Region                                    int64
Thermal Generation Actual (in MU)       float64
Thermal Generation Estimated (in MU)    float64
Nuclear Generation Actual (in MU)       float64
Nuclear Generation Estimated (in MU)    float64
Hydro Generation Actual (in MU)         float64
Hydro Generation Estimated (in MU)      float64
Year                                      int64
Month                                     int64
dtype: object

In [17]:
y = data['Region'].copy()
X = data.drop(['Region','Year','Month'], axis=1).copy()

In [18]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [20]:
model = DecisionTreeClassifier()

In [21]:
result = []
model.fit(X_train, y_train)
result.append(model.score(X_test, y_test))

In [22]:
result

[0.9993261455525606]

In [23]:
# y = data['Region'].copy()
# X = data.drop('Region', axis=1).copy()

In [24]:
# scaler = StandardScaler()

# X = scaler.fit_transform(X)

In [25]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [26]:
# models = [
#     LogisticRegression(),
#     SVC(),
#     MLPClassifier(),
#     DecisionTreeClassifier(),
#     AdaBoostClassifier(),
#     BaggingClassifier(),
#     GradientBoostingClassifier(),
#     RandomForestClassifier()
# ]

# model_names = [
#     "         Logistic Regression",
#     "      Support Vector Machine",
#     "              Neural Network",
#     "               Decision Tree",
#     "         AdaBoost Classifier",
#     "          Bagging Classifier",
#     "Gradient Boosting Classifier",
#     "    Random Forest Classifier"
# ]

In [27]:
# model = DecisionTreeClassifier()

In [28]:
# result = []
# model.fit(X_train, y_train)
# result.append(model.score(X_test, y_test))

In [29]:
# result

In [30]:
# results = []

# for i in range(len(models)):
#     models[i].fit(X_train, y_train)
#     results.append(models[i].score(X_test, y_test))

In [31]:
# for i in range(len(models)):
#     print(model_names[i] + ": {:.5f}".format(results[i]))

In [32]:
#new_input = [[543.5, 678.5, 44.5, 40.5, 870, 800, 2017, 8]]

In [33]:
#for i in range(len(models)):
#    new_output = models[i].predict(new_input)
#    print(new_output)

In [34]:
new_input = [[624.23, 484.21, 30.36, 35.57, 273.27, 320.81]]

In [35]:
new_output = model.predict(new_input)
print(new_output)

[4]


In [36]:
import pickle

In [37]:
pkl_file= open('decision_tree_model_final.pkl',"wb")

In [38]:
pickle.dump(model,pkl_file)

In [39]:
pkl_file.close()

Northern - 2

Western - 4

Southern - 3

Eastern - 0

Northeastern - 1