In [3]:
import pandas as pd

# Read the load-profile table; it carries a 'cluster' label and a 'date' column.
load_profiles_df_w_cluster = pd.read_csv('./loads_profiles_w_cluster.csv')

# Chronological hold-out: rows dated before the cutoff go to training,
# rows dated on or after it go to testing.
# NOTE(review): read_csv is called without parse_dates, so 'date' is compared
# as text; that ordering is only chronological for ISO-8601 (YYYY-MM-DD)
# strings -- confirm the CSV uses that format.
split_date = '2011-12-01'
profile_dates = load_profiles_df_w_cluster['date']
train_df = load_profiles_df_w_cluster.loc[profile_dates < split_date]
test_df = load_profiles_df_w_cluster.loc[profile_dates >= split_date]

# Regression
Because the clusters were very well separated by two variables, 'Max Power (W)' and 'Different in Peak and Minimum Power (W)' (the column name as spelled in the data),
we can use these two variables as predictors in the regression.

There are more than two classes, so we have to use multinomial logistic regression. See https://machinelearningmastery.com/multinomial-logistic-regression-with-python/

In [4]:
from sklearn.linear_model import LogisticRegression

# Label column and the two feature columns fed to the classifier.
# (To fit on a single predictor instead, use ['Max Power (W)'].)
response_variable = 'cluster'
regression_predictors = ['Max Power (W)', 'Different in Peak and Minimum Power (W)']

# Multinomial logistic regression: the L-BFGS solver minimises the
# cross-entropy loss over all classes jointly.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
multinom_model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

regression_train_features = train_df[regression_predictors]
regression_train_labels = train_df[response_variable]
multinom_model.fit(regression_train_features, regression_train_labels)

In [5]:
# Confusion matrix on the training split: actual cluster in rows,
# cluster predicted by the logistic regression in columns.
train_cluster_predictions = multinom_model.predict(train_df[regression_predictors])
train_df_w_classifier = train_df.assign(predicted_cluster=train_cluster_predictions)
pd.crosstab(
    index=train_df_w_classifier['cluster'],
    columns=train_df_w_classifier['predicted_cluster'],
)

predicted_cluster,0,1,2
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90,0,0
1,0,78,0
2,0,0,166


In [6]:
# Mean classification accuracy of the fitted model on the training split.
multinom_model.score(
    X=train_df[regression_predictors],
    y=train_df[response_variable],
)

1.0

In [8]:
# Fitted coefficients: one row per cluster class, one column per entry of
# regression_predictors (in that order).
multinom_model.coef_

array([[-0.76974055, -0.24158634],
       [ 0.88307695,  0.17790363],
       [-0.1133364 ,  0.06368271]])

In [9]:
# Fitted intercept (bias) term, one value per cluster class.
multinom_model.intercept_

array([ 1048.71643414, -1257.17779561,   208.46136147])

In [7]:
# Confusion matrix on the test split: actual cluster in rows,
# cluster predicted by the logistic regression in columns.
test_cluster_predictions = multinom_model.predict(test_df[regression_predictors])
test_df_w_classifier = test_df.assign(predicted_cluster=test_cluster_predictions)
pd.crosstab(
    index=test_df_w_classifier['cluster'],
    columns=test_df_w_classifier['predicted_cluster'],
)

predicted_cluster,0,1,2
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,27,0,0
1,0,1,0
2,0,0,3


# Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
# Every column except the label and the date is used as a feature.
# NOTE(review): this picks up all remaining columns in the CSV -- confirm
# each one is a legitimate predictor (e.g. no saved index column).
non_feature_columns = {response_variable, 'date'}
predictors = [name for name in train_df.columns if name not in non_feature_columns]

# Random forest of 100 trees; the fixed seed makes the fit repeatable.
random_forest_classifier = RandomForestClassifier(random_state=223, n_estimators=100)
random_forest_classifier.fit(train_df[predictors], train_df[response_variable])

In [40]:
# Evaluate the random forest on both splits.
#
# A notebook cell only renders its final expression, so the training
# confusion matrix is shown explicitly with display(); as a bare expression
# in the middle of the cell it would be computed and silently discarded.
train_pred = random_forest_classifier.predict(train_df[predictors])
rf_train_df = train_df.copy()
rf_train_df['predicted_cluster'] = train_pred
display(pd.crosstab(rf_train_df['cluster'], rf_train_df['predicted_cluster']))

# Test-split confusion matrix (actual cluster in rows, predicted in columns);
# kept as the last expression so it remains the cell's output value.
test_pred = random_forest_classifier.predict(test_df[predictors])
rf_test_df = test_df.copy()
rf_test_df['predicted_cluster'] = test_pred
pd.crosstab(rf_test_df['cluster'], rf_test_df['predicted_cluster'])


predicted_cluster,0,1,2
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,27,0,0
1,0,1,0
2,0,0,3
