# 1. Water Consumer Data test models

## 1.1. Initialization

In [1]:
from IPython.display import display
import pandas as pd

df_train = pd.read_csv('src/train.csv')
df_train

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,MOGV36480546611521,Installation_zone 1
1,2013,1,industrial,5,BECS02817768252637,Installation_zone 2
2,2013,1,domestic,6,VRFW65577141436242,Installation_zone 2
3,2013,1,domestic,1,QLLI18662653137621,Installation_zone 2
4,2013,1,domestic,13,HYUO61823402850645,Installation_zone 2
...,...,...,...,...,...,...
329970,2020,12,domestic,0,ZSOE03393580274296,Installation_zone 2
329971,2020,12,rural domestic,0,RVZG94535208799772,Installation_zone 47
329972,2020,12,rural domestic,0,DWGA69725679861554,Installation_zone 7
329973,2020,12,rural domestic,0,VJDC45865594518045,Installation_zone 32


## 2. Training Tests

### 2.0. Pré processing

In [2]:
from numpy import random

SEED = 42
random.seed(SEED)

In [3]:
from datetime import datetime

df = df_train.copy()
map_names = {
    'Year': 'year',
    'Month': 'month',
    'Consumer_type': 'consumer_type',
    'Consumption': 'consumption',
    'Consumer_number': 'consumer_number',
    'Installation_zone': 'installation_zone',
}
df = df.rename(columns=map_names)
df = df.drop(columns=['consumer_number'])
df.installation_zone = df.installation_zone.str.replace('Installation_zone ', '').astype(int)
# df['age'] = datetime.now().year - df.year
# df = df.drop(columns=['year'])

Y = df.consumer_type
X = df.drop(columns=['consumer_type'])

X

Unnamed: 0,year,month,consumption,installation_zone
0,2013,1,0,1
1,2013,1,5,2
2,2013,1,6,2
3,2013,1,1,2
4,2013,1,13,2
...,...,...,...,...
329970,2020,12,0,2
329971,2020,12,0,47
329972,2020,12,0,7
329973,2020,12,0,32


In [4]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, stratify=Y)

print('Train size: ', len(x_train))
print('Test size: ', len(x_test))


Train size:  247481
Test size:  82494


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

x_train

array([[-1.16792715,  1.56368704, -0.34390487, -0.3672939 ],
       [-0.32661792,  1.56368704, -0.08964868, -0.55098   ],
       [ 0.93534593, -1.56594607,  0.16460752, -0.55098   ],
       ...,
       [ 1.35600055,  1.56368704, -0.34390487, -0.45913695],
       [ 0.51469131, -1.56594607, -0.14049992, -0.3672939 ],
       [ 0.0940367 , -0.99692186, -0.19135116, -0.3672939 ]])

### 2.1 Dummy Classifier

In [6]:
# dummy classifier
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')

dummy.fit(x_train, y_train)
y_pred = dummy.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dummy_accuracy = accuracy_score(y_test, y_pred)*100
dummy_precision = precision_score(y_test, y_pred, average='macro')*100
dummy_recall = recall_score(y_test, y_pred, average='macro')*100
dummy_f1 = f1_score(y_test, y_pred, average='macro')*100

print(f'Accuracy: {dummy_accuracy:.2f}%')
print(f'Precision: {dummy_precision:.2f}%')
print(f'Recall: {dummy_recall:.2f}%')
print(f'F1: {dummy_f1:.2f}%')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 71.57%
Precision: 10.22%
Recall: 14.29%
F1: 11.92%


### 2.2. Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create and train the logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(x_train,y_train)

# Make predictions on the test set
y_pred = model.predict(x_test)

# Evaluate the model
logic_regression_accuracy = accuracy_score(y_test, y_pred)*100
logic_regression_precision = precision_score(y_test, y_pred, average='macro')*100
logic_regression_recall = recall_score(y_test, y_pred, average='macro')*100
logic_regression_f1 = f1_score(y_test, y_pred, average='macro')*100

# confusion = confusion_matrix(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {logic_regression_accuracy:.2f}%')
print(f'Precision: {logic_regression_precision:.2f}%')
print(f'Recall: {logic_regression_recall:.2f}%')
print(f'F1: {logic_regression_f1:.2f}%')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 88.66%
Precision: 36.06%
Recall: 27.10%
F1: 26.56%


### 2.3. SVM

In [8]:
# from sklearn.svm import SVC

# model = SVC(kernel='linear')
# model.fit(x_train, y_train)

# y_pred = model.predict(x_test)

# svm_accuracy = accuracy_score(y_test, y_pred)*100
# svm_precision = precision_score(y_test, y_pred, average='macro')*100
# svm_recall = recall_score(y_test, y_pred, average='macro')*100
# svm_f1 = f1_score(y_test, y_pred, average='macro')*100

# print(f'Accuracy: {svm_accuracy:.2f}%')
# print(f'Precision: {svm_precision:.2f}%')
# print(f'Recall: {svm_recall:.2f}%')
# print(f'F1: {svm_f1:.2f}%')


### 2.4. Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = DecisionTreeClassifier(max_depth=3)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

decision_tree_accuracy = accuracy_score(y_test, y_pred)*100
decision_tree_precision = precision_score(y_test, y_pred, average='macro')*100
decision_tree_recall = recall_score(y_test, y_pred, average='macro')*100
decision_tree_f1 = f1_score(y_test, y_pred, average='macro')*100

print(f'Accuracy: {decision_tree_accuracy:.2f}%')
print(f'Precision: {decision_tree_precision:.2f}%')
print(f'Recall: {decision_tree_recall:.2f}%')
print(f'F1: {decision_tree_f1:.2f}%')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 90.94%
Precision: 40.00%
Recall: 30.88%
F1: 31.47%


### 2.5. Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = RandomForestClassifier(max_depth=3)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

random_forest_accuracy = accuracy_score(y_test, y_pred)*100
random_forest_precision = precision_score(y_test, y_pred, average='macro')*100
random_forest_recall = recall_score(y_test, y_pred, average='macro')*100
random_forest_f1 = f1_score(y_test, y_pred, average='macro')*100

print(f'Accuracy: {random_forest_accuracy:.2f}%')
print(f'Precision: {random_forest_precision:.2f}%')
print(f'Recall: {random_forest_recall:.2f}%')
print(f'F1: {random_forest_f1:.2f}%')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 90.67%
Precision: 26.35%
Recall: 28.56%
F1: 27.41%


### 2.6. KNeighbors Classifier

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

knn_accuracy = accuracy_score(y_test, y_pred)*100
knn_precision = precision_score(y_test, y_pred, average='macro')*100
knn_recall = recall_score(y_test, y_pred, average='macro')*100
knn_f1 = f1_score(y_test, y_pred, average='macro')*100

print(f'Accuracy: {knn_accuracy:.2f}%')
print(f'Precision: {knn_precision:.2f}%')
print(f'Recall: {knn_recall:.2f}%')
print(f'F1: {knn_f1:.2f}%')


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 90.29%
Precision: 41.26%
Recall: 32.09%
F1: 33.50%


### 2.7. Naive Bayes

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = GaussianNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

naive_bayes_accuracy = accuracy_score(y_test, y_pred)*100
naive_bayes_precision = precision_score(y_test, y_pred, average='macro')*100
naive_bayes_recall = recall_score(y_test, y_pred, average='macro')*100
naive_bayes_f1 = f1_score(y_test, y_pred, average='macro')*100

print(f'Accuracy: {naive_bayes_accuracy:.2f}%')
print(f'Precision: {naive_bayes_precision:.2f}%')
print(f'Recall: {naive_bayes_recall:.2f}%')
print(f'F1: {naive_bayes_f1:.2f}%')

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 89.43%
Precision: 32.88%
Recall: 28.33%
F1: 28.20%


## 3. Comparing

In [13]:
data = {
    'method': ['dummy', 'logistic regression', 'decision tree', 'random forest', 'knn', 'naive bayes'],
    'accuracy': [dummy_accuracy, logic_regression_accuracy, decision_tree_accuracy, random_forest_accuracy, knn_accuracy, naive_bayes_accuracy],
    'precision': [dummy_precision, logic_regression_precision, decision_tree_precision, random_forest_precision, knn_precision, naive_bayes_precision],
    'recall': [dummy_recall, logic_regression_recall, decision_tree_recall, random_forest_recall, knn_recall, naive_bayes_recall],
    'f1': [dummy_f1, logic_regression_f1, decision_tree_f1, random_forest_f1, knn_f1, naive_bayes_f1]
}

df_results = pd.DataFrame(data)
df_results

Unnamed: 0,method,accuracy,precision,recall,f1
0,dummy,71.571266,10.224467,14.285714,11.918623
1,logistic regression,88.660994,36.062478,27.096171,26.557315
2,decision tree,90.935098,39.997301,30.87669,31.465136
3,random forest,90.6672,26.354072,28.559529,27.406829
4,knn,90.287779,41.257925,32.087628,33.496105
5,naive bayes,89.42711,32.881048,28.330407,28.199337
