# Treinamento 

## Imports

In [1]:
import os
from tempfile import mkdtemp

import pickle
import kagglehub
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

In [2]:
os.chdir("..")

from src.preprocessing import preprocess_data

## Carregamento e tratamento dos dados

In [3]:
if not os.path.exists("data/ev_charging_patterns.csv"):
    df_original = pd.read_csv(kagglehub.dataset_download("valakhorasani/electric-vehicle-charging-patterns", path='ev_charging_patterns.csv'))
    df_original.to_csv("data/ev_charging_patterns.csv", index=False)
else:
    df_original = pd.read_csv("data/ev_charging_patterns.csv")
df_original.head()

Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,User_1,BMW i3,108.463007,Station_391,Houston,2024-01-01 00:00:00,2024-01-01 00:39:00,60.712346,0.591363,36.389181,13.087717,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter
1,User_2,Hyundai Kona,100.0,Station_428,San Francisco,2024-01-01 01:00:00,2024-01-01 03:01:00,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
2,User_3,Chevy Bolt,75.0,Station_181,San Francisco,2024-01-01 02:00:00,2024-01-01 04:48:00,19.128876,2.452653,27.513593,35.66727,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter
3,User_4,Hyundai Kona,50.0,Station_327,Houston,2024-01-01 03:00:00,2024-01-01 06:42:00,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
4,User_5,Hyundai Kona,50.0,Station_108,Los Angeles,2024-01-01 04:00:00,2024-01-01 05:46:00,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler


### Pré-processamento

- Remoção das colunas: 
  - 'Charging Start Time'
  - 'Charging End Time'
  - 'User ID'
  - 'Charging Station ID'
  
Pois o user ID e o ID da estação de carregamento não são relevantes para o modelo, e o tempo de início e fim de carregamento são redundantes com a duração do carregamento. Também foram removidas as linhas com target "commutter" para simplificar o problema.

In [4]:
if not os.path.exists("data/ev_charging_patterns_preprocessed.csv"):
    df_preprocessed = preprocess_data(df_original).dropna()
    df_preprocessed.to_csv("data/ev_charging_patterns_preprocessed.csv", index=False)
else:
    df_preprocessed = pd.read_csv("data/ev_charging_patterns_preprocessed.csv")

df_preprocessed.head()

Unnamed: 0,Vehicle Model,Battery Capacity (kWh),Charging Station Location,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,Hyundai Kona,100.0,San Francisco,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
1,Hyundai Kona,50.0,Houston,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
2,Hyundai Kona,50.0,Los Angeles,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler
3,Nissan Leaf,50.0,San Francisco,43.181137,1.16764,14.334523,36.900341,Evening,Saturday,75.217748,71.982288,143.680046,-5.274218,0.0,DC Fast Charger,Long-Distance Traveler
4,Chevy Bolt,75.0,Los Angeles,51.467617,2.655396,26.702908,9.796821,Afternoon,Monday,56.201703,63.786815,116.543166,-4.41746,0.0,Level 2,Long-Distance Traveler


### Separação entre categorical features, numerical features e target

In [5]:
feature_column = "User Type"

categorical_features = [
    "Vehicle Model",
    "Charging Station Location",
    "Time of Day",
    "Day of Week",
    "Charger Type",
]

numerical_features = df_preprocessed.columns.difference(categorical_features + [feature_column])

### Separação entre features e target

In [6]:
X = df_preprocessed.drop(columns=['User Type'])
y = df_preprocessed['User Type']

### Separação entre treino e teste

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Pipeline de pré-processamento

In [8]:
num_preprocessor = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, interaction_only=False)),
    ],
    memory=mkdtemp()
)

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocessor, numerical_features),
        ('cat', OneHotEncoder(drop="first"), categorical_features)
    ]
)

preprocessor

## Treinamento

Embora tenham sido feitos diversos treinamentos, não foi possível obter um modelo melhor que o ```DummyClassifier```, que obteve acurácia de 0.5. Como foi observado na análise exploratória, as probabilidades de cada classse em cada feature são muito similares, o que dificulta a classificação e mesmo aplicando diversas técnicas de pré-processamento e treinamento, não foi possível obter um modelo melhor que o ```DummyClassifier```. Abaixo estão alguns dos modelos treinados.

### Dummy Classifier

In [10]:
dummy = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DummyClassifier(strategy="most_frequent", random_state=42))
    ],
    memory=mkdtemp()
)

In [11]:
dummy.fit(X_train, y_train)

In [14]:
y_pred = dummy.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [12]:
pd.Series(y_train).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.53012
Casual Driver             0.46988
Name: proportion, dtype: float64

In [13]:
pd.Series(y_test).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.5
Casual Driver             0.5
Name: proportion, dtype: float64

### Procurando melhores hiperparamentros do Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline_gs = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(random_state=42))
    ],
    memory=mkdtemp()
)

param_grid = {
    'classifier__loss': ['log_loss', 'exponential'],
    'classifier__learning_rate': [0.1, 0.01, 0.001],
    'classifier__n_estimators': range(10, 100, 10),
    'classifier__subsample': [0.5, 0.75, 1],
    'classifier__max_depth': range(1, 5),
    'classifier__criterion': ['friedman_mse', 'squared_error'],
    'classifier__min_samples_split': range(2, 5),
    'classifier__min_samples_leaf':  range(1, 5),
    'classifier__max_features': ['sqrt', 'log2', None],
}

grid = GridSearchCV(pipeline_gs, param_grid=param_grid, cv=5, n_jobs=-1)

In [18]:
grid.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [22]:
print(grid.best_params_)

model = grid.best_estimator_

y_pred   = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Acurácia: {accuracy:.2f}')

print(classification_report(y_test, y_pred))

{'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.1, 'classifier__loss': 'log_loss', 'classifier__max_depth': 4, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50, 'classifier__subsample': 0.5}
Acurácia: 0.48
                        precision    recall  f1-score   support

         Casual Driver       0.47      0.33      0.39        73
Long-Distance Traveler       0.48      0.63      0.55        73

              accuracy                           0.48       146
             macro avg       0.48      0.48      0.47       146
          weighted avg       0.48      0.48      0.47       146



### GaussianNB

In [32]:
from sklearn.naive_bayes import GaussianNB

pipeline_nb = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianNB())
    ],
    memory=mkdtemp()
)

param_grid_nb = {
    'classifier__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],  # Ajuste fino de suavização
}

grid_nb = GridSearchCV(
    pipeline_nb,
    param_grid=param_grid_nb,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

grid_nb.fit(X_train, y_train)

In [33]:
print(grid_nb.best_params_)

nb_model = grid_nb.best_estimator_

nb_pred   = nb_model.predict(X_test)

print(f'Acurácia: {accuracy_score(y_test, nb_pred):.2f}')

print(classification_report(y_test, nb_pred))

{'classifier__var_smoothing': 1e-09}
Acurácia: 0.45
                        precision    recall  f1-score   support

         Casual Driver       0.42      0.29      0.34        73
Long-Distance Traveler       0.46      0.60      0.52        73

              accuracy                           0.45       146
             macro avg       0.44      0.45      0.43       146
          weighted avg       0.44      0.45      0.43       146



### KNeighborsClassifier

In [28]:
from sklearn.neighbors import KNeighborsClassifier

pipeline_knn = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier())
    ],
    memory=mkdtemp()
)

param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7, 10],  # Diferentes valores de K
    'classifier__weights': ['uniform', 'distance'],  # Pesos dos vizinhos
    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],  # Métricas de distância
}

grid_knn = GridSearchCV(
    pipeline_knn,
    param_grid=param_grid_knn,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

grid_knn.fit(X_train, y_train)

 0.52152962 0.53187445        nan 0.51121426        nan 0.54216033
        nan 0.53361332        nan 0.55075155 0.51977601 0.51977601
 0.50947539 0.50947539 0.52673151 0.52673151 0.52152962 0.53187445]


In [34]:
print(grid_knn.best_params_)

knn_model = grid_nb.best_estimator_

knn_pred   = nb_model.predict(X_test)

print(f'Acurácia: {accuracy_score(y_test, knn_pred):.2f}')

print(classification_report(y_test, knn_pred))

{'classifier__metric': 'manhattan', 'classifier__n_neighbors': 10, 'classifier__weights': 'distance'}
Acurácia: 0.45
                        precision    recall  f1-score   support

         Casual Driver       0.42      0.29      0.34        73
Long-Distance Traveler       0.46      0.60      0.52        73

              accuracy                           0.45       146
             macro avg       0.44      0.45      0.43       146
          weighted avg       0.44      0.45      0.43       146



### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

pipeline_sgd = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SGDClassifier(random_state=42))
    ],
    memory=mkdtemp()
)

param_grid_sgd = {
    'classifier__loss': ['hinge', 'log_loss', 'modified_huber'],
    'classifier__penalty': ['l2', 'l1', 'elasticnet'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['optimal', 'invscaling'],
    'classifier__max_iter': [1000, 2000],
    'classifier__eta0': [0.01, 0.1, 1.0]
}

grid_sgd = GridSearchCV(
    pipeline_sgd,
    param_grid=param_grid_sgd,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

grid_sgd.fit(X_train, y_train)

In [36]:
print(grid_sgd.best_params_)

sgd_model = grid_nb.best_estimator_

sgd_pred   = nb_model.predict(X_test)

print(f'Acurácia: {accuracy_score(y_test, sgd_pred):.2f}')

print(classification_report(y_test, sgd_pred))

{'classifier__alpha': 0.01, 'classifier__eta0': 0.01, 'classifier__learning_rate': 'invscaling', 'classifier__loss': 'log_loss', 'classifier__max_iter': 1000, 'classifier__penalty': 'l1'}
Acurácia: 0.45
                        precision    recall  f1-score   support

         Casual Driver       0.42      0.29      0.34        73
Long-Distance Traveler       0.46      0.60      0.52        73

              accuracy                           0.45       146
             macro avg       0.44      0.45      0.43       146
          weighted avg       0.44      0.45      0.43       146



### SequentialFeatureSelector com RandomForestClassifier

In [23]:
model = GradientBoostingClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=3,
                           n_estimators=50, random_state=42, subsample=0.5)

sfs   = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline_final = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selector', sfs),
        ('classifier', model)
    ],
    memory=mkdtemp()
)

In [24]:
pipeline_final.fit(X_train, y_train)

In [None]:
# with open('models/model.pkl', 'rb') as file:
#     model = pickle.load(file)

y_pred = pipeline_final.predict(X_test)

print(f'Acurácia: {accuracy_score(y_test, y_pred):.2f}')

print(classification_report(y_test, y_pred))

Acurácia: 0.49
                        precision    recall  f1-score   support

         Casual Driver       0.49      0.44      0.46        73
Long-Distance Traveler       0.49      0.55      0.52        73

              accuracy                           0.49       146
             macro avg       0.49      0.49      0.49       146
          weighted avg       0.49      0.49      0.49       146



## Exportando o modelo

Devido a baixa acurácia obtida, o modelo exportado foi o ```DummyClassifier```.

In [11]:
dummy.fit(X, y)

In [12]:
with open('models/model.pkl', 'wb') as file:
    pickle.dump(dummy, file)