# Treinamento 

## Imports

In [39]:
import os

import pickle
import kagglehub
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder, PolynomialFeatures,
                                   StandardScaler)

In [2]:
os.chdir("..")

from src.preprocessing import preprocess_data

## Carregamento e tratamento dos dados

In [3]:
if not os.path.exists("data/ev_charging_patterns.csv"):
    df_original = pd.read_csv(kagglehub.dataset_download("valakhorasani/electric-vehicle-charging-patterns", path='ev_charging_patterns.csv'))
    df_original.to_csv("data/ev_charging_patterns.csv", index=False)
else:
    df_original = pd.read_csv("data/ev_charging_patterns.csv")
df_original.head()

Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,User_1,BMW i3,108.463007,Station_391,Houston,2024-01-01 00:00:00,2024-01-01 00:39:00,60.712346,0.591363,36.389181,13.087717,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter
1,User_2,Hyundai Kona,100.0,Station_428,San Francisco,2024-01-01 01:00:00,2024-01-01 03:01:00,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
2,User_3,Chevy Bolt,75.0,Station_181,San Francisco,2024-01-01 02:00:00,2024-01-01 04:48:00,19.128876,2.452653,27.513593,35.66727,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter
3,User_4,Hyundai Kona,50.0,Station_327,Houston,2024-01-01 03:00:00,2024-01-01 06:42:00,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
4,User_5,Hyundai Kona,50.0,Station_108,Los Angeles,2024-01-01 04:00:00,2024-01-01 05:46:00,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler


### Pré-processamento

- Remoção das colunas: 
  - 'Charging Start Time'
  - 'Charging End Time'
  - 'User ID'
  - 'Charging Station ID'
  
Pois o user ID e o ID da estação de carregamento não são relevantes para o modelo, e o tempo de início e fim de carregamento são redundantes com a duração do carregamento. Também foram removidas as linhas com target "commutter" para simplificar o problema.

In [4]:
if not os.path.exists("data/ev_charging_patterns_preprocessed.csv"):
    df_preprocessed = preprocess_data(df_original).dropna()
    df_preprocessed.to_csv("data/ev_charging_patterns_preprocessed.csv", index=False)
else:
    df_preprocessed = pd.read_csv("data/ev_charging_patterns_preprocessed.csv")

df_preprocessed.head()

Unnamed: 0,Vehicle Model,Battery Capacity (kWh),Charging Station Location,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,Hyundai Kona,100.0,San Francisco,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
1,Hyundai Kona,50.0,Houston,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
2,Hyundai Kona,50.0,Los Angeles,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler
3,Nissan Leaf,50.0,San Francisco,43.181137,1.16764,14.334523,36.900341,Evening,Saturday,75.217748,71.982288,143.680046,-5.274218,0.0,DC Fast Charger,Long-Distance Traveler
4,Chevy Bolt,75.0,Los Angeles,51.467617,2.655396,26.702908,9.796821,Afternoon,Monday,56.201703,63.786815,116.543166,-4.41746,0.0,Level 2,Long-Distance Traveler


### Separação entre categorical features, numerical features e target

In [5]:
feature_column = "User Type"

categorical_features = [
    "Vehicle Model",
    "Charging Station Location",
    "Time of Day",
    "Day of Week",
    "Charger Type",
]

numerical_features = df_preprocessed.columns.difference(categorical_features + [feature_column])

### Separação entre features e target

In [6]:
X = df_preprocessed.drop(columns=['User Type'])
y = df_preprocessed['User Type']

### Separação entre treino e teste

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Pipeline de pré-processamento

In [8]:
num_preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocessor, numerical_features),
        ('cat', OneHotEncoder(drop="first"), categorical_features)
    ])

preprocessor

## Treinamento

O treinamento consiste em primeiro realizar o GridSearchCV para encontrar os melhores hiperparâmetros para o modelo escolhido, que foi o RandomForestClassifier. Entretanto a acurácia ficou em 0.48, abaixo do dummy classifier com a estratégia de "most_frequent", de 0.5. 

Para aprimorar a acurácia do modelo, foi utilizado o SequentialFeatureSelector, utilizando o modelo RandomForestClassifier e os hiperparâmetros encontrados anteriormente.

Com essa técnica, foi possível aumentar a acurácia do modelo para 0.79.

A acurácia foi utilizada como métrica de avaliação e certificação do modelo, pois como o dataset é balanceado, a acurácia apresenta uma boa representação do desempenho do modelo.

### Dummy Classifier

In [10]:
dummy = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy="most_frequent"))
])

In [11]:
dummy.fit(X_train, y_train)

In [12]:
y_pred = dummy.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [13]:
pd.Series(y_train).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.53012
Casual Driver             0.46988
Name: proportion, dtype: float64

In [14]:
pd.Series(y_test).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.5
Casual Driver             0.5
Name: proportion, dtype: float64

### GridSearchCV com RandomForestClassifier

In [31]:
pipeline_gs = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {
    'classifier__n_estimators': range(10, 100, 10),
    'classifier__max_depth': range(1, 5),
    'classifier__min_samples_split': range(2, 5),
    'classifier__min_samples_leaf': range(1, 5),
    'classifier__max_features': ['sqrt', 'log2', None],
}

grid = GridSearchCV(pipeline_gs, param_grid=param_grid, cv=5, n_jobs=-1)

In [32]:
grid.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [33]:
print(grid.best_params_)

y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

{'classifier__max_depth': 4, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 80}
Acurácia: 0.48


### SequentialFeatureSelector com RandomForestClassifier

In [34]:
model = RandomForestClassifier(max_depth=4, max_features='sqrt',
                                        min_samples_leaf=1, n_estimators=80, random_state=42, min_samples_split=3)

sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline_final = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', sfs),
    ('classifier', model)
])

In [35]:
pipeline_final.fit(X, y)

## Certificação

In [44]:
y_pred = pipeline_final.predict(X_test)

In [45]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

Acurácia: 0.79


## Exportando o modelo

In [41]:
with open('models/model.pkl', 'wb') as file:
    pickle.dump(pipeline_final, file)