In [6]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import kagglehub

In [7]:
os.chdir("..")

from src.preprocessing import preprocess_data

In [8]:
df = pd.read_csv(kagglehub.dataset_download("valakhorasani/electric-vehicle-charging-patterns", path='ev_charging_patterns.csv'))
df.head()




Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
0,User_1,BMW i3,108.463007,Station_391,Houston,2024-01-01 00:00:00,2024-01-01 00:39:00,60.712346,0.591363,36.389181,13.087717,Evening,Tuesday,29.371576,86.119962,293.602111,27.947953,2.0,DC Fast Charger,Commuter
1,User_2,Hyundai Kona,100.0,Station_428,San Francisco,2024-01-01 01:00:00,2024-01-01 03:01:00,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
2,User_3,Chevy Bolt,75.0,Station_181,San Francisco,2024-01-01 02:00:00,2024-01-01 04:48:00,19.128876,2.452653,27.513593,35.66727,Morning,Thursday,6.854604,69.917615,71.799253,21.002002,2.0,Level 2,Commuter
3,User_4,Hyundai Kona,50.0,Station_327,Houston,2024-01-01 03:00:00,2024-01-01 06:42:00,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
4,User_5,Hyundai Kona,50.0,Station_108,Los Angeles,2024-01-01 04:00:00,2024-01-01 05:46:00,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler


In [9]:
df_preprocessed = preprocess_data(df).dropna()
df_preprocessed.head()

Unnamed: 0,Vehicle Model,Battery Capacity (kWh),Charging Station Location,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
1,Hyundai Kona,100.0,San Francisco,12.339275,3.133652,30.677735,21.128448,Morning,Monday,10.115778,84.664344,112.112804,14.311026,3.0,Level 1,Casual Driver
3,Hyundai Kona,50.0,Houston,79.457824,1.266431,32.88287,13.036239,Evening,Saturday,83.120003,99.624328,199.577785,38.316313,1.0,Level 1,Long-Distance Traveler
4,Hyundai Kona,50.0,Los Angeles,19.629104,2.019765,10.215712,10.161471,Morning,Saturday,54.25895,63.743786,203.661847,-7.834199,1.0,Level 1,Long-Distance Traveler
5,Nissan Leaf,50.0,San Francisco,43.181137,1.16764,14.334523,36.900341,Evening,Saturday,75.217748,71.982288,143.680046,-5.274218,0.0,DC Fast Charger,Long-Distance Traveler
7,Chevy Bolt,75.0,Los Angeles,51.467617,2.655396,26.702908,9.796821,Afternoon,Monday,56.201703,63.786815,116.543166,-4.41746,0.0,Level 2,Long-Distance Traveler


In [10]:
feature_column = "User Type"

categorical_features = [
    "Vehicle Model",
    "Charging Station Location",
    "Time of Day",
    "Day of Week",
    "Charger Type",
]

numerical_features = df_preprocessed.columns.difference(categorical_features + [feature_column])

In [11]:
X = df_preprocessed.drop(columns=['User Type'])
y = df_preprocessed['User Type']


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
num_preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocessor, numerical_features),
        ('cat', OneHotEncoder(drop="first"), categorical_features)
    ])

preprocessor

In [15]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [16]:
pipeline.fit(X_train, y_train)

In [17]:
pd.Series(y_train).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.53012
Casual Driver             0.46988
Name: proportion, dtype: float64

In [18]:
pd.Series(y_test).value_counts(normalize=True)

User Type
Long-Distance Traveler    0.5
Casual Driver             0.5
Name: proportion, dtype: float64

In [19]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

Acurácia: 0.49


In [20]:

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

param_grid = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': range(10, 100, 10),
    'classifier__max_depth': range(1, 5),
    'classifier__min_samples_split': range(2, 5),
    'classifier__min_samples_leaf': range(1, 5),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

{'classifier': RandomForestClassifier(), 'classifier__max_depth': 3, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 60}
0.5955201886236369
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures())]),
                                                  Index(['Battery Capacity (kWh)', 'Charging Cost (USD)',
       'Charging Duration (hours)', 'Charging Rate (kW)',
       'Distance Driven (since last charge) (km)', 'Energy Consumed (kWh)',
       'State of Charge (End %)', 'State of Charge (Start %)',
       'Temperature (°C)', 'Vehicle Age (years)'],

2160 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\pipeline.py", line 473, in f

In [21]:
param_grid = {
    'classifier': [GradientBoostingClassifier()],
    'classifier__n_estimators': range(10, 100, 10),
    'classifier__max_depth': range(1, 5),
    'classifier__min_samples_split': range(2, 5),
    'classifier__min_samples_leaf': range(1, 5),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')


{'classifier': GradientBoostingClassifier(), 'classifier__max_depth': 4, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 10}
0.5868552903035662
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures())]),
                                                  Index(['Battery Capacity (kWh)', 'Charging Cost (USD)',
       'Charging Duration (hours)', 'Charging Rate (kW)',
       'Distance Driven (since last charge) (km)', 'Energy Consumed (kWh)',
       'State of Charge (End %)', 'State of Charge (Start %)',
       'Temperature (°C)', 'Vehicle Age (years

2160 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1043 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lucas\Desktop\Projetos\Insper\MachineLearning\projeto\.venv\Lib\site-packages\sklearn\pipeline.py", line 473, in f

In [22]:
from sklearn.feature_selection import SequentialFeatureSelector

model = RandomForestClassifier(max_depth=4, max_features='log2',
                                        min_samples_leaf=3, n_estimators=40)

sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', sfs),
    ('classifier', model)
])

pipeline.fit(X, y)

selected_features = pipeline.named_steps['feature_selector'].get_support()
print("Features selecionadas:", selected_features)

Features selecionadas: [ True  True False False False  True False  True False False False  True
 False False False  True  True  True False  True False  True False  True
 False  True  True False False  True  True False False  True False False
  True  True False False False  True  True  True  True  True False False
  True False False  True False  True  True  True  True False  True False
 False  True False False  True False  True False  True False False False
  True False False  True  True  True False  True False  True  True False
  True]


In [23]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

Acurácia: 0.72


In [24]:
from sklearn.feature_selection import SequentialFeatureSelector

model = RandomForestClassifier(max_depth=4, max_features='log2',
                                        min_samples_leaf=3, n_estimators=40)
sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', sfs),
    ('classifier', model)
])

pipeline.fit(X, y)

selected_features = pipeline.named_steps['feature_selector'].get_support()
print("Features selecionadas:", selected_features)

Features selecionadas: [ True  True  True False False  True False  True False  True False False
 False  True  True False False  True  True False False  True  True  True
  True False  True False  True  True  True  True  True  True  True  True
  True False False False  True False  True  True False False False False
 False  True False False  True False  True  True  True False False False
 False  True False  True  True False False False False  True  True  True
  True False False  True False False  True  True False False False False
 False]


In [25]:
from sklearn.feature_selection import SequentialFeatureSelector

model = GradientBoostingClassifier(max_depth=4, max_features='log2',
                                            min_samples_split=3,
                                            n_estimators=10)

sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', sfs),
    ('classifier', model)
])

pipeline.fit(X, y)

selected_features = pipeline.named_steps['feature_selector'].get_support()
print("Features selecionadas:", selected_features)

Features selecionadas: [ True  True False False  True  True False  True  True  True False False
 False False False  True False False False  True  True False False False
  True  True  True False  True False  True False  True  True  True False
 False  True False  True False  True False  True False  True False  True
  True  True False False False False False  True  True  True  True False
 False  True False False  True False  True False False False False False
  True False False False  True  True  True  True  True  True  True False
  True]


In [26]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

Acurácia: 0.74


In [27]:
param_grid = {
    'classifier': [SVC()],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__degree': [2, 3, 4],
}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

KeyboardInterrupt: 

In [None]:
model = GradientBoostingClassifier(max_depth=4, max_features='log2',
                                            min_samples_split=3,
                                            n_estimators=10)

sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward', n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', sfs),
    ('classifier', model)
])

pipeline.fit(X, y)

selected_features = pipeline.named_steps['feature_selector'].get_support()
print("Features selecionadas:", selected_features)

In [None]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')