In [139]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from math import ceil
from scipy.stats import zscore

In [140]:
# Load the dataset into the working frame `df`; the path is relative to the
# directory the notebook kernel is started from.
df = pd.read_csv('Electric_Vehicles.csv')

In [105]:
# Show the last five rows as a quick sanity check of the loaded data.
df.tail(5)

Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
1315,User_1316,Nissan Leaf,100.0,Station_57,New York,2024-02-24 19:00:00,2024-02-24 20:30:00,42.011654,1.426444,5.895475,22.081164,Evening,Sunday,39.204102,83.915952,239.601075,1.919655,7.0,DC Fast Charger,Commuter
1316,User_1317,BMW i3,100.0,Station_40,New York,2024-02-24 20:00:00,2024-02-24 20:44:00,68.185853,3.238212,18.388012,5.067806,Evening,Tuesday,31.456375,93.096461,164.376022,34.029775,4.0,Level 2,Casual Driver
1317,User_1318,Nissan Leaf,100.0,Station_374,New York,2024-02-24 21:00:00,2024-02-24 23:03:00,18.895102,3.267122,45.482066,37.255002,Evening,Tuesday,71.903081,78.678879,226.519258,20.358761,5.0,DC Fast Charger,Commuter
1318,User_1319,Chevy Bolt,85.0,Station_336,San Francisco,2024-02-24 22:00:00,2024-02-24 23:20:00,13.756252,2.754527,38.148183,39.046146,Afternoon,Sunday,76.187997,65.926573,291.494076,24.134598,5.0,Level 2,Commuter
1319,User_1320,Nissan Leaf,120.447195,Station_128,Los Angeles,2024-02-24 23:00:00,2024-02-24 23:56:00,63.65257,3.74097,33.704226,10.863674,Evening,Monday,59.338076,56.692439,14.449236,-6.966593,5.0,DC Fast Charger,Commuter


In [106]:
# Summarise each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1320 entries, 0 to 1319
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   User ID                                   1320 non-null   object 
 1   Vehicle Model                             1320 non-null   object 
 2   Battery Capacity (kWh)                    1320 non-null   float64
 3   Charging Station ID                       1320 non-null   object 
 4   Charging Station Location                 1320 non-null   object 
 5   Charging Start Time                       1320 non-null   object 
 6   Charging End Time                         1320 non-null   object 
 7   Energy Consumed (kWh)                     1254 non-null   float64
 8   Charging Duration (hours)                 1320 non-null   float64
 9   Charging Rate (kW)                        1254 non-null   float64
 10  Charging Cost (USD)                 

### Data Processing

Besides the float64 columns, many columns were loaded with the generic object dtype. Converting them to more specific dtypes (e.g. category, datetime64) uses less memory and can improve model performance compared to leaving them as object.

In [107]:
# Group the columns by the dtype they should end up with, then build one
# column -> dtype mapping and apply it in a single astype call.
identifier_cols = ['User ID', 'Charging Station ID']
timestamp_cols = ['Charging Start Time', 'Charging End Time']
categorical_cols = [
    'Vehicle Model',
    'Charging Station Location',
    'Time of Day',
    'Day of Week',
    'Charger Type',
    'User Type',
]

target_dtypes = {
    **dict.fromkeys(identifier_cols, 'str'),
    **dict.fromkeys(timestamp_cols, 'datetime64[ns]'),
    **dict.fromkeys(categorical_cols, 'category'),
}
df = df.astype(target_dtypes)

print(df.dtypes)

User ID                                             object
Vehicle Model                                     category
Battery Capacity (kWh)                             float64
Charging Station ID                                 object
Charging Station Location                         category
Charging Start Time                         datetime64[ns]
Charging End Time                           datetime64[ns]
Energy Consumed (kWh)                              float64
Charging Duration (hours)                          float64
Charging Rate (kW)                                 float64
Charging Cost (USD)                                float64
Time of Day                                       category
Day of Week                                       category
State of Charge (Start %)                          float64
State of Charge (End %)                            float64
Distance Driven (since last charge) (km)           float64
Temperature (°C)                                   float

Potentially need to do target encoding later on. 

In [108]:
# Count the missing values in every column.
df.isna().sum()

User ID                                      0
Vehicle Model                                0
Battery Capacity (kWh)                       0
Charging Station ID                          0
Charging Station Location                    0
Charging Start Time                          0
Charging End Time                            0
Energy Consumed (kWh)                       66
Charging Duration (hours)                    0
Charging Rate (kW)                          66
Charging Cost (USD)                          0
Time of Day                                  0
Day of Week                                  0
State of Charge (Start %)                    0
State of Charge (End %)                      0
Distance Driven (since last charge) (km)    66
Temperature (°C)                             0
Vehicle Age (years)                          0
Charger Type                                 0
User Type                                    0
dtype: int64

In [109]:
# Fill the gaps in these three numeric columns with each column's own median;
# fillna matches the median Series to the columns by label.
median_fill_cols = [
    'Energy Consumed (kWh)',
    'Charging Rate (kW)',
    'Distance Driven (since last charge) (km)',
]
column_medians = df[median_fill_cols].median()
df[median_fill_cols] = df[median_fill_cols].fillna(column_medians)

In [110]:
# Re-count the missing values in every column.
df.isna().sum()

User ID                                     0
Vehicle Model                               0
Battery Capacity (kWh)                      0
Charging Station ID                         0
Charging Station Location                   0
Charging Start Time                         0
Charging End Time                           0
Energy Consumed (kWh)                       0
Charging Duration (hours)                   0
Charging Rate (kW)                          0
Charging Cost (USD)                         0
Time of Day                                 0
Day of Week                                 0
State of Charge (Start %)                   0
State of Charge (End %)                     0
Distance Driven (since last charge) (km)    0
Temperature (°C)                            0
Vehicle Age (years)                         0
Charger Type                                0
User Type                                   0
dtype: int64

All missing values have been filled with the median of their respective column.

In [111]:
# Print the dtype of every column.
print(df.dtypes)

User ID                                             object
Vehicle Model                                     category
Battery Capacity (kWh)                             float64
Charging Station ID                                 object
Charging Station Location                         category
Charging Start Time                         datetime64[ns]
Charging End Time                           datetime64[ns]
Energy Consumed (kWh)                              float64
Charging Duration (hours)                          float64
Charging Rate (kW)                                 float64
Charging Cost (USD)                                float64
Time of Day                                       category
Day of Week                                       category
State of Charge (Start %)                          float64
State of Charge (End %)                            float64
Distance Driven (since last charge) (km)           float64
Temperature (°C)                                   float

In [112]:
# Print the distinct values of every column so that inconsistent labels or
# formats within a column are easy to spot.
for column_name, column_values in df.items():
    print(column_values.unique())

['User_1' 'User_2' 'User_3' ... 'User_1318' 'User_1319' 'User_1320']
['BMW i3', 'Hyundai Kona', 'Chevy Bolt', 'Nissan Leaf', 'Tesla Model 3']
Categories (5, object): ['BMW i3', 'Chevy Bolt', 'Hyundai Kona', 'Nissan Leaf', 'Tesla Model 3']
[108.46300741 100.          75.          50.          85.
  62.          97.68181223  79.79920376  48.79648264  76.87610038
  69.88407425 124.31591139  59.83272651  48.05069473  24.59992215
  77.29894529  95.15365777 147.39535434  63.09631665  93.27378067
  48.46959856  93.09486955  50.87061708  58.0121307  102.94949428
  95.69778087 102.83943001  89.03206383  81.65318205  46.5379115
  27.03325783 120.51063277  94.82402915  45.38021848 143.47520974
  10.18928677  78.38373536  65.04709891  33.37702357  59.79413925
   6.16889584  45.62653203  77.60847805  48.63693125  60.78928025
  71.64548707  46.87679655  69.44504561   3.9765965    1.53280653
  67.54202075 127.28027404 107.43195455  52.33224826  39.29742253
 129.52415942   3.83851807  78.84320421 104.

In [113]:
# Logical-consistency checks on the charging sessions.

# Flag sessions whose starting state of charge is higher than the ending one.
soc_dropped = df['State of Charge (Start %)'].gt(df['State of Charge (End %)'])
inconsistent_charge = df.loc[soc_dropped]
print("Inconsistent Charge:")
print("There are", len(inconsistent_charge), "instances of inconsistent charges where \nstarting charge is > ending charge somehow.")


# Flag station IDs that are recorded under more than one location.
print("Inconsistent Stations:")
locations_per_station = (
    df.groupby('Charging Station ID')['Charging Station Location'].nunique()
)
inconsistent_stations = locations_per_station.loc[locations_per_station > 1]
print('\n')
print(inconsistent_stations)


# Display the rows of one station ID as a concrete example.
print('\n')
df.loc[df['Charging Station ID'] == 'Station_1']


Inconsistent Charge:
There are 268 instances of inconsistent charges where 
starting charge is > ending charge somehow.
Inconsistent Stations:


Charging Station ID
Station_1      3
Station_10     4
Station_100    2
Station_101    3
Station_103    4
              ..
Station_93     3
Station_96     2
Station_97     5
Station_98     2
Station_99     2
Name: Charging Station Location, Length: 335, dtype: int64




Unnamed: 0,User ID,Vehicle Model,Battery Capacity (kWh),Charging Station ID,Charging Station Location,Charging Start Time,Charging End Time,Energy Consumed (kWh),Charging Duration (hours),Charging Rate (kW),Charging Cost (USD),Time of Day,Day of Week,State of Charge (Start %),State of Charge (End %),Distance Driven (since last charge) (km),Temperature (°C),Vehicle Age (years),Charger Type,User Type
88,User_89,BMW i3,85.0,Station_1,Los Angeles,2024-01-04 16:00:00,2024-01-04 17:56:00,33.07295,1.46966,30.344748,15.550838,Evening,Wednesday,105.032224,83.023473,164.682892,-6.514432,1.0,Level 2,Long-Distance Traveler
793,User_794,Nissan Leaf,75.0,Station_1,Chicago,2024-02-03 01:00:00,2024-02-03 03:40:00,74.826446,1.398582,43.133283,26.218718,Evening,Sunday,23.725996,53.813699,238.743561,30.955969,3.04357,Level 2,Long-Distance Traveler
1171,User_1172,Nissan Leaf,50.0,Station_1,New York,2024-02-18 19:00:00,2024-02-18 22:35:00,45.364578,0.56411,25.04146,33.712647,Afternoon,Thursday,39.082326,91.733706,109.931915,37.330822,5.0,DC Fast Charger,Commuter


In [114]:
# Scan selected numeric columns for values below zero and print the
# offending rows under a heading for each column.
negative_checks = [
    ("\n\nRows with Negative Energy Consumed:", 'Energy Consumed (kWh)'),
    ("\n\nRows with Negative Temperature:", 'Temperature (°C)'),
    ("\n\nRows with Negative Distance Driven:", 'Distance Driven (since last charge) (km)'),
]
for heading, column in negative_checks:
    print(heading)
    print(df[df[column] < 0])



Rows with Negative Energy Consumed:
Empty DataFrame
Columns: [User ID, Vehicle Model, Battery Capacity (kWh), Charging Station ID, Charging Station Location, Charging Start Time, Charging End Time, Energy Consumed (kWh), Charging Duration (hours), Charging Rate (kW), Charging Cost (USD), Time of Day, Day of Week, State of Charge (Start %), State of Charge (End %), Distance Driven (since last charge) (km), Temperature (°C), Vehicle Age (years), Charger Type, User Type]
Index: []


Rows with Negative Temperature:
        User ID  Vehicle Model  Battery Capacity (kWh) Charging Station ID  \
4        User_5   Hyundai Kona               50.000000         Station_108   
5        User_6    Nissan Leaf               50.000000         Station_335   
7        User_8     Chevy Bolt               75.000000         Station_302   
15      User_16    Nissan Leaf              100.000000         Station_147   
21      User_22   Hyundai Kona               62.000000         Station_485   
...         .

No rows have a negative energy consumed or a negative distance driven. Negative temperatures do occur, which is plausible for readings in °C.



### Check for outliers.

In [None]:
# Preview the first rows of the frame before looking for outliers.
df.head()

In [123]:
# List the column labels for reference.
print(df.columns)

Index(['User ID', 'Vehicle Model', 'Battery Capacity (kWh)',
       'Charging Station ID', 'Charging Station Location',
       'Charging Start Time', 'Charging End Time', 'Energy Consumed (kWh)',
       'Charging Duration (hours)', 'Charging Rate (kW)',
       'Charging Cost (USD)', 'Time of Day', 'Day of Week',
       'State of Charge (Start %)', 'State of Charge (End %)',
       'Distance Driven (since last charge) (km)', 'Temperature (°C)',
       'Vehicle Age (years)', 'Charger Type', 'User Type'],
      dtype='object')


In [146]:
# Count how many rows each User ID appears in.
df['User ID'].value_counts()

User ID
User_1320    1
User_1       1
User_2       1
User_3       1
User_4       1
            ..
User_13      1
User_12      1
User_11      1
User_10      1
User_9       1
Name: count, Length: 1320, dtype: int64

### Feature engineering. 

- charging cost per kWh
- how much charge is actually added during a session --> identify inefficiencies / charging behaviour
- average energy consumed per charging station / frequency of charges per station
- isWeekend --> compare weekend vs. weekday behaviour
