In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the charging-session records from the CSV file (path is relative to
# the notebook's working directory).
df = pd.read_csv(
    'Electric_Vehicles.csv',
)

In [3]:
# Apply explicit dtypes: identifiers as strings, low-cardinality text columns
# as categoricals, and the two charging timestamps as datetimes.
identifier_cols = ['User ID', 'Charging Station ID']
categorical_cols = [
    'Vehicle Model',
    'Charging Station Location',
    'Time of Day',
    'Day of Week',
    'Charger Type',
    'User Type',
]
timestamp_cols = ['Charging Start Time', 'Charging End Time']

dtype_map = {
    **dict.fromkeys(identifier_cols, 'str'),
    **dict.fromkeys(categorical_cols, 'category'),
    **dict.fromkeys(timestamp_cols, 'datetime64[ns]'),
}
df = df.astype(dtype_map)

print(df.dtypes)


# Fill missing values in the numeric measurement columns with each column's
# median, then show the remaining null count per column as the cell output.
imputed_cols = [
    'Energy Consumed (kWh)',
    'Charging Rate (kW)',
    'Distance Driven (since last charge) (km)',
]
column_medians = df[imputed_cols].median()
df[imputed_cols] = df[imputed_cols].fillna(column_medians)
df.isnull().sum()

User ID                                             object
Vehicle Model                                     category
Battery Capacity (kWh)                             float64
Charging Station ID                                 object
Charging Station Location                         category
Charging Start Time                         datetime64[ns]
Charging End Time                           datetime64[ns]
Energy Consumed (kWh)                              float64
Charging Duration (hours)                          float64
Charging Rate (kW)                                 float64
Charging Cost (USD)                                float64
Time of Day                                       category
Day of Week                                       category
State of Charge (Start %)                          float64
State of Charge (End %)                            float64
Distance Driven (since last charge) (km)           float64
Temperature (°C)                                   float

User ID                                     0
Vehicle Model                               0
Battery Capacity (kWh)                      0
Charging Station ID                         0
Charging Station Location                   0
Charging Start Time                         0
Charging End Time                           0
Energy Consumed (kWh)                       0
Charging Duration (hours)                   0
Charging Rate (kW)                          0
Charging Cost (USD)                         0
Time of Day                                 0
Day of Week                                 0
State of Charge (Start %)                   0
State of Charge (End %)                     0
Distance Driven (since last charge) (km)    0
Temperature (°C)                            0
Vehicle Age (years)                         0
Charger Type                                0
User Type                                   0
dtype: int64

In [4]:
# Sessions whose starting state of charge is higher than the ending one.
starts_above_end = df['State of Charge (Start %)'] > df['State of Charge (End %)']
inconsistent_charge = df.loc[starts_above_end]
print("Inconsistent charges: ", len(inconsistent_charge))


# Station IDs that appear under more than one location (first 100 shown).
print("\nInconsistent stations:")
locations_per_station = df.groupby('Charging Station ID')['Charging Station Location'].nunique()
inconsistent_stations = locations_per_station.loc[locations_per_station > 1]
print(inconsistent_stations.head(100))


# Rows where a quantity that should be non-negative is below zero.
negative_energy = df['Energy Consumed (kWh)'] < 0
print("\n\nRows with Negative Energy Consumed:")
print(df.loc[negative_energy])

negative_distance = df['Distance Driven (since last charge) (km)'] < 0
print("\n\nRows with Negative Distance Driven:")
print(df.loc[negative_distance])

Inconsistent charges:  268

Inconsistent stations:
Charging Station ID
Station_1      3
Station_10     4
Station_100    2
Station_101    3
Station_103    4
              ..
Station_213    2
Station_214    3
Station_215    2
Station_216    4
Station_217    2
Name: Charging Station Location, Length: 100, dtype: int64


Rows with Negative Energy Consumed:
Empty DataFrame
Columns: [User ID, Vehicle Model, Battery Capacity (kWh), Charging Station ID, Charging Station Location, Charging Start Time, Charging End Time, Energy Consumed (kWh), Charging Duration (hours), Charging Rate (kW), Charging Cost (USD), Time of Day, Day of Week, State of Charge (Start %), State of Charge (End %), Distance Driven (since last charge) (km), Temperature (°C), Vehicle Age (years), Charger Type, User Type]
Index: []


Rows with Negative Distance Driven:
Empty DataFrame
Columns: [User ID, Vehicle Model, Battery Capacity (kWh), Charging Station ID, Charging Station Location, Charging Start Time, Charging End Time

In [5]:
# NOTE: this cell is currently disabled — every statement below is commented
# out, so running it has no effect on `df`. The disabled code one-hot encodes
# the categorical columns with pd.get_dummies and casts the resulting boolean
# columns to integers.
# df = pd.get_dummies(df, columns=['Vehicle Model', 'Charging Station Location', 'Time of Day', 'Day of Week', 'Charger Type', 'User Type'])

# # Convert only one-hot encoded columns to integers
# df = df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# print(df.dtypes)

In [11]:
# Count the number of unique charging station IDs seen in each location.
# 'Charging Station Location' is a categorical column, so `observed` is passed
# explicitly: pandas emits a FutureWarning when a categorical grouper relies on
# the default. observed=False keeps the existing behaviour of listing every
# category, including any with no rows.
station_count_by_location = (
    df.groupby('Charging Station Location', observed=False)['Charging Station ID']
    .nunique()
)

# Display the number of charging stations in each location
print(station_count_by_location)

Charging Station Location
Chicago          193
Houston          201
Los Angeles      219
New York         204
San Francisco    199
Name: Charging Station ID, dtype: int64


  station_count_by_location = df.groupby('Charging Station Location')['Charging Station ID'].nunique()


In [None]:
# Length of each charging session in hours, from the start/end timestamps.
session_length = df['Charging End Time'] - df['Charging Start Time']
df['Charging Hours'] = session_length.dt.total_seconds() / 3600

# Total charging hours accumulated under each station ID.
hours_by_station = df.groupby('Charging Station ID')['Charging Hours']
station_utilization = hours_by_station.sum()

# Express each station's hours as a percentage of a fixed capacity baseline:
# 24 hours x 7 days x the Houston station count.
# NOTE(review): this baseline is applied to every station ID regardless of
# location, and the window is hard-coded to 7 days — confirm this is the
# intended denominator for a per-station utilization rate.
stations_per_Houston = 201  # number of charging stations IDs in Houston
capacity_hours = 24 * stations_per_Houston * 7
station_utilization_rate = (station_utilization / capacity_hours) * 100

print(station_utilization_rate)

Charging Station ID
Station_1      0.024234
Station_10     0.053256
Station_100    0.014609
Station_101    0.022803
Station_103    0.033316
                 ...   
Station_95     0.001925
Station_96     0.014659
Station_97     0.038498
Station_98     0.024579
Station_99     0.010562
Name: Charging Hours, Length: 462, dtype: float64
