In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the CSV into the working DataFrame `df`; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv('Electric_Vehicles.csv')

In [4]:
# Optimise memory by casting each feature to a fitting dtype: plain strings for
# the ID columns, categoricals for the label-like columns, and datetimes for
# the charging start/end timestamps.
column_dtypes = {
    'User ID': 'str',
    'Vehicle Model': 'category',
    'Charging Station ID': 'str',
    'Charging Station Location': 'category',
    'Charging Start Time': 'datetime64[ns]',
    'Charging End Time': 'datetime64[ns]',
    'Time of Day': 'category',
    'Day of Week': 'category',
    'Charger Type': 'category',
    'User Type': 'category',
}
df = df.astype(column_dtypes)

print(df.dtypes)


# Replace missing values in the selected numeric columns with that column's
# own median.
median_filled_columns = [
    'Energy Consumed (kWh)',
    'Charging Rate (kW)',
    'Distance Driven (since last charge) (km)',
]
column_medians = df[median_filled_columns].median()
df[median_filled_columns] = df[median_filled_columns].fillna(column_medians)
# df.isnull().sum()

User ID                                             object
Vehicle Model                                     category
Battery Capacity (kWh)                             float64
Charging Station ID                                 object
Charging Station Location                         category
Charging Start Time                         datetime64[ns]
Charging End Time                           datetime64[ns]
Energy Consumed (kWh)                              float64
Charging Duration (hours)                          float64
Charging Rate (kW)                                 float64
Charging Cost (USD)                                float64
Time of Day                                       category
Day of Week                                       category
State of Charge (Start %)                          float64
State of Charge (End %)                            float64
Distance Driven (since last charge) (km)           float64
Temperature (°C)                                   float

In [5]:
# A session is inconsistent when its starting state of charge is higher than
# its ending one. Rows where either value is missing compare as False and are
# therefore kept.
soc_decreased = df['State of Charge (Start %)'].gt(df['State of Charge (End %)'])
inconsistent_charge = df.loc[soc_decreased]

# Remove the flagged rows by index label; the remaining rows keep their labels.
df = df.drop(index=inconsistent_charge.index)
print("Remaining rows after removing inconsistencies: ", len(df))

Remaining rows after removing inconsistencies:  1052


In [6]:
# Show the column labels of the current frame as a quick reference for the
# cells that follow.
df.columns

Index(['User ID', 'Vehicle Model', 'Battery Capacity (kWh)',
       'Charging Station ID', 'Charging Station Location',
       'Charging Start Time', 'Charging End Time', 'Energy Consumed (kWh)',
       'Charging Duration (hours)', 'Charging Rate (kW)',
       'Charging Cost (USD)', 'Time of Day', 'Day of Week',
       'State of Charge (Start %)', 'State of Charge (End %)',
       'Distance Driven (since last charge) (km)', 'Temperature (°C)',
       'Vehicle Age (years)', 'Charger Type', 'User Type'],
      dtype='object')

In [7]:
# Flag sessions whose recorded duration disagrees with the time elapsed between
# the start and end timestamps.
#
# Both quantities are floats obtained in different ways (a timestamp difference
# divided by 3600 vs. a stored column), so an exact `!=` test reports a mismatch
# for any difference at all, including pure rounding noise. Compare within a
# tolerance instead so only real discrepancies are reported.
DURATION_TOLERANCE_HOURS = 1 / 60  # one minute; tune to the precision the analysis needs

elapsed_hours = (df['Charging End Time'] - df['Charging Start Time']).dt.total_seconds() / 3600
duration_gap_hours = (elapsed_hours - df['Charging Duration (hours)']).abs()

# Negating `<=` (instead of testing `>`) means a NaN gap -- i.e. a missing
# timestamp or duration -- is still flagged as a mismatch rather than silently
# passing the check.
mismatched_durations = df[~(duration_gap_hours <= DURATION_TOLERANCE_HOURS)]

# Display the number of mismatched rows and their details
print(f"Number of mismatched rows: {len(mismatched_durations)}")
print(mismatched_durations[['Charging Start Time', 'Charging End Time', 'Charging Duration (hours)']])

Number of mismatched rows: 1052
     Charging Start Time   Charging End Time  Charging Duration (hours)
0    2024-01-01 00:00:00 2024-01-01 00:39:00                   0.591363
1    2024-01-01 01:00:00 2024-01-01 03:01:00                   3.133652
2    2024-01-01 02:00:00 2024-01-01 04:48:00                   2.452653
3    2024-01-01 03:00:00 2024-01-01 06:42:00                   1.266431
4    2024-01-01 04:00:00 2024-01-01 05:46:00                   2.019765
...                  ...                 ...                        ...
1313 2024-02-24 17:00:00 2024-02-24 19:35:00                   2.365991
1314 2024-02-24 18:00:00 2024-02-24 19:58:00                   2.501809
1315 2024-02-24 19:00:00 2024-02-24 20:30:00                   1.426444
1316 2024-02-24 20:00:00 2024-02-24 20:44:00                   3.238212
1317 2024-02-24 21:00:00 2024-02-24 23:03:00                   3.267122

[1052 rows x 3 columns]
