# EDA (Varun's)
The goal of this notebook is to investigate the data. It is organized into the following steps:

0. Load data and imports
1. Define key metrics
2. Perform EDA
3. Determine what data is available
4. Visualize the data

# 0. Load data and imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.getcwd()


In [None]:
# Move from the notebook's folder up to the repository root so the relative
# 'data/ACN-API/...' paths used below resolve.  The guard makes this cell
# idempotent: re-running it without restarting the kernel no longer climbs one
# more directory each time.
# NOTE(review): assumes the notebook's own folder has no data/ACN-API directory.
if not os.path.isdir(os.path.join('data', 'ACN-API')):
    os.chdir('..')

In [None]:
# Build one dataframe per site by reading all parquet files in that site's
# directory (approach from https://stackoverflow.com/a/66104513).
# Alternative location noted by the original author: 'data/sessions/<site>/'.
df_ct, df_jpl, df_of = [
    pd.read_parquet(site_folder)
    for site_folder in (
        'data/ACN-API/caltech/',
        'data/ACN-API/jpl/',
        'data/ACN-API/office001/',
    )
]
print(f"Caltech Shape: {df_ct.shape}\nJPL Shape: {df_jpl.shape}\nOffice Shape: {df_of.shape}")

In [None]:
# Swap the existing index for a fresh 0..n-1 range (old index discarded).
df_ct = df_ct.reset_index(drop=True)
df_ct.head(n=5)

In [None]:
# Swap the existing index for a fresh 0..n-1 range (old index discarded).
df_jpl = df_jpl.reset_index(drop=True)
df_jpl.head(n=5)

In [None]:
# Swap the existing index for a fresh 0..n-1 range (old index discarded).
df_of = df_of.reset_index(drop=True)
df_of.head(n=5)

In [None]:
# List the distinct siteID value(s) present in each site's frame.
for site_label, site_df in (('caltech', df_ct), ('jpl', df_jpl), ('office', df_of)):
    print(f"site ID for {site_label}: {site_df['siteID'].unique()}")

# Convert User Inputs
The conversion approach is adapted from this Stack Overflow answer:

https://stackoverflow.com/a/41970572

In [None]:
def convert_userInputs(x):
    """Expand one raw ``userInputs`` cell into a flat ``pd.Series``.

    Parameters
    ----------
    x : str or None
        String form of a list holding one dictionary, for example
        ``"[{'userID': 333, 'milesRequested': 20, 'WhPerMile': 400, ...}]"``.
        Falsy (``None`` or ``''``) when the session has no user input.

    Returns
    -------
    pd.Series
        The first dictionary of the list as a Series (keys become the index).
        When ``x`` is falsy or the parsed list is empty, a Series with the
        eight known user-input fields all set to ``None``.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is not a plain Python literal.
    """
    # Local import keeps the function self-contained within its cell.
    import ast

    none_record = {'userID': None,
                   'milesRequested': None,
                   'WhPerMile': None,
                   'minutesAvailable': None,
                   'modifiedAt': None,
                   'paymentRequired': None,
                   'requestedDeparture': None,
                   'kWhRequested': None
                   }
    if not x:  # no user input recorded for this session
        return pd.Series(none_record)

    # literal_eval only accepts literals (lists, dicts, strings, numbers,
    # booleans, None), so a malformed or hostile string cannot execute code the
    # way the builtin eval() would.
    records = ast.literal_eval(x)
    if not records:  # "[]" -> treat like a missing entry instead of IndexError
        return pd.Series(none_record)
    return pd.Series(records[0])  # first (and expected only) dictionary
    
def userinput_processing(df):
    """Replace the ``userInputs`` column with one column per user-input field.

    Each ``userInputs`` value is expanded with ``convert_userInputs`` and the
    resulting columns are appended to the remaining columns of ``df``.  When
    ``df`` has no ``userInputs`` column a message is printed and ``df`` is
    returned unchanged.
    """
    if 'userInputs' in df.columns:
        other_columns = df.drop(columns='userInputs')
        expanded_inputs = df['userInputs'].apply(convert_userInputs)
        return pd.concat([other_columns, expanded_inputs], axis=1)

    print(f'the column userInputs was not found in the dataframes columns. userinput processing skipped.')
    return df
    

In [None]:
# Expand the userInputs column of every site's frame.
df_ct, df_jpl, df_of = [
    userinput_processing(site_df) for site_df in (df_ct, df_jpl, df_of)
]


# Convert times
Parsing the source timestamps raised time-zone errors, so every timestamp is converted to UTC as a workaround. This is a stopgap: the hour and day values derived below are therefore in UTC, not local time.

In [None]:
def datetime_processing(df):
    """Parse the three session timestamps and derive hour/day columns.

    For each of ``connectionTime``, ``disconnectTime`` and ``doneChargingTime``
    the column is converted to timezone-aware UTC datetimes (values that cannot
    be parsed become ``NaT``), and ``<column>Hour`` / ``<column>Day`` columns
    are added holding the UTC hour of day and day of month.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the three timestamp columns.  Modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    # `infer_datetime_format` is deliberately not passed: it is deprecated in
    # pandas 2.x (format inference is the default) and only triggers a warning.
    for time_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
        df[time_col] = pd.to_datetime(df[time_col], utc=True, errors='coerce')
        df[f'{time_col}Hour'] = df[time_col].dt.hour
        df[f'{time_col}Day'] = df[time_col].dt.day
    return df

In [None]:
# Parse timestamps and add the hour/day helper columns for every site.
df_ct, df_jpl, df_of = [
    datetime_processing(site_df) for site_df in (df_ct, df_jpl, df_of)
]

In [None]:
# Preview the Caltech frame after processing.
df_ct.head(n=5)

In [None]:
# Preview the JPL frame after processing.
df_jpl.head(n=5)

In [None]:
# Preview the office001 frame after processing.
df_of.head(n=5)

In [None]:
# Hour-of-day distribution of connection times, one panel per site.
# Hours come from datetime_processing and are therefore in UTC.
#
# Bin edges sit on the half-hours so each bar covers exactly one integer hour
# and is centred on its tick.  Passing a bare bin count of 24 instead splits
# the observed min..max range into 24 equal bins, which are slightly narrower
# than one hour and drift away from the integer ticks.
hour_edges = np.arange(25) - 0.5
site_panels = [
    ('Caltech', df_ct, 'red'),
    ('JPL', df_jpl, 'tan'),
    ('Office001', df_of, 'lime'),
]
fig, axes = plt.subplots(figsize=(8, 10), nrows=3, ncols=1)
for ax, (site_name, site_df, site_color) in zip(axes, site_panels):
    # Timestamps that failed to parse are NaT, so their hour is NaN; drop them.
    ax.hist(site_df['connectionTimeHour'].dropna(), bins=hour_edges,
            histtype='bar', color=site_color)
    ax.set_title(f'{site_name} Hourly Connection Time Distribution')
    ax.xaxis.set_ticks(np.arange(0, 24, 1))
    ax.set_xlabel("Hour of Day")
    ax.set_ylabel("Number of Connections")
fig.tight_layout()
plt.show()

In [None]:
# Day-of-week distribution of connection times, one panel per site.
#
# Sessions are counted per weekday name and drawn as bars in a fixed
# Monday..Sunday order.  Feeding the day-name strings straight to hist() leaves
# the category order to the order in which names appear in the data, so the
# three panels could each show the weekdays in a different sequence.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
site_panels = [
    ('Caltech', df_ct, 'red'),
    ('JPL', df_jpl, 'tan'),
    ('Office001', df_of, 'lime'),
]
fig, axes = plt.subplots(figsize=(8, 10), nrows=3, ncols=1)
for ax, (site_name, site_df, site_color) in zip(axes, site_panels):
    weekday_counts = (
        site_df['connectionTime'].dt.day_name()
        .value_counts()                          # missing timestamps are not counted
        .reindex(weekday_order, fill_value=0)    # fixed order; absent days show as 0
    )
    ax.bar(weekday_counts.index, weekday_counts.values, color=site_color)
    ax.set_title(f'{site_name} Weekly Connection Time Distribution')
    ax.set_xlabel("Day of Week")
    ax.set_ylabel("Number of Connections")
fig.tight_layout()
plt.show()

In [None]:
# Day-of-month distribution of connection times, one panel per site.
#
# Days of the month run 1..31, so both the ticks and the bin edges start at 1
# (ticks at 0..30 label a non-existent day 0 and omit day 31).  Edges sit on
# the half-days so every bar covers exactly one day and is centred on its tick.
day_edges = np.arange(1, 33) - 0.5
site_panels = [
    ('Caltech', df_ct, 'red'),
    ('JPL', df_jpl, 'tan'),
    ('Office001', df_of, 'lime'),
]
fig, axes = plt.subplots(figsize=(8, 10), nrows=3, ncols=1)
for ax, (site_name, site_df, site_color) in zip(axes, site_panels):
    # Timestamps that failed to parse are NaT, so their day is NaN; drop them.
    ax.hist(site_df['connectionTimeDay'].dropna(), bins=day_edges,
            histtype='bar', color=site_color)
    ax.set_title(f'{site_name} Monthly Connection Time Distribution')
    ax.xaxis.set_ticks(np.arange(1, 32, 1))
    ax.set_xlabel("Day")
    ax.set_ylabel("Number of Connections")
fig.tight_layout()
plt.show()

In [None]:
# Month-of-year distribution of connection times, one panel per site.
#
# Months run 1..12, so both the ticks and the bin edges start at 1 (ticks at
# 0..11 label a non-existent month 0 and omit December).  Edges sit on the
# half-months so every bar covers exactly one month and is centred on its tick.
month_edges = np.arange(1, 14) - 0.5
site_panels = [
    ('Caltech', df_ct, 'red'),
    ('JPL', df_jpl, 'tan'),
    ('Office001', df_of, 'lime'),
]
fig, axes = plt.subplots(figsize=(8, 10), nrows=3, ncols=1)
for ax, (site_name, site_df, site_color) in zip(axes, site_panels):
    # Missing timestamps give a NaN month; drop them before binning.
    ax.hist(site_df['connectionTime'].dt.month.dropna(), bins=month_edges,
            histtype='bar', color=site_color)
    ax.set_title(f'{site_name} Yearly Connection Time Distribution')
    ax.xaxis.set_ticks(np.arange(1, 13, 1))
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of Connections")
fig.tight_layout()
plt.show()

In [None]:
# Total number of rows across the three sites.
sum(len(site_df) for site_df in (df_ct, df_jpl, df_of))

In [None]:
# Earliest connectionTime across the three sites.
min(site_df['connectionTime'].min() for site_df in (df_ct, df_jpl, df_of))

In [None]:
# Latest connectionTime across the three sites.
max(site_df['connectionTime'].max() for site_df in (df_ct, df_jpl, df_of))

In [None]:
# df_ct['connectionTime'].describe(datetime_is_numeric=True)

In [None]:
# df_jpl['connectionTime'].describe(datetime_is_numeric=True)

In [None]:
# df_of['connectionTime'].describe(datetime_is_numeric=True)

# Number of Charges per User

In [None]:
# Number of rows per userID at Caltech, most frequent first.
df_ct['userID'].value_counts()

In [None]:
# Number of rows per userID at JPL, most frequent first.
df_jpl['userID'].value_counts()

In [None]:
# Number of rows per userID at office001, most frequent first.
df_of['userID'].value_counts()

In [None]:
# userID values that occur at all three sites.
ct_users, jpl_users, of_users = (set(site_df['userID']) for site_df in (df_ct, df_jpl, df_of))
print(ct_users & jpl_users & of_users)

# Distribution of Charges by Location

In [None]:
# Row counts for each (stationID, spaceID) pair at Caltech.
df_ct.groupby('stationID')['spaceID'].value_counts()

In [None]:
# Bar chart of row counts per (stationID, spaceID) pair at Caltech.
df_ct.groupby('stationID')['spaceID'].value_counts().plot.bar(figsize=(10, 5))

In [None]:
# Row counts for each (stationID, spaceID) pair at JPL.
df_jpl.groupby('stationID')['spaceID'].value_counts()

In [None]:
# Bar chart of row counts per (stationID, spaceID) pair at JPL.
df_jpl.groupby('stationID')['spaceID'].value_counts().plot.bar(figsize=(10, 5))

In [None]:
# Row counts for each (stationID, spaceID) pair at office001.
df_of.groupby('stationID')['spaceID'].value_counts()

In [None]:
# Bar chart of row counts per (stationID, spaceID) pair at office001.
df_of.groupby('stationID')['spaceID'].value_counts().plot.bar(figsize=(10, 5))

In [None]:
# A notebook cell only renders its last expression, so the shape and column
# labels are printed explicitly; as bare expressions they were evaluated and
# silently discarded.
print(df_of.shape)
print(df_of.columns)
df_of.head()

# Is_available
This section determines, for each hour of a time period, whether a charger was available (1) or occupied by a session (0).

In [None]:
# Render the office001 frame as the cell output.
df_of

In [None]:
# Office001 sessions at space '02' whose connectionTime falls between
# 2019-03-20 and 2019-04-01, indexed and sorted by connectionTime.
tmp = (
    df_of
    .set_index('connectionTime')
    .loc[lambda sessions: sessions['spaceID'] == '02']
    .sort_index()
    .loc['2019-03-20':'2019-04-01', :]
)
tmp

In [None]:
# Hourly grid from 2019-03-25 through 2019-04-02 (both ends included); every
# hour starts out marked as available, with no session attached.
hourly_index = pd.date_range('2019-03-25', '2019-04-02', freq='h', inclusive='both', tz=0)
y = pd.DataFrame(index=hourly_index, columns=['is_available', 'sessionID'])
y['is_available'] = 1

In [None]:
# y['is_available']
# Trial run on a single session: mark the hours covered by the first row of
# tmp as unavailable in y.
# tmp is indexed by connectionTime, so its first index label is that session's start.
start_ = tmp.index[0]
# Look up the same row's disconnectTime and sessionID by that label.
end_ = tmp.loc[start_,'disconnectTime'] 
session_ = tmp.loc[start_,'sessionID']
print(start_, end_)
# NOTE(review): the result of this date_range call is not assigned or used;
# it looks like a leftover from experimenting — confirm it can be removed.
pd.date_range(start_, end_, inclusive='both', freq='h', tz=0)
# Label-slice y from start_ to end_ and set is_available to 0 and sessionID
# to this session's id on those rows.
y.loc[start_:end_,['is_available','sessionID']] = 0, session_

In [None]:
# Mark every hour covered by a session in tmp as unavailable and record which
# session occupied it.
#
# Rows are walked positionally (index label, disconnectTime and sessionID
# zipped together) rather than looked up again with tmp.loc[start_, ...]:
# a label lookup returns a Series instead of a scalar whenever two sessions
# share the same connectionTime, which breaks the slice below.
for start_, end_, session_ in zip(tmp.index, tmp['disconnectTime'], tmp['sessionID']):
    print(start_,'\t', end_,'\t', session_)
    # Label-based slice: covers the hourly rows of y between start_ and end_.
    y.loc[start_:end_, ['is_available', 'sessionID']] = 0, session_

In [None]:
# NOTE(review): the filtered frame on the next line (hours marked unavailable)
# is computed but not shown — a cell renders only its last expression, which
# here is the full y. Confirm which of the two was meant to be displayed.
y[y['is_available'] == 0]
y


In [None]:
# Availability flag of space 02 over the hourly grid.
fig, ax = plt.subplots()
ax.plot(y.index, y.is_available)
ax.set_title('spaceid 02 availability in late march, 2019')
ax.set_ylabel('available?')

In [None]:
# Availability line with the hours of each session overlaid as markers,
# one legend entry per sessionID.
fig, ax = plt.subplots()
ax.set_title('Sessions at space 02')
ax.plot(y.index, y.is_available)
for session_id, session_hours in y.groupby('sessionID'):
    ax.plot(session_hours.index, session_hours['is_available'],
            marker='o', linestyle='', markersize=8, label=session_id)

plt.xticks(rotation=75)
ax.set_ylabel('available?')
ax.legend()

# Now by spaces

# Try spaceID as a wide table

In [None]:
# Calendar date of the latest disconnectTime at office001.
df_of['disconnectTime'].max().date()

In [None]:
# All office001 sessions with connectionTime between start_date and end_date,
# indexed and sorted by connectionTime.
start_date = '2019-03-20'
end_date = '2021-09-14'
tmp = df_of.set_index('connectionTime').sort_index().loc[start_date:end_date, :]

In [None]:
# Wide availability table: one row per hour between start_date and end_date,
# one column per distinct spaceID (as strings); every cell starts at 1.
space_cols = list(tmp['spaceID'].unique().astype('str'))

hourly_index = pd.date_range(start_date, end_date, freq='h', inclusive='both', tz=0)
y = pd.DataFrame(index=hourly_index, columns=space_cols)
y[space_cols] = 1


In [None]:
# Preview the first rows of the wide availability table.
y.head(n=5)

In [None]:
# For every session, mark the hours it covers as unavailable (0) in the column
# of the space it occupied.
#
# Rows are walked positionally (zip over the index and the needed columns)
# rather than looked up again with tmp.loc[start_, ...]: with all spaces in one
# frame two sessions can share a connectionTime, and a label lookup then
# returns a Series instead of a scalar, which breaks the slice below.
for start_, end_, session_, space_ in zip(
        tmp.index, tmp['disconnectTime'], tmp['sessionID'], tmp['spaceID']):
    # y's column labels were built with astype('str'); use the same form so the
    # assignment hits the existing column instead of creating a new one.
    space_ = str(space_)
    print(start_,'\t', end_,'\t', session_, '\t', space_)
    y.loc[start_:end_, space_] = 0

In [None]:
# One availability line per space, drawn in sorted spaceID order.
fig, ax = plt.subplots(figsize=(12, 4))
for space_id in sorted(space_cols):
    ax.plot(y.index, y[space_id], label=f'space {space_id}')
ax.legend()


In [None]:
# Fraction of spaces available in each hour.
# The mean is restricted to the per-space columns so that re-running this cell
# (or running it after percent_full has been added) does not average the
# derived percent_* columns back into the result.
y['percent_available'] = y[space_cols].mean(axis=1)
y

In [None]:
# Fraction of available spaces over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Charger availability in the office001')
ax.plot(y.index, y.percent_available)
ax.set_xlabel('date')
plt.xticks(rotation=80)
ax.set_ylabel('fraction of available spaces')
ax.set_ylim([0, 1.1])

In [None]:
# Fraction of occupied spaces: the complement of percent_available.
y['percent_full'] = 1 - y['percent_available']
fig, ax = plt.subplots()
ax.plot(y.index, y.percent_full, c='r')
ax.set_title('the fullness of the lot')
ax.set_ylim([0, 1.05])

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
def holiday_processing(df):
    """Flag sessions whose connection date is a US federal holiday.

    Adds a boolean ``is_holiday`` column to ``df`` (modified in place) and
    returns ``df``.

    The holiday calendar yields midnight, timezone-naive timestamps, so each
    ``connectionTime`` is first reduced to its calendar date (time of day and
    timezone dropped) before the membership test.  Testing the raw timestamps
    against the holiday list would not match a session that started at any
    time other than exactly midnight.

    The calendar date is taken in the column's own timezone, which is UTC when
    the column was produced by ``datetime_processing``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime ``connectionTime`` column.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the ``is_holiday`` column added.
    """
    assert 'connectionTime' in df.columns
    connection_days = df['connectionTime'].dt.normalize()  # midnight of each date
    if connection_days.dt.tz is not None:
        # Drop the timezone so the values are comparable with the naive holiday dates.
        connection_days = connection_days.dt.tz_localize(None)
    cal = calendar()
    holidays = cal.holidays(start=connection_days.min(), end=connection_days.max())
    df['is_holiday'] = connection_days.isin(holidays)
    return df

# 1. Define key metrics
Total charges | 66393
What is the date range of charges | 5-1-2018 to 9-13-2021
How many unique cars are there?
What are the locations of charging
What is the distribution of charges over time
What is the distribution of charges over the hours of a day
What is the distribution of charges by vehicle
What is the distribution of charges over location
Is the rate of charging the same for a vehicle
Does the energy of daily charging change over time
Do the number of charges change over time
What's the probability of being available for each charger?
What's the multinomial distribution of charger availability for each location (i.e., what's the probability that 0 are full, 1 is full, 2 are full, etc.)? Assume a Bernoulli random trial for each location.