In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Time series techniques for handling missing values
# https://www.kaggle.com/code/juejuewang/handle-missing-values-in-time-series-for-beginners

# https://pandas.pydata.org/pandas-docs/stable/missing_data.html

In [None]:
# Load the raw CSV with pandas defaults: no date parsing and no index column requested
df = pd.read_csv('data/vehicle_traffic.csv')

In [None]:
# Display the loaded DataFrame as this cell's output
df

In [None]:
# Inspect the index of df (no index_col was passed to the read_csv call above)
df.index

In [None]:
# Column names and their data types.
# No parse_dates argument was given when loading, so the date column is not
# converted to datetimes and is stored as strings (pandas' default behavior).
df.dtypes

In [None]:
# Reload the file, this time asking pandas to parse the TimeStamp column as
# datetimes and to use it as the index, which is the usual layout for
# time-series data.
# (Columns can also be given by position, e.g. parse_dates=[0], index_col=0.)
df = pd.read_csv(
    'data/vehicle_traffic.csv',
    index_col="TimeStamp",
    parse_dates=["TimeStamp"],
)

In [None]:
# Display df again to see the effect of reloading with a parsed TimeStamp index
df

In [None]:
# The TimeStamp values now form the index, and the index data type is datetime
df.index

In [None]:
# Data types of the remaining (non-index) columns, e.g. Vehicles
df.dtypes

In [None]:
# First five entries of the index
df.index[:5]

In [None]:
# With TimeStamp as the index, rows can be selected with a partial date string.
# For example, select all rows from the year 2018.
df.loc['2018']

In [None]:

# Or all rows from December 2018
df.loc['2018-12']

In [None]:
# Rows between two date-times given to hourly precision.
# Label-based slices include both endpoints.
df.loc['2018-12-04 02':'2018-12-04 03']

In [None]:
# Or the rows matching one specific date and hour
df.loc['2018-12-04 02']

In [None]:
# Or every row from a specific date and hour onwards (open-ended slice)
df.loc['2018-12-04 02':]

In [None]:
# Plot the data as a line chart
# x-axis: TimeStamp (the index)
# y-axis: Vehicles
df.plot()
# Alternative: df.plot(style='bo') draws blue circle markers instead of a line

In [None]:
# Earliest timestamp in the data
df.index.min()

In [None]:
# Values of the Vehicles column as a plain Python list, in index order
df['Vehicles'].tolist()

In [None]:
# Show the frame once more before resampling it
df

In [None]:
# The time series has gaps - we don't have data for 5AM and 6AM.
# Resample to a regular 1-hour frequency (taking the mean within each hour) so
# that every hour gets a row; hours with no observations show up as NaN.
df.resample('1h').mean()

In [None]:
# Keep the hourly-resampled frame for the missing-value examples that follow
df_resampled = df.resample('1h').mean()

In [None]:
# Display the resampled frame
df_resampled

In [None]:
# Now we can see the missing values: they appear as gaps in the plotted line
df_resampled.plot()

In [None]:
# Earliest timestamp after resampling
df_resampled.index.min()

In [None]:
# Values of the resampled Vehicles column as a plain Python list (missing hours are NaN)
df_resampled['Vehicles'].tolist()

<h2>Working With Missing Data</h2>
Let's see how we can handle the missing values  
We will look at forward fill, backfill, linear interpolation and quadratic interpolation  

https://pandas.pydata.org/pandas-docs/stable/missing_data.html

In [None]:
# Look at the resampled frame again before filling in the gaps
df_resampled

In [None]:
# Each strategy reads only the original 'Vehicles' column and writes its result
# to a new column, so the methods can be compared side by side.

# Forward fill: carry the last known value forward.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in recent pandas releases.
df_resampled['ffill'] = df_resampled['Vehicles'].ffill()
# Back fill: use the next known value.
df_resampled['bfill'] = df_resampled['Vehicles'].bfill()
# Interpolate missing values
# Linear (the default interpolation method, spelled out here for clarity)
df_resampled['linear'] = df_resampled['Vehicles'].interpolate(method='linear')
# Quadratic (pandas delegates this method to SciPy, which must be installed)
df_resampled['quadratic'] = df_resampled['Vehicles'].interpolate(method='quadratic')

In [None]:
# Compare the original Vehicles column with the filled/interpolated columns
df_resampled

In [None]:
# Plot the original Vehicles series on its own, then every column of
# df_resampled together for comparison
df_resampled['Vehicles'].plot()
df_resampled.plot()