# Data exploration #2

### In this notebook we will transform the dataset based on our previous observations.

In [2]:
import tensorflow as tf
import matplotlib.pylab as plt
from plotly.offline import iplot
from scipy import stats
import seaborn as sns
import plotly.figure_factory as ff
import datetime
import numpy as np
import pandas as pd
import os
import sys
import plotly.plotly as py
from plotly import tools
import numpy as np
import cufflinks
from statsmodels.tsa.stattools import adfuller
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Switch plotly + cufflinks to offline (in-notebook) rendering.
# Each initialiser is invoked exactly once; repeating the calls adds nothing.
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Make the parent of the current working directory importable so that project
# packages living next to the notebook folder (e.g. `src`) can be imported.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder — confirm when running from elsewhere.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# Raw ride export, located under the directory held in `module_path`.
# Built with os.path.join instead of string concatenation so the separator
# is always the right one for the platform.
CSV_PATH = os.path.join(module_path, "data", "raw", "routes.csv")

# The file is tab-separated. The delimiter is spelled as the explicit escape
# '\t' so it stays visible and cannot be converted to spaces by an editor.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
dataset = pd.read_csv(CSV_PATH, sep='\t', low_memory=False)

# Convert the request timestamps from strings to datetimes, using an explicit
# format so every value is parsed the same way.
dataset['request_date'] = pd.to_datetime(
    dataset['request_date'], format='%Y-%m-%d %H:%M:%S')

## Because we want to predict ride volume, we must aggregate the rides over a fixed time window (the granularity).
### First, though, we must add a volume column to the dataset.

In [4]:
# 1 ride == 1 volume unit.
# Every row gets the constant 1 so that summing this column during the later
# aggregation step yields the ride volume of each time window.
dataset['ride_volume'] = 1

### We then drop the source, destination and passenger columns, as they will not be needed after our aggregation

In [5]:
# Per-ride detail columns that are removed before aggregating.
# They are deleted in place, one at a time and in this order, so a missing
# column raises KeyError at exactly the same point as separate `del` lines.
unneeded_columns = [
    'source_latitude',
    'source_longitude',
    'source_address',
    'destination_latitude',
    'destination_longitude',
    'destination_address',
    'passenger_id',
]
for column_name in unneeded_columns:
    del dataset[column_name]

## Before aggregating, we must make sure that the outliers shown in the previous notebook are not seasonal.

### A quick look at the outliers from the previous notebook makes it clear that some of them are due to seasonal circumstances (e.g. Dec 25th).

### Next, we must aggregate by the preferred granularity window and sum the ride volume

In [6]:
from src.preparation import csvInterface
from src.processing import dataProc


# Settings shared by both granularities.
TIMESTAMP_COLUMN = 'request_date'
OUTLIER_Z_THRESHOLD = 3

# 15-minute granularity: aggregate, then run the outlier detector on the
# aggregated ride volume.
# NOTE(review): z_detect_outliers presumably flags values by z-score against
# the given threshold — confirm in src.processing.dataProc.
dt_agg15min = dataProc.aggregate_by_mins(
    dataset=dataset,
    datetime_column=TIMESTAMP_COLUMN,
    minutes=15,
)
outliers_15min = dataProc.z_detect_outliers(
    values=dt_agg15min['ride_volume'],
    threshold=OUTLIER_Z_THRESHOLD,
)

# 1-hour granularity: same two steps.
dt_agg1hour = dataProc.aggregate_by_hours(
    dataset=dataset,
    datetime_column=TIMESTAMP_COLUMN,
    hours=1,
)
outliers_1h = dataProc.z_detect_outliers(
    values=dt_agg1hour['ride_volume'],
    threshold=OUTLIER_Z_THRESHOLD,
)



900


In [7]:
# Plot the hourly aggregate to inspect it visually.
# NOTE(review): .iplot() is presumably the plotting method cufflinks attaches
# to pandas objects — confirm against the cufflinks import at the top.
dt_agg1hour.iplot()

## We will check whether the hourly ride-volume time series is stationary:

In [9]:
# Rolling statistics over 12 consecutive rows of the hourly ride volume.
# A single rolling-window object feeds both the mean and the std column.
volume_window = dt_agg1hour['ride_volume'].rolling(12)
dt_agg1hour['rol_mean'] = volume_window.mean()
dt_agg1hour['rol_std'] = volume_window.std()

# Plot the frame again, now including the two rolling columns.
dt_agg1hour.iplot()


In [10]:
# Run the Dickey-Fuller test on the hourly ride volume and print a labelled report.
print('Results of Dickey-Fuller Test:')
dftest = adfuller(dt_agg1hour['ride_volume'], autolag='AIC')

# The first four entries of the result are the headline figures; the fifth is
# a mapping whose keys are formatted into the 'Critical Value (...)' labels.
summary_labels = [
    'Test Statistic',
    'p-value',
    '#Lags Used',
    'Number of Observations Used',
]
report = dict(zip(summary_labels, dftest[:4]))
report.update(
    ('Critical Value (%s)' % level, critical_value)
    for level, critical_value in dftest[4].items()
)

dfoutput = pd.Series(report)
print(dfoutput)

Results of Dickey-Fuller Test:
Test Statistic                   -4.824976
p-value                           0.000049
#Lags Used                       28.000000
Number of Observations Used    2876.000000
Critical Value (1%)              -3.432626
Critical Value (5%)              -2.862545
Critical Value (10%)             -2.567305
dtype: float64


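### Since the rolling std follows the rolling mean and the test statistic (-4.82) is below even the 1% critical value (-3.43), with a p-value of about 0.00005, we can reject the hypothesis of a unit root:
# Conclusion: The time series is stationary.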