# Data exploration #1

### In this notebook we perform a preliminary statistical exploration of the dataset, in order to get a better sense of how the problem should be formulated and to identify any outliers or extreme scenarios that we should take care of.

In [9]:
import tensorflow as tf
import datetime
import numpy as np
import pandas as pd
import os
import sys
from scipy import stats
import plotly.plotly as py
from plotly import tools
import numpy as np
import cufflinks
import plotly.figure_factory as ff
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
import numpy as np
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Switch plotly and cufflinks to offline (in-notebook) rendering.
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Make the parent directory importable: it is added to sys.path once, so that
# packages located there can be imported from later cells.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Location of the raw routes file, resolved against the parent directory that
# was put on sys.path. os.path.join avoids hard-coding the path separator.
CSV_PATH = os.path.join(module_path, "data", "raw", "routes.csv")

# The file is tab-separated. The separator is spelled as the '\t' escape
# because a literal tab character in source code is invisible and is easily
# converted to spaces by editors and formatters.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
dataset = pd.read_csv(CSV_PATH, sep='\t', low_memory=False)

### Next, let's extract several series of varying granularity to get a sense of the mean and the standard deviation of the ride volume per week, day, hour and 15 minutes.

### "Quick and Dirty" Aggregation:

In [4]:
from src.processing import dataProc

# Give every row a constant ride count of 1.
# NOTE(review): presumably the aggregation helpers sum this column per time
# bucket to obtain the ride volume -- confirm against src.processing.dataProc.
dataset['ride_volume'] = 1

# Keyword arguments common to every aggregation call below.
shared_args = dict(dataset=dataset, datetime_column='request_date')

# One aggregated frame per granularity: 15 minutes, hour, day and week.
quart_sum = dataProc.aggregate_by_mins(minutes=15, **shared_args)
hour_sum = dataProc.aggregate_by_hours(hours=1, **shared_args)
day_sum = dataProc.aggregate_by_hours(hours=24, **shared_args)
week_sum = dataProc.aggregate_by_hours(hours=24 * 7, **shared_args)

900


# Now that we have the sums at several granularities, we can start checking for outliers:

In [5]:
def plot_outlier_boxes(data_array):
    """Build a figure with four box plots of ``data_array['ride_volume']``.

    The same values are drawn four times, each trace using a different
    plotly ``boxpoints`` setting ('all', False, 'suspectedoutliers' and
    'outliers') so the outlier renderings can be compared side by side.

    Parameters
    ----------
    data_array : object with a ``'ride_volume'`` column and an ``index``
        The frame to plot (for example one of the aggregated frames). Its
        index values are attached to the points as hover text.

    Returns
    -------
    plotly.graph_objs.Figure
        The assembled figure; the caller is responsible for displaying it.
    """
    volumes = data_array['ride_volume']
    # Hover labels are taken from the frame that is actually being plotted.
    # Reading them from a notebook-level global would mislabel the points (or
    # fail outright) whenever a different frame is passed in.
    txt = data_array.index.tolist()

    trace0 = go.Box(
        y=volumes,
        name="All Points",
        jitter=0.3,
        pointpos=-1.8,
        boxpoints='all',
        text=txt,
        marker=dict(
            color='rgb(7,40,89)'),
        line=dict(
            color='rgb(7,40,89)')
    )

    trace1 = go.Box(
        y=volumes,
        text=txt,
        name="Only Whiskers",
        boxpoints=False,
        marker=dict(
            color='rgb(9,56,125)'),
        line=dict(
            color='rgb(9,56,125)')
    )

    trace2 = go.Box(
        y=volumes,
        text=txt,
        name="Suspected Outliers",
        boxpoints='suspectedoutliers',
        marker=dict(
            color='rgb(8,81,156)',
            outliercolor='rgba(219, 64, 82, 0.6)',
            line=dict(
                outliercolor='rgba(219, 64, 82, 0.6)',
                outlierwidth=2)),
        line=dict(
            color='rgb(8,81,156)')
    )

    trace3 = go.Box(
        y=volumes,
        text=txt,
        name="Whiskers and Outliers",
        boxpoints='outliers',
        marker=dict(
            color='rgb(107,174,214)'),
        line=dict(
            color='rgb(107,174,214)')
    )

    data = [trace0, trace1, trace2, trace3]

    layout = go.Layout(
        title="Box Plot Styling Outliers"
    )

    fig = go.Figure(data=data, layout=layout)
    return fig


# Box plots of the hourly ride volume, used to look for outliers.
fig = plot_outlier_boxes(data_array=hour_sum)
iplot(figure_or_data=fig, filename="Box Plot Styling Outliers")

In [6]:
# Histogram of the hourly ride volume, as a cross-check of the box plots.
data = [go.Histogram(x=hour_sum['ride_volume'])]

# Title and axis labels let the figure stand on its own when the notebook is
# skimmed.
hist_fig = go.Figure(
    data=data,
    layout=go.Layout(
        title="Ride volume per hour",
        xaxis=dict(title="Rides per hour"),
        yaxis=dict(title="Number of hours"),
    ),
)

# The data is aggregated per hour, so the export name says "Hour", not "Day".
iplot(hist_fig, filename='Hour histogram')

## From the graphs above it is clear that the hourly aggregation contains outliers. We are going to use the z-score to remove them and see how the distribution changes:

In [7]:
# Flag every hour whose ride volume has an absolute z-score above `threshold`.
threshold = 3
hour_list = hour_sum['ride_volume']
z = np.abs(stats.zscore(hour_list))
is_outlier = np.asarray(z > threshold)

# Positional (row-number) indices of the flagged hours, kept for inspection.
outliers = list(np.where(is_outlier)[0])

# Keep the rows that were not flagged. A positional boolean mask removes
# exactly the flagged rows; dropping by index label would also remove
# unflagged rows that happen to share a label with an outlier.
# NOTE: this rebinds `hour_sum`, so re-running the cell trims the already
# trimmed frame again. Re-run the aggregation cell first to start from the
# full hourly data.
hour_sum = hour_sum.loc[~is_outlier]

In [8]:
# Redraw the hourly box plots from the current contents of hour_sum.
fig = plot_outlier_boxes(data_array=hour_sum)
iplot(figure_or_data=fig, filename="Box Plot Styling Outliers")


### Outliers can be trimmed in order to achieve greater prediction accuracy, but in our case the data is seasonal and many of the outliers are due to that seasonality.
### Also, trimming a time series dataset can produce time gaps which may be more detrimental to the model's training.
# Hence, we will be working with the untrimmed dataset as is for our predictions.