# FORECASTING BUS DEMAND

In [58]:
import pandas as pd
import numpy as np
import datetime

In [49]:
# Notebook display settings: never truncate columns or rows, render floats
# with three decimals, and allow wide (500-character) console output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda x: '%.3f' % x
pd.options.display.width = 500

In [50]:
# Dataset dimensions; the loaded CSV reports a shape of (13070, 4).
MUNICIPALITY_COUNT = 10
TOTAL_DATA_RAW_COUNT = 13070  # number of rows
TOTAL_DATA_FEATURE_COUNT = 4  # number of columns
# Rows per municipality. Floor division keeps the count an int so it can be
# used directly for slicing or range(); true division produced 1307.0.
EACH_MUNICIPALITY_DATA_RAW_COUNT = TOTAL_DATA_RAW_COUNT // MUNICIPALITY_COUNT

In [51]:
def check_df(dataframe, head=100):
    '''
    Print a general overview of the loaded data.

    Shows the shape, column dtypes, first and last rows, null counts,
    quantiles, the correlation matrix and summary statistics.

    Parameters
    ----------
    dataframe: dataframe
        Data to inspect.
    head: int
        Number of rows printed from both the top and the bottom of the
        dataframe. Defaults to 100.

    Notes
    ----------
    Quantiles and correlations are computed on the numeric columns only, so
    text or datetime columns (such as a timestamp) do not make the report
    fail.
    '''
    # quantile()/corr() cannot handle text columns; restrict them to numbers.
    numeric = dataframe.select_dtypes(include='number')

    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(numeric.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    print("##################### Corr #####################")
    print(numeric.corr())
    print("##################### Describe #####################")
    print(dataframe.describe())

In [52]:
# Load the raw bus-utilization measurements (first line is the header row)
# and print an overview of the resulting dataframe.
data = pd.read_csv(
    r"data/municipality_bus_utilization.csv",
    header=0,
)
check_df(dataframe=data)

##################### Shape #####################
(13070, 4)
##################### Types #####################
timestamp          object
municipality_id     int64
usage               int64
total_capacity      int64
dtype: object
##################### Head #####################
              timestamp  municipality_id  usage  total_capacity
0   2017-06-04 07:59:42                9    454            1332
1   2017-06-04 07:59:42                8    556            2947
2   2017-06-04 07:59:42                4   1090            3893
3   2017-06-04 07:59:42                0    204            2813
4   2017-06-04 07:59:42                7    718            2019
5   2017-06-04 07:59:42                5     70             587
6   2017-06-04 07:59:42                2    273             697
7   2017-06-04 07:59:42                1    129             397
8   2017-06-04 07:59:42                6    597            3113
9   2017-06-04 07:59:42                3    623            1930
10  2017-06-04 08:

### Split dataset by municipality numbers

In [53]:
def sprit_municipality(data, municipality_column_name):
    '''
    Return the dataset ordered by municipality number.

    Rows are grouped by ascending municipality number; within each
    municipality the original row order is preserved. The input dataframe is
    not modified.

    Parameters
    ----------
    data: dataframe
    municipality_column_name: str
        Name of the column holding the municipality number.

    Returns
    -------
    df: dataframe
        Reordered copy of ``data`` with a fresh 0..n-1 index.
    '''
    # A stable sort (mergesort) keeps the per-municipality row order intact,
    # which is what concatenating the groups one by one used to achieve,
    # without depending on a fixed number of municipalities.
    df = data.sort_values(by=municipality_column_name, kind='mergesort')

    return df.reset_index(drop=True)

In [54]:
def selecet_max_usage_in_an_hour(data, time_column_name, municipality_column_name, first_character_count=13):
    '''
    Keep, for every municipality and hour, the maximum of the measurements
    taken within that hour.

    Parameters
    ----------
    data: dataframe
        Must contain the time and municipality columns named below plus the
        'usage' and 'total_capacity' columns.
    time_column_name: str
        Name of the column holding the timestamps as text.
    municipality_column_name: str
        Name of the column holding the municipality number.
    first_character_count: int
        Number of leading characters of the timestamp text used as the
        grouping key.

    Notes
    -----
    first_character_count is set to 13 because the first 13 characters of a
    'YYYY-MM-DD HH:MM:SS' timestamp identify the date and the hour, so all
    measurements of the same hour share one key.

    The input dataframe is left untouched; the work is done on a copy.

    Returns
    -------
    data: dataframe
        One row per (hour, municipality) with the maximum 'usage' and
        'total_capacity', the time column converted to datetime, ordered by
        municipality.
    '''
    # Copy first so truncating the time column does not alter the caller's data.
    hourly = data.copy()
    hourly[time_column_name] = hourly[time_column_name].str[:first_character_count]

    hourly = (
        hourly.groupby([time_column_name, municipality_column_name])
        .agg({'usage': 'max', 'total_capacity': 'max'})
        .reset_index()
    )
    hourly[time_column_name] = pd.to_datetime(hourly[time_column_name])

    return sprit_municipality(hourly, municipality_column_name)

In [55]:
# Reduce the raw measurements to one row per hour and municipality.
data = selecet_max_usage_in_an_hour(
    data,
    time_column_name="timestamp",
    municipality_column_name="municipality_id",
)

In [56]:
# Print an overview of the hourly dataset.
check_df(dataframe=data)

##################### Shape #####################
(6740, 4)
##################### Types #####################
timestamp          datetime64[ns]
municipality_id             int64
usage                       int64
total_capacity              int64
dtype: object
##################### Head #####################
             timestamp  municipality_id  usage  total_capacity
0  2017-06-04 07:00:00                0    204            2813
1  2017-06-04 08:00:00                0    332            2813
2  2017-06-04 09:00:00                0    485            2813
3  2017-06-04 10:00:00                0    583            2813
4  2017-06-04 11:00:00                0    614            2813
5  2017-06-04 12:00:00                0    613            2813
6  2017-06-04 13:00:00                0    605            2813
7  2017-06-04 14:00:00                0    585            2813
8  2017-06-04 15:00:00                0    520            2813
9  2017-06-04 16:00:00                0    470            281

In [59]:
def find_missing_dates(df, start="2017-06-04", end="2017-08-19"):
    '''
    Find the calendar days in a period for which no measurement exists.

    Parameters
    ----------
    df: dataframe
        Must contain a 'timestamp' column holding datetimes or date/time text.
    start: str
        First day of the period (inclusive).
    end: str
        Last day of the period (inclusive).

    Returns
    -------
    missing_dates: DatetimeIndex
        Days between start and end with no row in df.
    '''
    # normalize() drops the time of day, so every record maps to its calendar
    # day regardless of whether the value carried a time part.
    recorded_days = pd.to_datetime(df['timestamp']).dt.normalize()
    expected_days = pd.date_range(start=start, end=end)
    missing_dates = expected_days.difference(pd.DatetimeIndex(recorded_days))
    return missing_dates

In [74]:
def find_missing_hours(df, municipality_id=0, start='2017-06-04', end='2017-08-19'):
    '''
    Find, per day, the hours that have no measurement for one municipality.

    The hours recorded on the first day of the period are taken as the
    expected hours; every day of the period is compared against them.

    Parameters
    ----------
    df: dataframe
        Must contain the 'timestamp' and 'municipality_id' columns.
    municipality_id: int
        Municipality whose records are inspected.
    start: str
        First day of the period (inclusive); also the reference day.
    end: str
        Last day of the period (inclusive).

    Returns
    -------
    missing: dict
        Maps a day ('YYYY-MM-DD') to the set of expected hours that have no
        record on that day. Days with every expected hour present are
        omitted; days with no records at all list every expected hour.
    '''
    # Filter once: only the selected municipality's timestamps are relevant.
    selected = df.loc[df['municipality_id'] == municipality_id, 'timestamp']
    stamps = pd.DatetimeIndex(pd.to_datetime(selected))

    # Collect the recorded hours of each calendar day in a single pass.
    hours_by_day = {}
    for day, hour in zip(stamps.date, stamps.hour):
        hours_by_day.setdefault(day, set()).add(int(hour))

    expected_hours = hours_by_day.get(pd.Timestamp(start).date(), set())

    missing = {}
    for day in pd.date_range(start, end):
        # A day without any record yields an empty set, i.e. all hours missing.
        missing_hours = expected_hours - hours_by_day.get(day.date(), set())
        if missing_hours:
            missing[str(day.date())] = missing_hours

    return missing

In [75]:
# Hours without a measurement on each day, for municipality 0.
missing_hours = find_missing_hours(df=data)
missing_hours

In [60]:
# Calendar days of the study period that have no measurement at all.
missing_data = find_missing_dates(df=data)
missing_data

DatetimeIndex(['2017-06-20', '2017-06-21', '2017-07-31', '2017-08-03', '2017-08-04'], dtype='datetime64[ns]', freq=None)