# 1.3 | Data Acquisition: DAILY data
* [01 API Data Requests](01_API_pulls.ipynb)
* [01.1 Additional BART Data](01_v2_bart.ipynb.ipynb)
* _[01.3 Daily BART Data](01_v3_bart.ipynb.ipynb)_
* [02 Initial EDA](02_EDA.ipynb)
* [03 First Model: PROPHET](03_prophet.ipynb)
---

### <b>Daily</b> BART ridership

Pre-Processing a massive CSV from `bart.gov`
* collapse from HOURLY to DAILY counts (sum)

No header/header file, in format: 
date | hour (of day, 24hr) | origin station | destination station | riders
---  |---                  | ---            | ---                   | ---

<br>

> for `datetime`, `pandas.DatetimeIndex.dayofweek` returns day of week, with `0 = Monday` and `6 = Sunday`. 

* Initial modeling will look at _daily_, _system-wide_ ridership. 
* Subsequent analysis will consider _hourly_
* More granualar analysis consider fuel prices will consider trips `>10mi` to assess long-distance _commuter_ sensitivity to fuel prices without the intra-city _urban_ rides.

In [3]:
##### BASIC IMPORTS
import glob
import pandas as pd

import gcutsoms as gf

In [4]:
path = '../data/raw/bart/hourly/'
files = os.listdir(path)

# Print file list to verify file types, count 
files

['.DS_Store',
 'date-hour-soo-dest-2019.csv',
 'date-hour-soo-dest-2018.csv',
 'date-hour-soo-dest-2022.csv',
 'date-hour-soo-dest-2020.csv',
 'date-hour-soo-dest-2021.csv',
 'date-hour-soo-dest-2011.csv',
 'date-hour-soo-dest-2013.csv',
 'date-hour-soo-dest-2012.csv',
 'date-hour-soo-dest-2016.csv',
 'date-hour-soo-dest-2017.csv',
 'date-hour-soo-dest-2015.csv',
 'date-hour-soo-dest-2014.csv']

---
This function iterates through directory holding yearly files: 
* eliminates same-station exits `origin = destination'
* output is single `dataframe` with date as index, `ds` = date column, and ridership column
* rider count is `aggregate` by: 
  * date & station
  * _by data & by exit station_ * add this to analysis 
  * _by weekly sum of per weekday_  * add this to analysis 

In [35]:
filename = "../data/raw/bart/hourly/date-hour-soo-dest-2019.csv"
df = pd.read_csv(filename)
df.columns = ['d', 'hour', 'origin', 'exit', 'riders']

df['date'] = pd.to_datetime(df['d'])
df = df.set_index('date')

df.rename(columns = {'riders' : 'y'}, inplace = True)
df = df[ df['origin'] != df['exit'] ] 
df.head()

Unnamed: 0_level_0,d,hour,origin,exit,y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-01,2019-01-01,0,12TH,16TH,4
2019-01-01,2019-01-01,0,12TH,ANTC,1
2019-01-01,2019-01-01,0,12TH,BAYF,1
2019-01-01,2019-01-01,0,12TH,CIVC,2
2019-01-01,2019-01-01,0,12TH,COLM,1


In [38]:
# df_2 = df['y'].groupby(['date']).sum()
# df_2['d'] = 
# df_2.head()

df = df['y'].groupby(['date']).sum()
df.head(

)
# df_2 = df['riders'].groupby(['date']).sum()
# df_2.head()

# # desc_post = bart[split_date:].groupby(['day']).describe()

date
2019-01-01     98940
2019-01-02    328218
2019-01-03    364931
2019-01-04    340950
2019-01-05    112630
Name: y, dtype: int64

In [6]:
def date_index(df): 
    # df.dropna(inplace=True)
    df['d'] = df['date']
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    print(df.head(3))
    return(df)

In [7]:
def agg_station_day(path_name):

    df1 = pd.DataFrame()
    n = 0

    # for file in os.listdir(path_name):
    for file in glob.glob(path_name + '*.csv'):
        n += 1
        print(file)
        
        df = pd.read_csv(file) 

        # add columns header for ease of manipulation 
        df.columns = ['d', 'hour', 'origin', 'exit', 'riders']

        # change date col to datetime format (from string/object type )
        df['date'] = pd.to_datetime(df['d'])

        # set date column as index
        df = df.set_index('date')     
        # df['ds'] = df_y.index
        
        # rename columns
        # df.rename(columns = {'riders' : 'y'}, inplace = True)
        
        # filter out origin = destination rides
        df = df[ df['origin'] != df['exit'] ] 
        
        # group / sum / aggregate data for each day 
        # must use dt or numeric, doesn't seem like I can groub by objec/string? 
        # df_1 = df.groupby(['d']).agg({'riders': ['sum']})
        # df_1 = df['riders'].groupby(['d']).sum()
        # df_1 = df.groupby(['date']).agg({'riders': ['sum']})
        df_1 = df['riders'].groupby(['date']).sum()

        # Group / sum / aggregate data for each day BY STATION
        # df = df.groupby(['dt', 'exit']).agg({'riders': ['sum']}).reset_index()
        # df.groupby("dummy")['returns'].agg(['mean', 'sum'])
        # df = df.groupby(['dt', 'exit'])['riders'].agg(['sum']).reset_index()
        # df = df.groupby(['dt', 'exit']).agg({'riders': ['sum']}) ####
        # df.groupby("dummy")['returns'].agg(['mean', 'sum'])
        # df_2 = df_y['riders'].groupby(['exit']).sum()
        
        # add each year to running list 
        df1 = pd.concat([df1, df_1])

    df1.sort_index(inplace=True)
    # df1.columns = ['date', 'ridership']
    df1.columns = ['d', 'ridership']

    return(pd.DataFrame(df1))

In [8]:
df_daily= agg_station_day(path)

../data/raw/bart/hourly/date-hour-soo-dest-2019.csv
../data/raw/bart/hourly/date-hour-soo-dest-2018.csv
../data/raw/bart/hourly/date-hour-soo-dest-2022.csv
../data/raw/bart/hourly/date-hour-soo-dest-2020.csv
../data/raw/bart/hourly/date-hour-soo-dest-2021.csv
../data/raw/bart/hourly/date-hour-soo-dest-2011.csv
../data/raw/bart/hourly/date-hour-soo-dest-2013.csv
../data/raw/bart/hourly/date-hour-soo-dest-2012.csv
../data/raw/bart/hourly/date-hour-soo-dest-2016.csv
../data/raw/bart/hourly/date-hour-soo-dest-2017.csv
../data/raw/bart/hourly/date-hour-soo-dest-2015.csv
../data/raw/bart/hourly/date-hour-soo-dest-2014.csv


ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [30]:
df_daily.head(12)

Unnamed: 0,ridership
2011-01-01,124162.0
2011-01-02,93666.0
2011-01-03,285891.0
2011-01-04,322306.0
2011-01-05,327006.0
2011-01-06,329001.0
2011-01-07,323306.0
2011-01-08,142343.0
2011-01-09,109395.0
2011-01-10,316511.0


In [31]:
df_daily.tail()

Unnamed: 0,ridership
2022-06-28,144133.0
2022-06-29,147056.0
2022-06-30,140010.0
2022-07-01,121914.0
2022-07-02,78063.0


In [32]:
df_daily.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4195 entries, 2011-01-01 to 2022-07-02
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ridership  4195 non-null   float64
dtypes: float64(1)
memory usage: 65.5 KB


In [33]:
df_daily.describe()

Unnamed: 0,ridership
count,4195.0
mean,274235.952086
std,150632.711245
min,2795.0
25%,131495.0
50%,354215.0
75%,413056.5
max,567020.0


In [36]:
# # current name of column holding date 
# col_title = 'd'
# # sets date as time index
# df_daily2 = gf.dt_index(df_daily, col_title)
# # rename date column to either fb prophet or linkedin greykite format 
# # df_daily.dt 
# df_daily2.head()

### Print out merged, clean csv.

In [None]:
# df_out = df_daily[['dt', 'exit', 'sum']]

df_out.head()

In [37]:
df_daily.to_csv('../data/processed/bart_daily_station.csv', index = False)