# Cleaning BART Ridership Data

In [1]:
# Dependencies
import os
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# Collect the monthly ridership workbooks
path = "Data/"

# Keep only files named like "Ridership_YYYYMM.xlsx". A bare os.listdir() also
# returns anything else that happens to sit in the folder (e.g. ".DS_Store" or
# Excel's "~$..." lock files), which would break reading the workbooks and the
# year-month labels that are sliced out of the file names further down.
# Sorting by name orders the files by month because each name ends in YYYYMM.
files = sorted(
    name for name in os.listdir(path)
    if name.startswith("Ridership_") and name.endswith(".xlsx")
)
files # preview the files list

['Ridership_201801.xlsx',
 'Ridership_201802.xlsx',
 'Ridership_201803.xlsx',
 'Ridership_201804.xlsx',
 'Ridership_201805.xlsx',
 'Ridership_201806.xlsx',
 'Ridership_201807.xlsx',
 'Ridership_201808.xlsx',
 'Ridership_201809.xlsx',
 'Ridership_201810.xlsx',
 'Ridership_201811.xlsx',
 'Ridership_201812.xlsx']

In [3]:
# Read every workbook into its own dataframe, keeping the (monthly) order of `files`
df_list = list(map(lambda workbook: pd.read_excel(path + workbook), files))

# Build a "YYYY-MM" label for each workbook from its file name:
# characters 10-13 hold the year and characters 14-15 hold the month
filenames = ["-".join((workbook[10:14], workbook[14:16])) for workbook in files]

In [4]:
# Cleaned versions of the monthly dataframes, in the same order as df_list
df_list2 = []

# Clean each monthly dataframe WITHOUT modifying the frames held in df_list,
# so df_list keeps showing the data exactly as it was read from the workbooks
for raw_df in df_list:

    # Take the data rows: everything after the header row (row 0),
    # leaving out the last row ("total entries"). The copy makes sure the
    # column assignment below never touches raw_df.
    cleaned_df = raw_df.iloc[1:-1].copy()

    # Use the top row of the original dataframe as the column header
    cleaned_df.columns = raw_df.iloc[0]

    # Remove the "Exits" column
    cleaned_df = cleaned_df.drop(columns = "Exits")

    # The header value of the first column is NaN; call that column "Exit_Station"
    cleaned_df = cleaned_df.rename(columns = {np.nan: "Exit_Station"})

    # Add the cleaned dataframe to the df_list2 list
    df_list2.append(cleaned_df)

# Preview the list
df_list2

[0  Exit_Station       RM       EN       EP       NB       BK       AS  \
 1            RM  12.7619  108.619  81.9048  64.9524  365.619  101.571   
 2            EN  131.952  20.3333  91.2857  95.7619  825.857  139.381   
 3            EP  86.4286   81.381  12.8571  46.9048   659.19  83.0476   
 4            NB   72.381  87.1429  51.0476   15.619  199.619  61.3333   
 5            BK  405.619  897.095   667.81      208   35.619  360.762   
 6            AS  105.048  127.714  82.6667  61.4286  328.857  15.1429   
 7            MA  154.143  267.524      121  86.7619  351.286  83.5238   
 8            19  162.905   478.19  302.381  291.381      513  262.238   
 9            12  203.238   583.81  323.714  252.857  454.286   239.81   
 10           LM  36.4762  84.1429  60.4762  44.9524  212.667  62.1905   
 11           FV  91.1429  103.762  54.8571  62.9048  260.476  93.9048   
 12           CL  60.3333  111.286  33.5714   42.619  144.619  63.1905   
 13           SL   33.381  58.5238  31

In [5]:
# Define the function "unpivot", which reshapes a wide dataframe into three columns
def unpivot(df: pd.DataFrame) -> pd.DataFrame:
    """Melt a wide dataframe into long format, keyed by exit and entry station.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide dataframe with an "Exit_Station" column. Every other column is
        unpivoted, and its header becomes a value in "Entry_Station".

    Returns
    -------
    pandas.DataFrame
        Long dataframe with the columns "Exit_Station", "Entry_Station" and
        "value", holding one row per (exit station, entry station) pair.
    """
    # Leaving value_vars unset makes melt unpivot every column except the id
    # column, so the result does not depend on "Exit_Station" being the first
    # column of df.
    return pd.melt(df, id_vars = ["Exit_Station"], var_name = "Entry_Station")

In [6]:
# Unpivot every cleaned dataframe into long format (exit, entry, avg values)
df_list3 = list(map(unpivot, df_list2))

# Label each dataframe's "value" column with the year-month found at the
# same position in the filenames list
df_list4 = [
    df_list3[position].rename(columns = {"value": label})
    for position, label in enumerate(filenames)
]

# View the new dataframe list
df_list4

[     Exit_Station Entry_Station  2018-01
 0              RM            RM  12.7619
 1              EN            RM  131.952
 2              EP            RM  86.4286
 3              NB            RM   72.381
 4              BK            RM  405.619
 5              AS            RM  105.048
 6              MA            RM  154.143
 7              19            RM  162.905
 8              12            RM  203.238
 9              LM            RM  36.4762
 10             FV            RM  91.1429
 11             CL            RM  60.3333
 12             SL            RM   33.381
 13             BF            RM  20.5714
 14             HY            RM  41.4762
 15             SH            RM  18.6667
 16             UC            RM   17.381
 17             FM            RM  32.0476
 18             CN            RM  23.8571
 19             PH            RM  18.4762
 20             WC            RM  20.1429
 21             LF            RM  5.09524
 22             OR            RM  

In [7]:
# Merge the monthly dataframes into one table with a column per month
def _outer_join(left, right):
    """Outer-join two dataframes on the exit/entry station pair."""
    return pd.merge(left, right, on = ["Exit_Station", "Entry_Station"], how = "outer")

# Fold the list from left to right, joining one month at a time
merged = reduce(_outer_join, df_list4)
merged

Unnamed: 0,Exit_Station,Entry_Station,2018-01,2018-02,2018-03,2018-04,2018-05,2018-06,2018-07,2018-08,2018-09,2018-10,2018-11,2018-12
0,RM,RM,12.7619,16.0526,17,13.9048,15.3182,13.9048,15.2381,14.8261,20.1053,16.2273,16.1579,16.1667
1,EN,RM,131.952,135.211,126.136,121.714,122.591,122.476,119.381,120.696,120.053,121.682,106.632,110.333
2,EP,RM,86.4286,93,84.5909,85.9048,94.2273,86.9048,81.619,84.087,88.2632,93.2727,83.4737,84.0556
3,NB,RM,72.381,74.7368,69.0909,70.9048,74.3636,62.381,61.8571,66.8261,78.6842,79.0909,68.1053,61.5556
4,BK,RM,405.619,473.263,449.636,438.524,406.636,410.571,432.619,438.348,496.105,479.864,431.158,363
5,AS,RM,105.048,115.211,114.273,105.476,107.455,109.524,102.429,100.217,109.789,110.545,103.158,97.2222
6,MA,RM,154.143,152.789,137.818,141.81,150.682,138.762,137.905,145.652,149.105,155.273,144.632,136.778
7,19,RM,162.905,173.947,168.636,157.571,173.545,205.333,167.619,171.261,195.105,196.636,175.895,168.778
8,12,RM,203.238,218.263,211.773,202.81,206.955,228.857,215.381,216.348,233.421,228.773,211.895,187.111
9,LM,RM,36.4762,47.7368,42.7273,39.2381,40.0455,43.1429,37.5714,40.087,47.5263,43.3636,36.7368,29.9444
