# BART Ridership data for cleaning

In [1]:
# Dependencies
import pandas as pd
import os
import re

In [2]:
# List down the files
path = "Data/"

files = os.listdir(path) # shows the files in the Data folder
files.sort() # sort the file by order of month (based on file name)
files # preview the files list

['Ridership_201801.xlsx',
 'Ridership_201802.xlsx',
 'Ridership_201803.xlsx',
 'Ridership_201804.xlsx',
 'Ridership_201805.xlsx',
 'Ridership_201806.xlsx',
 'Ridership_201807.xlsx',
 'Ridership_201808.xlsx',
 'Ridership_201809.xlsx',
 'Ridership_201810.xlsx',
 'Ridership_201811.xlsx',
 'Ridership_201812.xlsx']

In [3]:
# Create a list of dataframes arranged by month
df_list = [pd.read_excel(path + file) for file in files]
df_list

[   Exit stations Entry stations-> Unnamed: 2  WEEKDAY Unnamed: 4 Unnamed: 5  \
 0            NaN               RM         EN       EP         NB         BK   
 1             RM          12.7619    108.619  81.9048    64.9524    365.619   
 2             EN          131.952    20.3333  91.2857    95.7619    825.857   
 3             EP          86.4286     81.381  12.8571    46.9048     659.19   
 4             NB           72.381    87.1429  51.0476     15.619    199.619   
 5             BK          405.619    897.095   667.81        208     35.619   
 6             AS          105.048    127.714  82.6667    61.4286    328.857   
 7             MA          154.143    267.524      121    86.7619    351.286   
 8             19          162.905     478.19  302.381    291.381        513   
 9             12          203.238     583.81  323.714    252.857    454.286   
 10            LM          36.4762    84.1429  60.4762    44.9524    212.667   
 11            FV          91.1429    10

In [7]:
# Create an empty list that will be populated by the data with converted column headers
df_list2 = []

# Create a for loop
for df in df_list:
    df.columns = df.iloc[0] # the column header is the top row of the original dataframe
    df = df[1:] # the values come from row 1 onwards
    df.drop(labels = "Exits", axis = 1)
    df_list2.append(df)

# Preview the list
df_list2[0]

KeyError: "['Exits'] not found in axis"

In [5]:
df_list[0].columns = df_list[0].iloc[0]    
df_list[0] = df_list[0][1:]
df_list[0]

Unnamed: 0,nan,RM,EN,EP,NB,BK,AS,MA,19.0,12.0,...,NC,WP,SS,SB,SO,MB,WD,OA,WS,Exits
1,RM,12.7619,108.619,81.9048,64.9524,365.619,101.571,138.048,149.190476,180.095238,...,2.19048,35.0,12.1905,18.9048,58.7143,26.6667,7.61905,12.3333,7.47619,3934.95
2,EN,131.952,20.3333,91.2857,95.7619,825.857,139.381,261.619,471.47619,562.285714,...,5.57143,26.5714,34.0952,42.619,113.619,71.1905,13.2857,30.9048,26.4762,8247.38
3,EP,86.4286,81.381,12.8571,46.9048,659.19,83.0476,133.762,298.0,336.333333,...,3.66667,11.1905,10.2857,11.1429,60.3333,31.8095,7.47619,24.7143,11.0,4702.19
4,NB,72.381,87.1429,51.0476,15.619,199.619,61.3333,91.1905,285.190476,249.333333,...,3.28571,16.2381,7.85714,9.80952,91.381,35.8095,8.0,30.7619,16.5714,4396.14
5,BK,405.619,897.095,667.81,208.0,35.619,360.762,361.905,562.285714,528.428571,...,49.1429,113.048,34.4286,39.2381,179.905,139.095,48.5238,73.5238,78.381,11122.7
6,AS,105.048,127.714,82.6667,61.4286,328.857,15.1429,83.2381,263.666667,251.666667,...,6.66667,28.0476,12.9524,14.4762,63.619,42.0476,9.80952,23.4762,21.619,5085.57
7,MA,154.143,267.524,121.0,86.7619,351.286,83.5238,41.5238,195.619048,228.0,...,62.5714,259.952,31.1429,30.9524,108.81,67.9524,26.5238,26.9524,34.381,8691.81
8,19,162.905,478.19,302.381,291.381,513.0,262.238,189.762,29.571429,58.904762,...,173.048,280.667,69.4762,71.9048,88.9524,135.286,106.476,34.5238,110.19,12962.3
9,12,203.238,583.81,323.714,252.857,454.286,239.81,223.762,56.190476,36.714286,...,174.286,343.048,89.381,83.9048,125.476,157.238,122.714,44.9524,88.1905,13330.5
10,LM,36.4762,84.1429,60.4762,44.9524,212.667,62.1905,58.4286,44.190476,42.333333,...,10.7619,52.8095,11.619,15.0952,36.8571,19.3333,76.8571,19.0476,91.6667,6523.95


In [6]:
# Unpivot the data such that there's an exit, an entry, and a ridership value