In [2]:
# imports
import pandas as pd
import numpy as np
import os
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [21]:
path = "../datasets/data_for_mvp/"

fi = [f for f in os.listdir(path)]
files = [file for file in fi if 'txt' in file]
files

['routes.txt',
 'trips.txt',
 'LICENSE.txt',
 'calendar.txt',
 'frequencies.txt',
 'calendar_dates.txt',
 'stops.txt',
 'agency.txt',
 'stop_times.txt',
 'shapes.txt']

In [13]:
# Loading each dataset to each dataframe
agency = pd.read_csv(path + 'agency.txt')
dates = pd.read_csv(path + 'calendar_dates.txt')
stop_times = pd.read_csv(path + 'stop_times.txt')
frequencies = pd.read_csv(path + 'frequencies.txt')
shapes = pd.read_csv(path + 'shapes.txt')
trips = pd.read_csv(path + 'trips.txt')
stops = pd.read_csv(path + 'stops.txt')
calendar = pd.read_csv(path + 'calendar.txt')
routes = pd.read_csv(path + 'routes.txt')

In [14]:
# Creating a dict w/all datasets:
all_ds = {"agency": agency, 
          "dates": dates, 
          "stop times": stop_times, 
          "frequencies": frequencies, 
          "shapes": shapes, 
          "trips": trips, 
          "stops": stops, 
          "calendar": calendar, 
          "routes": routes}

access = f"Dict name: all_ds. \n\nKeys: {all_ds.keys()}, \nDataSets: {len(all_ds)}"
print(access)

Dict name: all_ds. 

Keys: dict_keys(['agency', 'dates', 'stop times', 'frequencies', 'shapes', 'trips', 'stops', 'calendar', 'routes']), 
DataSets: 9


In [15]:
# checking how prevalent missing values are in our data (for each dataset)

# defining a function to check null values:
def null_cols(ds):
    """check whether the value in each field is missing (null) and return either 
    True or False for each field, totaling up the number of True values by column. """
    return ds.isnull().sum()

# applying the filter to each dataset
agency_null_cols = null_cols(agency)
dates_null_cols = null_cols(dates)
stop_times_null_cols = null_cols(stop_times)
frequencies_null_cols = null_cols(frequencies)
shapes_null_cols = null_cols(shapes)
trips_null_cols = null_cols(trips)
stops_null_cols = null_cols(stops)
calendar_null_cols = null_cols(calendar)
routes_null_cols = null_cols(routes)

# Adding a condition that will filter the data and show us only columns where the number 
# of null values were greater than zero for each dataset:

# 'agency_phone' = 1
agency_null_cols[agency_null_cols > 0] 

# dates['exception_type'].value_counts() == 1 
dates_null_cols[dates_null_cols > 0]  

# ['stop_headsign', 'pickup_type', 'drop_off_type', shape_dist_traveled] = 1842 (All entries)
stop_times_null_cols[stop_times_null_cols > 0] 

# frequencies['exact_times'].value_counts() == 0
frequencies_null_cols[frequencies_null_cols > 0] 

# 'shape_dist_traveled' = 182 (All entries)
shapes_null_cols[shapes_null_cols > 0] 

# ['trip_headsign', 'direction_id', 'block_id'] = 132 (All entries)
trips_null_cols[trips_null_cols > 0]

# ['stop_code', 'stop_desc', 'zone_id', 'stop_url', 'location_type', 'parent_station'] = 49 (All entries)
stops_null_cols[stops_null_cols > 0]

# nothing to declare
calendar_null_cols[calendar_null_cols > 0]

# ['route_short_name', 'route_desc', 'route_url', 'route_color', 'route_text_color'] = 10 (All entries)
routes_null_cols[routes_null_cols > 0]

route_short_name    10
route_desc          10
route_url           10
route_color         10
route_text_color    10
dtype: int64

In [16]:
# Judgement call: droping information that we don't think it's going to be very useful 
# to our analysis (removing those columns from your datasets) with the drop method.
# We will add these column names to a list, and then we will pass those columns to the 
# drop method and indicate that we want columns (not rows) dropped by setting the axis 
# parameter to 1.

# defining a function to create a list:
def drop_cols(bad_cols):
    """Adding col names to a list to be droped; in this case as long as the col has a 
    single null value in it, since, in this case, if it has one, their all null."""
    return list(bad_cols[bad_cols > 0].index)

# TODO: Recheck dates, dates_drop_cols (and all other that have 'dates' in the name),
# as well as calendar, for I made a mistake. I rechecked it, but another pass would be good.

# applying the function to each ds
agency_drop_cols = drop_cols(agency_null_cols)
dates_drop_cols = drop_cols(dates_null_cols) # this one has no cols to drop
stop_times_drop_cols = drop_cols(stop_times_null_cols)
frequencies_drop_cols = drop_cols(frequencies_null_cols) # this one has no cols to drop
shapes_drop_cols = drop_cols(shapes_null_cols)
trips_drop_cols = drop_cols(trips_null_cols)
stops_drop_cols = drop_cols(stops_null_cols)
calendar_drop_cols = drop_cols(calendar_null_cols) # this one has no cols to drop
routes_drop_cols = drop_cols(routes_null_cols)

# Passing those columns to the drop method and indicate that we want columns (not rows) 
# dropped by setting the axis parameter to 1:
agency = agency.drop(agency_drop_cols, axis = 1)
dates = dates.drop(dates_drop_cols, axis = 1)
stop_times = stop_times.drop(stop_times_drop_cols, axis = 1)
frequencies = frequencies.drop(frequencies_drop_cols, axis = 1)
shapes = shapes.drop(shapes_drop_cols, axis = 1)
trips = trips.drop(trips_drop_cols, axis = 1)
stops = stops.drop(stops_drop_cols, axis = 1)
calendar = calendar.drop(calendar_drop_cols, axis = 1)
routes = routes.drop(routes_drop_cols, axis = 1)

# this should've left us w/no cols with null values in the datasets

In [22]:
print(access)
# checking Incorrect Values in our data (for each dataset)
# The dataset was uploaded at 01/01/2020

# all the dates in the 'start_date' pertain to 16/09/2019
calendar['start_date'].value_counts()

# all the dates in the 'end_date' pertain to 16/09/2029; what does this mean?
calendar['end_date'].value_counts()

# we have values for dates that are for dates in 2020 that haven't occured yet. Why?
# What does 'exception_type' stand for? And why are all entries in it == 1?
dates['date'].value_counts()
dates['exception_type'].value_counts()

# varies between 18 and 8: yellow == 13, green == 13, red == 12, blue == 16, so 
# I don't think it corresponds to stops in the lines
stop_times['trip_id'].value_counts()

# varies between 24 and 1
stop_times['arrival_time'].value_counts()
stop_times['departure_time'].value_counts()

# varies between 74 and 26
stop_times['stop_id'].value_counts()

# varies between 132 and 40
stop_times['stop_sequence'].value_counts()

# all 'trip_id's == 1
frequencies['trip_id'].value_counts()

# varies between 14 for 485 and 2 for several
frequencies['headway_secs'].value_counts()

# all == 0
frequencies['exact_times'].value_counts()

# between 18 and 8
shapes['shape_id'].value_counts()

# interesting... look at 'route_id' w/ 'shape_id'; where's 167 in shape_id? 
# 167 exists 18 times in shapes['shape_id']
[col and trips[col].value_counts() for col in trips]

# all entries are unique 
# (13+13+12+16-6, where -6 are intersections == 48; it has 49 entries?)
len(stops['stop_id'].unique())

# all (132) unique
calendar['service_id'].value_counts()

# we can drop 'agency_id' and 'route_type'
routes

Dict name: all_ds. 

Keys: dict_keys(['agency', 'dates', 'stop times', 'frequencies', 'shapes', 'trips', 'stops', 'calendar', 'routes']), 
DataSets: 9


NameError: name 'D_routes' is not defined