In [1]:
# imports
import pandas as pd
import numpy as np
import geopy as gp
import datetime
import os
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [2]:
# defining the path to the data
path = './datasets/data_for_mvp/'

# creating a dataset list for all datasets
files = [file for file in os.listdir(path) if 'txt' in file]

In [3]:
# Loading each dataset to each dataframe
agency = pd.read_csv(path+files[0])
dates = pd.read_csv(path+files[1])
stop_times = pd.read_csv(path+files[2])
frequencies = pd.read_csv(path+files[3])
shapes = pd.read_csv(path+files[4])
trips = pd.read_csv(path+files[5])
stops = pd.read_csv(path+files[6])
calendar = pd.read_csv(path+files[7])
routes = pd.read_csv(path+files[8])

# Creating a dictionary w/all datasets:
all_ds = {"agency": agency, 
          "dates": dates, 
          "stop times": stop_times, 
          "frequencies": frequencies, 
          "shapes": shapes, 
          "trips": trips, 
          "stops": stops, 
          "calendar": calendar, 
          "routes": routes}

# Creating a quick check acces to the dataset names for visual reference check
access = f"Dict name: all_ds. \n\nKeys: {all_ds.keys()}, \nDataSets: {len(all_ds)}"

In [4]:
# checking how prevalent missing values are in our data (for each dataset)

# defining a function to check null values:
def null_cols(ds):
    """check whether the value in each field is missing (null) and return either 
    True or False for each field, totaling up the number of True values by column. """
    return ds.isnull().sum()

# applying the filter to each dataset
agency_null_cols = null_cols(agency)
dates_null_cols = null_cols(dates)
stop_times_null_cols = null_cols(stop_times)
frequencies_null_cols = null_cols(frequencies)
shapes_null_cols = null_cols(shapes)
trips_null_cols = null_cols(trips)
stops_null_cols = null_cols(stops)
calendar_null_cols = null_cols(calendar)
routes_null_cols = null_cols(routes)

# Adding a condition that will filter the data and show us only columns where the number 
# of null values were greater than zero for each dataset. Greater than zero because, in 
# this case, when a column has a null value, all the column has null values:

# 'agency_phone' = 1
agency_null_cols[agency_null_cols > 0] 

# dates['exception_type'].value_counts() == 1 
dates_null_cols[dates_null_cols > 0]  

# ['stop_headsign', 'pickup_type', 'drop_off_type', shape_dist_traveled] = 1842 (All 
# entries)
stop_times_null_cols[stop_times_null_cols > 0] 

# frequencies['exact_times'].value_counts() == 0
frequencies_null_cols[frequencies_null_cols > 0] 

# 'shape_dist_traveled' = 182 (All entries)
shapes_null_cols[shapes_null_cols > 0] 

# ['trip_headsign', 'direction_id', 'block_id'] = 132 (All entries)
trips_null_cols[trips_null_cols > 0]

# ['stop_code', 'stop_desc', 'zone_id', 'stop_url', 'location_type', 'parent_station'] = 49 (All entries)
stops_null_cols[stops_null_cols > 0]

# nothing to declare
calendar_null_cols[calendar_null_cols > 0]

# ['route_short_name', 'route_desc', 'route_url', 'route_color', 'route_text_color'] = 10 
# (All entries)
routes_null_cols[routes_null_cols > 0]

print(access)

Dict name: all_ds. 

Keys: dict_keys(['agency', 'dates', 'stop times', 'frequencies', 'shapes', 'trips', 'stops', 'calendar', 'routes']), 
DataSets: 9


In [5]:
# Judgement call: droping information that we don't think it's going to be very useful 
# to our analysis (removing those columns from your datasets) with the drop method.
# We will add these column names to a list, and then we will pass those columns to the 
# drop method and indicate that we want columns (not rows) dropped by setting the axis 
# parameter to 1.

# defining a function to create a list:
def drop_cols(bad_cols):
    """Adding col names to a list to be droped; in this case as long as the col has a 
    single null value in it, since, in this case, if it has one, their all null."""
    
    return list(bad_cols[bad_cols > 0].index)

# applying the function to each ds
agency_drop_cols = drop_cols(agency_null_cols) # drops 'agency_phone'
dates_drop_cols = drop_cols(dates_null_cols) # this one has no cols to drop
stop_times_drop_cols = drop_cols(stop_times_null_cols) # drops ['stop_headsign', 
# 'pickup_type', 'drop_off_type', shape_dist_traveled]
frequencies_drop_cols = drop_cols(frequencies_null_cols) # this one has no cols to drop
shapes_drop_cols = drop_cols(shapes_null_cols)
trips_drop_cols = drop_cols(trips_null_cols)
stops_drop_cols = drop_cols(stops_null_cols)
calendar_drop_cols = drop_cols(calendar_null_cols) # this one has no cols to drop
routes_drop_cols = drop_cols(routes_null_cols)

# Passing those columns to the drop method and indicate that we want columns (not rows) 
# dropped by setting the axis parameter to 1:
agency = agency.drop(agency_drop_cols, axis = 1)
dates = dates.drop(dates_drop_cols, axis = 1)
stop_times = stop_times.drop(stop_times_drop_cols, axis = 1)
frequencies = frequencies.drop(frequencies_drop_cols, axis = 1)
shapes = shapes.drop(shapes_drop_cols, axis = 1)
trips = trips.drop(trips_drop_cols, axis = 1)
stops = stops.drop(stops_drop_cols, axis = 1)
calendar = calendar.drop(calendar_drop_cols, axis = 1)
routes = routes.drop(routes_drop_cols, axis = 1)

# this should've left us w/no cols with null values in the datasets
print(access)

Dict name: all_ds. 

Keys: dict_keys(['agency', 'dates', 'stop times', 'frequencies', 'shapes', 'trips', 'stops', 'calendar', 'routes']), 
DataSets: 9


In [6]:
calendar

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,25816292,1,1,1,1,1,0,0,20190916,20290916
1,25816294,1,1,1,1,1,0,0,20190916,20290916
2,25816296,1,1,1,1,1,0,0,20190916,20290916
3,25816298,1,1,1,1,1,0,0,20190916,20290916
4,25816300,1,1,1,1,1,0,0,20190916,20290916
...,...,...,...,...,...,...,...,...,...,...
127,25816345,0,0,0,0,0,0,1,20190916,20290916
128,25816349,0,0,0,0,0,0,1,20190916,20290916
129,25816347,0,0,0,0,0,0,1,20190916,20290916
130,25816353,0,0,0,0,0,0,1,20190916,20290916


In [7]:
"""# TRATAR O ROUTES
columns_to_keep = ['route_id', 'route_long_name']
D_routes = routes[columns_to_keep]
#D_routes['line'] = D_routes[D_routes['route_long_name'].split(' - ')]#.split(" - ")
stringlist = [v for v in D_routes['route_long_name']]
stringlist_doubled = [x.split(' - ') for x in stringlist]
stringlist_doubled
D_routes['line'] = [y[0] for y in stringlist_doubled]
D_routes['destination'] = [y[1] for y in stringlist_doubled]


D_routes.rename(columns={"route_id": "destination_id"}, inplace = True)

D_routes
# string.split(' - ')

# TRATAR O TRIPS
columns_to_keep = ['trip_id', 'shape_id']
D_trips = D_trips[columns_to_keep]
#D_trips['trip_id'] = D_trips['ID_OF_STUFF']

##
D_trips.rename(columns={"shape_id": "destination_id"}, inplace = True)

# TRATAR O CALENDAR
calendar = calendar.rename(columns={"service_id": "trip_id"})
calendar

## Fazer um merge da coluna weekday baseado no trip_id por oposicao ao datetime parsing

# TRATAR O DATES
calendar_dates = calendar_dates[['service_id', 'date']]

calendar_dates['year'] = [str(v)[0:4] for v in calendar_dates['date']]
calendar_dates['month'] = [str(v)[4:6] for v in calendar_dates['date']]
calendar_dates['day'] = [str(v)[6:8] for v in calendar_dates['date']]
calendar_dates['pdate'] = calendar_dates['year'] + '-' + calendar_dates['month'] + '-' + calendar_dates['day']
calendar_dates['datetime'] = pd.to_datetime(calendar_dates['pdate'])
calendar_dates.drop(["year", "month", "day", "pdate", "date"], axis = 1, inplace = True)
calendar_dates.rename(columns={"service_id": "trip_id"}, inplace = True)

calendar_dates"""

'# TRATAR O ROUTES\ncolumns_to_keep = [\'route_id\', \'route_long_name\']\nD_routes = routes[columns_to_keep]\n#D_routes[\'line\'] = D_routes[D_routes[\'route_long_name\'].split(\' - \')]#.split(" - ")\nstringlist = [v for v in D_routes[\'route_long_name\']]\nstringlist_doubled = [x.split(\' - \') for x in stringlist]\nstringlist_doubled\nD_routes[\'line\'] = [y[0] for y in stringlist_doubled]\nD_routes[\'destination\'] = [y[1] for y in stringlist_doubled]\n\n\nD_routes.rename(columns={"route_id": "destination_id"}, inplace = True)\n\nD_routes\n# string.split(\' - \')\n\n# TRATAR O TRIPS\ncolumns_to_keep = [\'trip_id\', \'shape_id\']\nD_trips = D_trips[columns_to_keep]\n#D_trips[\'trip_id\'] = D_trips[\'ID_OF_STUFF\']\n\n##\nD_trips.rename(columns={"shape_id": "destination_id"}, inplace = True)\n\n# TRATAR O CALENDAR\ncalendar = calendar.rename(columns={"service_id": "trip_id"})\ncalendar\n\n## Fazer um merge da coluna weekday baseado no trip_id por oposicao ao datetime parsing\n\n# TR

In [15]:
estacoes = stops['stop_name']

verde = ['ALAMEDA', 
         'ALVALADE', 
         'ANJOS', 
         'AREEIRO', 
         'BAIXA-CHIADO', 
         'CAIS DO SODRE', 
         'CAMPO GRANDE', 
         'INTENDENTE', 
         'MARTIM MONIZ', 
         'ROMA',  
         'TELHEIRAS']

azul = ['ALFORNELOS', 
        'ALTO DOS MOINHOS', 
        'AMADORA ESTE', 
        'AVENIDA', 
        'BAIXA-CHIADO', 
        'CARNIDE', 
        'COLÉGIO MILITAR-LUZ', 
        'JARDIM ZOOLÓGICO', 
        'LARANJEIRAS',
        'MARQUÊS DE POMBAL', 
        'PARQUE', 
        'PONTINHA', 
        'PRAÇA DE ESPANHA', 
        'REBOLEIRA', 
        'RESTAURADORES', 
        'ROSSIO',
        'SANTA APOLÓNIA', 
        'SÃO SEBASTIÃO', 
        'TERREIRO DO PAÇO']

vermelha = ['AEROPORTO', 
            'ALAMEDA', 
            'BELA VISTA', 
            'CABO RUIVO', 
            'CHELAS', 
            'ENCARNAÇÃO', 
            'MOSCAVIDE', 
            'OLAIAS', 
            'OLIVAIS', 
            'ORIENTE', 
            'SALDANHA', 
            'SÃO SEBASTIÃO']

amarela = ['AMEIXOEIRA', 
           'CAMPO GRANDE', 
           'CAMPO PEQUENO', 
           'CIDADE UNIVERSITÁRIA',
           'ENTRECAMPOS', 
           'LUMIAR', 
           'MARQUÊS DE POMBAL', 
           'ODIVELAS', 
           'PICOAS', 
           'QUINTA DAS CONCHAS', 
           'RATO', 
           'SALDANHA', 
           'SR. ROUBADO']

roxa = ['CAMPO GRANDE',
        'CAMPO PEQUENO', 
        'CIDADE UNIVERSITÁRIA',
        'ENTRECAMPOS', 
        'MARQUÊS DE POMBAL', 
        'PICOAS', 
        'RATO', 
        'SALDANHA']



 

['ALAMEDA',
 'ALVALADE',
 'ANJOS',
 'AREEIRO',
 'BAIXA-CHIADO',
 'CAIS DO SODRE',
 'CAMPO GRANDE',
 'INTENDENTE',
 'MARTIM MONIZ',
 'ROMA',
 'TELHEIRAS']