# 01A - Data Import, Toronto Bike Share

## Import relevant libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import requests
import json

---

## Using API - get Station and Pricing Data

In [2]:
# Toronto Open Data is stored in a CKAN instance. It's APIs are documented here:
# https://docs.ckan.org/en/latest/api/
# To hit our API, you'll be making requests to:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Datasets are called "packages". Each package can contain many "resources"
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = { "id": "bike-share-toronto"}
package = requests.get(url, params = params).json()

# To get resource data:
for idx, resource in enumerate(package["result"]["resources"]):

    # To get metadata for non datastore_active resources:
    if not resource["datastore_active"]:
        url = base_url + "/api/3/action/resource_show?id=" + resource["id"]
        resource_metadata = requests.get(url).json()
        print(resource_metadata)
             # From here, you can use the "url" attribute to download this file
        
        print("") # space out results
        print("Print URL:")
        print(resource_metadata['result']['url'])
        print("") # space out results

{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_show?name=resource_show', 'success': True, 'result': {'cache_last_updated': None, 'cache_url': None, 'created': '2019-07-23T16:40:17.467193', 'datastore_active': False, 'format': 'JSON', 'hash': '', 'id': '5c1c2c06-d27f-47b7-ae82-926a6d23d76f', 'is_datastore_cache_file': False, 'is_preview': False, 'last_modified': '2019-07-23T16:40:17.427058', 'metadata_modified': '2022-03-10T16:24:04.865406', 'mimetype': 'application/json', 'mimetype_inner': None, 'name': 'bike-share-json', 'package_id': '2b44db0d-eea9-442d-b038-79335368ad5a', 'position': 0, 'resource_type': None, 'revision_id': 'e19f02c5-3513-46df-a4f6-375e4e530133', 'size': 577, 'state': 'active', 'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2b44db0d-eea9-442d-b038-79335368ad5a/resource/5c1c2c06-d27f-47b7-ae82-926a6d23d76f/download/bike-share-json.json', 'url_type': 'upload'}}

Print URL:
https://ckan0.cf.opendata.inter.prod-toronto.ca/da

In [3]:
# Get URls within JSON with links to datasets in JSON 
r = requests.get('https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2b44db0d-eea9-442d-b038-79335368ad5a/resource/5c1c2c06-d27f-47b7-ae82-926a6d23d76f/download/bike-share-json.json').json()

r

{'last_updated': 1563373889,
 'ttl': 24,
 'data': {'en': {'feeds': [{'name': 'system regions',
     'url': 'https://tor.publicbikesystem.net/ube/gbfs/v1/en/system_regions'},
    {'name': 'system_information',
     'url': 'https://tor.publicbikesystem.net/ube/gbfs/v1/en/system_information'},
    {'name': 'station_information',
     'url': 'https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information'},
    {'name': 'station_status',
     'url': 'https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_status'},
    {'name': 'system_pricing_plans',
     'url': 'https://tor.publicbikesystem.net/ube/gbfs/v1/en/system_pricing_plans'}]}}}

---

### Station Information

In [4]:
# Visualize Station Dataset

r = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information').json()

r

{'last_updated': 1706414211,
 'ttl': 22,
 'data': {'stations': [{'station_id': '7000',
    'name': 'Fort York  Blvd / Capreol Ct',
    'physical_configuration': 'REGULAR',
    'lat': 43.639832,
    'lon': -79.395954,
    'altitude': 0.0,
    'address': 'Fort York  Blvd / Capreol Ct',
    'capacity': 35,
    'is_charging_station': False,
    'rental_methods': ['KEY', 'TRANSITCARD', 'CREDITCARD', 'PHONE'],
    'groups': [],
    'obcn': '647-643-9607',
    'nearby_distance': 500.0,
    '_ride_code_support': True,
    'rental_uris': {}},
   {'station_id': '7001',
    'name': 'Wellesley Station Green P',
    'physical_configuration': 'ELECTRICBIKESTATION',
    'lat': 43.66496415990742,
    'lon': -79.38355031526893,
    'altitude': 0.0,
    'address': 'Yonge / Wellesley',
    'post_code': 'M4Y 1G7',
    'capacity': 23,
    'is_charging_station': True,
    'rental_methods': ['KEY', 'TRANSITCARD', 'CREDITCARD', 'PHONE'],
    'groups': [],
    'obcn': '416-617-9576',
    'nearby_distance': 500

In [5]:
r = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information')

df_bike_stations = pd.DataFrame(json.loads(r.content)['data']['stations'])[['station_id', 'name', 'lat', 'lon']].astype({
                            'station_id': 'string', 
                            'name': 'string'
                            })

df_bike_stations.head() # Visualize

Unnamed: 0,station_id,name,lat,lon
0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954
1,7001,Wellesley Station Green P,43.664964,-79.38355
2,7002,St. George St / Bloor St W,43.667333,-79.399429
3,7003,Madison Ave / Bloor St W,43.667158,-79.402761
4,7005,King St W / York St,43.648001,-79.383177


In [6]:
# Rename column name
df_bike_stations = df_bike_stations.rename(columns={'name':"station_name"})
    # name to station_name

df_bike_stations.info()

df_bike_stations['station_id'].sort_values(ascending= False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785 entries, 0 to 784
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_id    785 non-null    string 
 1   station_name  785 non-null    string 
 2   lat           785 non-null    float64
 3   lon           785 non-null    float64
dtypes: float64(2), string(2)
memory usage: 24.7 KB


784    7925
783    7924
782    7923
781    7922
780    7921
       ... 
4      7005
3      7003
2      7002
1      7001
0      7000
Name: station_id, Length: 785, dtype: string

---

### Pricing Plans

In [7]:
# Visualize Station Dataset

r = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/system_pricing_plans').json()

r

{'last_updated': 1706414230,
 'ttl': 40,
 'data': {'plans': [{'plan_id': '186',
    'name': 'Annual 30',
    'currency': 'CAD',
    'price': 105.0,
    'description': 'Unlimited 30-min trips on classic bikes',
    'is_taxable': 1},
   {'plan_id': '191',
    'name': 'CMP-City of Toronto',
    'currency': 'CAD',
    'price': 90.0,
    'description': 'CMP-City of Toronto',
    'is_taxable': 1},
   {'plan_id': '208',
    'name': 'Annual 45',
    'currency': 'CAD',
    'price': 120.0,
    'description': 'Unlimited 45-min trips on classic bikes',
    'is_taxable': 1},
   {'plan_id': '209',
    'name': 'Corporate 30',
    'currency': 'CAD',
    'price': 84.0,
    'description': 'Corporate 30',
    'is_taxable': 1},
   {'plan_id': '210',
    'name': 'Corporate 45',
    'currency': 'CAD',
    'price': 96.0,
    'description': 'Corporate 45',
    'is_taxable': 1},
   {'plan_id': '211',
    'name': 'OPTION 2 TEST',
    'currency': 'CAD',
    'price': 0.0,
    'description': '',
    'is_taxable': 

In [8]:
r = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/system_pricing_plans')

df_bike_priceplans = pd.DataFrame(json.loads(r.content)['data']['plans']).astype({
                    'plan_id': 'string', 
                    'name': 'string',
                    'currency': 'string',
                    'currency': 'string',
                    'description': 'string'
                    })

df_bike_priceplans.head()

Unnamed: 0,plan_id,name,currency,price,description,is_taxable
0,186,Annual 30,CAD,105.0,Unlimited 30-min trips on classic bikes,1
1,191,CMP-City of Toronto,CAD,90.0,CMP-City of Toronto,1
2,208,Annual 45,CAD,120.0,Unlimited 45-min trips on classic bikes,1
3,209,Corporate 30,CAD,84.0,Corporate 30,1
4,210,Corporate 45,CAD,96.0,Corporate 45,1


In [9]:
df_bike_priceplans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   plan_id      49 non-null     string 
 1   name         49 non-null     string 
 2   currency     49 non-null     string 
 3   price        49 non-null     float64
 4   description  49 non-null     string 
 5   is_taxable   49 non-null     int64  
dtypes: float64(1), int64(1), string(4)
memory usage: 2.4 KB


#### Export Station and Price Plans Data

I will export and work with these datasets in future workbooks. Below I will import ridership data across various years and combine them into one dataframe.

*Stations*

In [10]:
# Stations
df_bike_stations.to_csv("data/bike_stations_raw.csv", index = False) # don't include index as a separate column)

# Price Plans
df_bike_priceplans.to_csv("data/bike_priceplans_raw.csv", index = False)

---

## Import Ridership Data

The Bike Share Toronto data is publicly available at the link below: </br>
https://open.toronto.ca/dataset/bike-share-toronto-ridership-data/ </br>
</br>
I have obtained 5 years of the most recent data (2019 - 2023), and saved them by year. The data files were not consistently named so I have imported them by year. After an iteration of importing the data, I realized the datatypes were inconsistent upon import so I have defined the preferred datatypes below.

In [11]:
#Define temporary dataframes to help with imports
df_temp_1 = pd.DataFrame()
df_temp_2 = pd.DataFrame()
df_temp_3 = pd.DataFrame()
df_temp_4 = pd.DataFrame()
df_temp_5 = pd.DataFrame()
df_temp_6 = pd.DataFrame()
df_temp_7 = pd.DataFrame()
df_temp_8 = pd.DataFrame()
df_temp_9 = pd.DataFrame()
df_temp_10 = pd.DataFrame()
df_temp_11 = pd.DataFrame()
df_temp_12 = pd.DataFrame()

In [12]:
#Define dtype
dtype_dictionary = {"Trip Id": 'string', 
                    "Trip  Duration": 'Int64',
                    "Start Station Id": 'string', 
                    "Start Time": 'string', #use parse_dates in pd.read_csv()
                    "Start Station Name": 'string',
                    "End Station Id": 'string',
                    "End Time": 'string', #use parse_dates in pd.read_csv()
                    "End Station Name": 'string',
                    "Bike Id": 'string',
                    "User Type": 'string'
                   }

list_parse_date = ["Start Time", "End Time"]

#### 2019

In [13]:
df_temp_1 = pd.read_csv("data/2019/2019-Q1.csv", 
                        dtype=dtype_dictionary, parse_dates=list_parse_date)
df_temp_2 = pd.read_csv("data/2019/2019-Q2.csv", 
                        dtype=dtype_dictionary, parse_dates=list_parse_date)
df_temp_3 = pd.read_csv("data/2019/2019-Q3.csv", 
                        dtype=dtype_dictionary, parse_dates=list_parse_date)
df_temp_4 = pd.read_csv("data/2019/2019-Q4.csv",
                        dtype=dtype_dictionary, parse_dates=list_parse_date)

temp_dfs_4 = [df_temp_1, df_temp_2, df_temp_3, df_temp_4]
df_2019 = pd.concat(temp_dfs_4) #Combine into dataframe for the year

In [14]:
df_2019.head() #visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,4581278,1547,7021,2019-01-01 00:08:00,Bay St / Albert St,7233,2019-01-01 00:33:00,King / Cowan Ave - SMART,1296,Annual Member
1,4581279,1112,7160,2019-01-01 00:10:00,King St W / Tecumseth St,7051,2019-01-01 00:29:00,Wellesley St E / Yonge St (Green P),2947,Annual Member
2,4581280,589,7055,2019-01-01 00:15:00,Jarvis St / Carlton St,7013,2019-01-01 00:25:00,Scott St / The Esplanade,2293,Annual Member
3,4581281,259,7012,2019-01-01 00:16:00,Elizabeth St / Edward St (Bus Terminal),7235,2019-01-01 00:20:00,Bay St / College St (West Side) - SMART,283,Annual Member
4,4581282,281,7041,2019-01-01 00:19:00,Edward St / Yonge St,7257,2019-01-01 00:24:00,Dundas St W / St. Patrick St,1799,Annual Member


In [15]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2439517 entries, 0 to 468415
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 207.1 MB


#### 2020 Onwards

I've decided to build a function to help streamline the upload as the data is broken up by month instead of quarter.

In [16]:
def upload_helper(list_temp_dfs, int_m, str_y, str_fn_p):
    
    # Doc String for function
    """
    This function is to help with the upload of data files into a list of dataframes.
    
    Specifically, the data folder would be 'data/{year}/{filenameprefix}{month}'
        Where:
            {year} is the year in yyyy format (i.e 2020)
            {month} is the month in MM format (i.e. 01 for January)
            {filenameprefix} is the filename without the {month} variable
    """
    
    
    #Code
    for i in range(len(list_temp_dfs)):
        
        int_m += 1 #increase month value by 1 (starts at 1, ends at 12)
        
        str_m = str(int_m).zfill(2) #converts month integer into 2 character strings (i.e. int 1 = '01')
        
        str_filepath = "data/"+str_y+"/"+str_fn_p+str_m+".csv" #combine filepath

        temp_dfs_12[i] = pd.read_csv(str_filepath, 
                                     dtype=dtype_dictionary, parse_dates=list_parse_date) #load data into list of dataframes

In [17]:
# Show Doc String
help(upload_helper)

Help on function upload_helper in module __main__:

upload_helper(list_temp_dfs, int_m, str_y, str_fn_p)
    This function is to help with the upload of data files into a list of dataframes.
    
    Specifically, the data folder would be 'data/{year}/{filenameprefix}{month}'
        Where:
            {year} is the year in yyyy format (i.e 2020)
            {month} is the month in MM format (i.e. 01 for January)
            {filenameprefix} is the filename without the {month} variable



In [18]:
# Create a list of Dataframes
temp_dfs_12 = [df_temp_1, df_temp_2, df_temp_3, df_temp_4, df_temp_5, 
               df_temp_6, df_temp_7, df_temp_8, df_temp_9, df_temp_10, df_temp_11, df_temp_12]

#### 2020

In [19]:
# Inputs for the function
str_year = "2020"
str_filename_prefix = "2020-"
int_month = 0

# Upload Helper function (defined above)
upload_helper(list_temp_dfs=temp_dfs_12, 
              int_m= int_month, 
              str_y=str_year, 
              str_fn_p=str_filename_prefix)

df_2020 = pd.concat(temp_dfs_12) #Combine into dataframe for the year

In [20]:
df_2020.head() #visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,7334128,648,7003,2020-01-01 00:08:00,Madison Ave / Bloor St W,7271,2020-01-01 00:19:00,Yonge St / Alexander St - SMART,3104,Annual Member
1,7334129,419,7007,2020-01-01 00:10:00,College St / Huron St,7163,2020-01-01 00:17:00,Yonge St / Wood St,2126,Annual Member
2,7334130,566,7113,2020-01-01 00:13:00,Parliament St / Aberdeen Ave,7108,2020-01-01 00:22:00,Front St E / Cherry St,4425,Annual Member
3,7334131,1274,7333,2020-01-01 00:17:00,King St E / Victoria St,7311,2020-01-01 00:38:00,Sherbourne St / Isabella St,4233,Annual Member
4,7334132,906,7009,2020-01-01 00:19:00,King St E / Jarvis St,7004,2020-01-01 00:34:00,University Ave / Elm St,2341,Casual Member


#### 2021

The January and May 2021 data would return the following error: </br>
*UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 40656: invalid start byte* </br>
</br>
I decided to upload them without the function to deal with the csv encoding of those two files.

In [21]:
##### Import Data for all months in 2021 except for January and May

In [22]:
    #Skip January
temp_dfs_12[1] = pd.read_csv("data/2021/Bike share ridership 2021-02.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[2] = pd.read_csv("data/2021/Bike share ridership 2021-03.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[3] = pd.read_csv("data/2021/Bike share ridership 2021-04.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
    #Skip May
temp_dfs_12[5] = pd.read_csv("data/2021/Bike share ridership 2021-06.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[6] = pd.read_csv("data/2021/Bike share ridership 2021-07.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[7] = pd.read_csv("data/2021/Bike share ridership 2021-08.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[8] = pd.read_csv("data/2021/Bike share ridership 2021-09.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[9] = pd.read_csv("data/2021/Bike share ridership 2021-10.csv",
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[10] = pd.read_csv("data/2021/Bike share ridership 2021-11.csv",
                              dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[11] = pd.read_csv("data/2021/Bike share ridership 2021-12.csv",
                              dtype=dtype_dictionary, parse_dates=list_parse_date)

In [23]:
##### Import Data for January 2021 and May 2021

In [24]:
# Run file details to see encoding of csv file - Jan
with open("data/2021/Bike share ridership 2021-01.csv") as f:
    print(f)

<_io.TextIOWrapper name='data/2021/Bike share ridership 2021-01.csv' mode='r' encoding='cp1252'>


In [25]:
# Run file details to see encoding of csv file - May
with open("data/2021/Bike share ridership 2021-05.csv") as f:
    print(f)

<_io.TextIOWrapper name='data/2021/Bike share ridership 2021-05.csv' mode='r' encoding='cp1252'>


In [26]:
# Import with updated encoding
temp_dfs_12[0] = pd.read_csv("data/2021/Bike share ridership 2021-01.csv", encoding='cp1252',
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[4] = pd.read_csv("data/2021/Bike share ridership 2021-05.csv", encoding='cp1252',
                             dtype=dtype_dictionary, parse_dates=list_parse_date)

In [27]:
df_2021 = pd.concat(temp_dfs_12) #Combine into dataframe for the year

In [28]:
df_2021.head() #Visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,10644218,1315,7021,2021-01-01 00:04:00,Bay St / Albert St,7164,2021-01-01 00:26:00,Gould St / Yonge St (Ryerson University),6795,Annual Member
1,10644220,396,7534,2021-01-01 00:07:00,Walnut Ave / Queen St W,7524,2021-01-01 00:13:00,Lisgar Park,4176,Casual Member
2,10644221,86,7162,2021-01-01 00:10:00,Hayter St / Laplante Ave,7006,2021-01-01 00:11:00,Bay St / College St (East Side),1814,Annual Member
3,10644222,741,7003,2021-01-01 00:10:00,Madison Ave / Bloor St W,7272,2021-01-01 00:22:00,Yonge St / Dundonald St - SMART,198,Casual Member
4,10644223,2073,7562,2021-01-01 00:11:00,Priscilla Ave / Dundas St W - SMART,7562,2021-01-01 00:45:00,Priscilla Ave / Dundas St W - SMART,6688,Casual Member


#### 2022

In [29]:
# Inputs for the function
str_year = "2022"
str_filename_prefix = "Bike share ridership 2022-"
int_month = 0

# Upload Helper function (defined above)
upload_helper(list_temp_dfs=temp_dfs_12, 
              int_m= int_month, 
              str_y=str_year, 
              str_fn_p=str_filename_prefix)

df_2022 = pd.concat(temp_dfs_12) #Combine into dataframe for the year

In [30]:
df_2022.head() #Visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,14805109,4335,7334,2022-01-01 00:02:00,Simcoe St / Wellington St North,7269,2022-01-01 01:15:00,Toronto Eaton Centre (Yonge St),5139,Casual Member
1,14805110,126,7443,2022-01-01 00:02:00,Dundas St E / George St,7270,2022-01-01 00:05:00,Church St / Dundas St E - SMART,3992,Annual Member
2,14805112,942,7399,2022-01-01 00:04:00,Lower Jarvis / Queens Quay E,7686,2022-01-01 00:19:00,,361,Annual Member
3,14805113,4256,7334,2022-01-01 00:04:00,Simcoe St / Wellington St North,7269,2022-01-01 01:15:00,Toronto Eaton Centre (Yonge St),4350,Casual Member
4,14805114,4353,7334,2022-01-01 00:05:00,Simcoe St / Wellington St North,7038,2022-01-01 01:17:00,Dundas St W / Yonge St,5074,Casual Member


#### 2023

The April 2023 onwards data would return the following error: </br>
*UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 40656: invalid start byte* </br>
</br>
I decided to upload them without the function to deal with the csv encoding of these files.


In [31]:
# Files without encoding issues
temp_dfs_12[0] = pd.read_csv("data/2023/Bike share ridership 2023-01.csv", 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[1] = pd.read_csv("data/2023/Bike share ridership 2023-02.csv", 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[2] = pd.read_csv("data/2023/Bike share ridership 2023-03.csv", 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)

In [32]:
# Files WITH encoding issues
temp_dfs_12[3] = pd.read_csv("data/2023/Bike share ridership 2023-04.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[4] = pd.read_csv("data/2023/Bike share ridership 2023-05.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[5] = pd.read_csv("data/2023/Bike share ridership 2023-06.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[6] = pd.read_csv("data/2023/Bike share ridership 2023-07.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[7] = pd.read_csv("data/2023/Bike share ridership 2023-08.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[8] = pd.read_csv("data/2023/Bike share ridership 2023-09.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[9] = pd.read_csv("data/2023/Bike share ridership 2023-10.csv", encoding='cp1252', 
                             dtype=dtype_dictionary, parse_dates=list_parse_date)
temp_dfs_12[10] = pd.read_csv("data/2023/Bike share ridership 2023-11.csv", encoding='cp1252', 
                              dtype=dtype_dictionary, parse_dates=list_parse_date)

In [33]:
# December File required different encoding for import
temp_dfs_12[11] = pd.read_csv("data/2023/Bike share ridership 2023-12.csv", encoding='utf-8-sig', 
                              dtype=dtype_dictionary, parse_dates=list_parse_date)

In [34]:
df_2023 = pd.concat(temp_dfs_12) #Combine into dataframe for the year

In [35]:
df_2023.head() #Visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,20148784,840,7022,2023-01-01 00:00:00,Simcoe St / Queen St W,7703,2023-01-01 00:14:00,,1770,Casual Member
1,20148785,722,7399,2023-01-01 00:01:00,Lower Jarvis / Queens Quay E,7533,2023-01-01 00:13:00,Housey St / Dan Leckie Way,6400,Casual Member
2,20148786,1054,7269,2023-01-01 00:02:00,Toronto Eaton Centre (Yonge St),7076,2023-01-01 00:20:00,York St / Queens Quay W,4068,Annual Member
3,20148790,1329,7721,2023-01-01 00:04:00,,7685,2023-01-01 00:26:00,,427,Casual Member
4,20148791,1291,7721,2023-01-01 00:04:00,,7685,2023-01-01 00:26:00,,1840,Casual Member


In [36]:
for i in range (0,12):
    print(temp_dfs_12[i].columns)

Index(['Trip Id', 'Trip  Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'Bike Id', 'User Type'],
      dtype='object')
Index(['Trip Id', 'Trip  Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'Bike Id', 'User Type'],
      dtype='object')
Index(['Trip Id', 'Trip  Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'Bike Id', 'User Type'],
      dtype='object')
Index(['Trip Id', 'Trip  Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'Bike Id', 'User Type'],
      dtype='object')
Index(['Trip Id', 'Trip  Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'Bike Id', 'User Type'],
      d

---

### Individual Year Data Checks - Prior to Combining into one Dataframe

#### 2019 - Data Inspection

In [37]:
df_2019.shape # get shape of dataframe (rows, columns)

(2439517, 10)

In [38]:
df_2019.info() # get info on dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2439517 entries, 0 to 468415
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 207.1 MB


In [39]:
df_2019.isna().sum()  # high-level visualization of null values by column

Trip Id                 0
Trip  Duration         16
Start Station Id        0
Start Time              0
Start Station Name      0
End Station Id        454
End Time                0
End Station Name      454
Bike Id                 0
User Type               0
dtype: int64

#### 2020 - Data Inspection

In [40]:
df_2020.shape # get shape of dataframe (rows, columns)

(2911308, 10)

In [41]:
df_2020.info() # get info on dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2911308 entries, 0 to 95347
Data columns (total 10 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Trip Id             string
 1   Trip  Duration      Int64 
 2   Start Station Id    string
 3   Start Time          object
 4   Start Station Name  string
 5   End Station Id      string
 6   End Time            object
 7   End Station Name    string
 8   Bike Id             string
 9   User Type           string
dtypes: Int64(1), object(2), string(7)
memory usage: 247.1+ MB


In [42]:
df_2020.isna().sum()  # high-level visualization of null values by column

Trip Id                  0
Trip  Duration           0
Start Station Id         0
Start Time               0
Start Station Name     793
End Station Id        1301
End Time                 0
End Station Name      2049
Bike Id                 73
User Type              249
dtype: int64

#### 2021 - Data Inspection

In [43]:
df_2021.shape

(3575182, 10)

In [44]:
df_2021.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3575182 entries, 0 to 145848
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 303.5 MB


In [45]:
df_2021.isna().sum()

Trip Id                  0
Trip  Duration           0
Start Station Id         0
Start Time               0
Start Station Name    3680
End Station Id        1969
End Time                 0
End Station Name      5894
Bike Id                203
User Type                0
dtype: int64

#### 2022 - Data Inspection

In [46]:
df_2022.shape

(4620469, 10)

In [47]:
df_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4620469 entries, 0 to 180009
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 392.2 MB


In [48]:
df_2022.isna().sum()

Trip Id                    0
Trip  Duration             0
Start Station Id           0
Start Time                 0
Start Station Name    183466
End Station Id          1279
End Time                   0
End Station Name      184966
Bike Id                    0
User Type                  0
dtype: int64

#### 2023 - Data Inspection

In [49]:
df_2023.shape

(5713141, 10)

In [50]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5713141 entries, 0 to 257114
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 484.9 MB


In [51]:
df_2023.isna().sum()

Trip Id                    0
Trip  Duration             0
Start Station Id           0
Start Time                 0
Start Station Name    595075
End Station Id          2944
End Time                   0
End Station Name      598563
Bike Id                    0
User Type                  0
dtype: int64

##### Findings:
- Datatypes were imported as expected
- Missing data exist in these dataframes. To investigate as a whole after combining data into one dataframe.
- In 2020, Start Time and End Time are read as object. Should be investigated before combining the datasets

---

#### Investigate 2020 Start Time & End Time 
#### (Update if appropriate)

In 2020, Start Time and End Time are read as object. Should be investigated before combining the datasets

In [52]:
#Check if Trip ID are unique in df_2020
df_2020['Trip Id'].duplicated().sum()

# zero if all unique

0

*Start Time*

In [53]:
# Check for Null values
df_2020['Start Time'].isna().sum()

0

In [54]:
df_temp = pd.to_datetime(df_2020['Start Time'], errors='coerce')
df_temp

0       2020-01-01 00:08:00
1       2020-01-01 00:10:00
2       2020-01-01 00:13:00
3       2020-01-01 00:17:00
4       2020-01-01 00:19:00
                ...        
95343   2020-12-31 23:52:00
95344   2020-12-31 23:54:00
95345   2020-12-31 23:54:00
95346   2020-12-31 23:56:00
95347   2020-12-31 23:57:00
Name: Start Time, Length: 2911308, dtype: datetime64[ns]

In [55]:
# Check how many rows are impacted
df_temp.isna().sum()

249

In [56]:
df_2020[df_temp.isna()]

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
25640,10000084625,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member,
25837,10000306555,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member,
26029,10000519608,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member,
26248,10000755851,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member,
26560,10001076784,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member,
...,...,...,...,...,...,...,...,...,...,...
266879,10281521317,7417,10/30/2020 17:01,King St W / Jordan St,7253,10/30/2020 17:06,John St / Mercer St - SMART,4888,Annual Member,
268551,10283383488,7017,10/30/2020 19:42,Widmer St / Adelaide St W,7417,10/30/2020 19:50,King St W / Jordan St,5835,Annual Member,
275974,10291748450,7417,10/31/2020 18:00,King St W / Jordan St,7474,10/31/2020 18:07,Clarence Square,3814,Annual Member,
277412,10293410950,7474,10/31/2020 21:27,Clarence Square,7015,10/31/2020 21:42,King St W / Bay St (West Side),3771,Annual Member,


In [57]:
st_trip_ids = df_2020[df_temp.isna()]['Trip Id'].sort_values()

*End Time*

In [58]:
# Check for Null values
df_2022['End Time'].isna().sum()

0

In [59]:
df_temp_et = pd.to_datetime(df_2020['End Time'], errors='coerce')
df_temp_et

0       2020-01-01 00:19:00
1       2020-01-01 00:17:00
2       2020-01-01 00:22:00
3       2020-01-01 00:38:00
4       2020-01-01 00:34:00
                ...        
95343   2020-12-31 23:57:00
95344   2020-12-31 23:58:00
95345   2020-12-31 23:58:00
95346   2021-01-01 00:24:00
95347   2020-12-31 23:58:00
Name: End Time, Length: 2911308, dtype: datetime64[ns]

In [60]:
# Check how many rows are impacted
df_temp_et.isna().sum()

249

In [61]:
df_2020[df_temp_et.isna()]

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
25640,10000084625,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member,
25837,10000306555,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member,
26029,10000519608,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member,
26248,10000755851,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member,
26560,10001076784,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member,
...,...,...,...,...,...,...,...,...,...,...
266879,10281521317,7417,10/30/2020 17:01,King St W / Jordan St,7253,10/30/2020 17:06,John St / Mercer St - SMART,4888,Annual Member,
268551,10283383488,7017,10/30/2020 19:42,Widmer St / Adelaide St W,7417,10/30/2020 19:50,King St W / Jordan St,5835,Annual Member,
275974,10291748450,7417,10/31/2020 18:00,King St W / Jordan St,7474,10/31/2020 18:07,Clarence Square,3814,Annual Member,
277412,10293410950,7474,10/31/2020 21:27,Clarence Square,7015,10/31/2020 21:42,King St W / Bay St (West Side),3771,Annual Member,


In [62]:
et_trip_ids = df_2020[df_temp_et.isna()]['Trip Id'].sort_values()

*Start Time* vs *End Time*

In [63]:
# Check to see if the Trip IDs are the same
(~(st_trip_ids == et_trip_ids)).sum()
    # ~ works as "NOT" with boolean values, 
        # here it's used for a Pandas Series of Bool values, as NOT function itself does not work

# If they line up, then we should get zero from this cell

0

*Clean or drop?*

In [64]:
# Number of trips
st_trip_ids.count()

249

In [65]:
# Number of total trips in 2020
df_2020.shape[0]

2911308

In [66]:
# Percentage of total trips with poor data
st_trip_ids.count() / df_2020.shape[0]

8.552856654122477e-05

##### Findings:
This represents ~0% of the data. In more simple terms around 250 / 3 million. 

As this is insignificant when compared to 2020, and the remainder of the dataset. These rows are not worth the effort to correct and will be dropped.

*Drop*

In [67]:
# Expected total trips in 2020 after drop
df_2020.shape[0] - st_trip_ids.count()

2911059

In [68]:
# Drop rows with misaligned data
df_2020 = df_2020[~df_2020['Trip Id'].isin(st_trip_ids)]
    # more specifically, filter them out and reassign it to df_2020

In [69]:
df_2020.shape

(2911059, 10)

In [70]:
# Convert to Date Time
df_2020['Start Time'] = pd.to_datetime(df_2020['Start Time'])
df_2020['End Time'] = pd.to_datetime(df_2020['End Time'])

In [71]:
# Check Conversion
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2911059 entries, 0 to 95347
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 247.1 MB


---

### Combine All data into one DataFrame

In [72]:
#Create list of dataframes
year_dfs = [df_2019, df_2020, df_2021, df_2022, df_2023]

# Combine list of dataframes
df_bike_share_trips_raw = pd.concat(year_dfs)

In [73]:
df_bike_share_trips_raw.head() #visualize

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,Bike Id,User Type
0,4581278,1547,7021,2019-01-01 00:08:00,Bay St / Albert St,7233,2019-01-01 00:33:00,King / Cowan Ave - SMART,1296,Annual Member
1,4581279,1112,7160,2019-01-01 00:10:00,King St W / Tecumseth St,7051,2019-01-01 00:29:00,Wellesley St E / Yonge St (Green P),2947,Annual Member
2,4581280,589,7055,2019-01-01 00:15:00,Jarvis St / Carlton St,7013,2019-01-01 00:25:00,Scott St / The Esplanade,2293,Annual Member
3,4581281,259,7012,2019-01-01 00:16:00,Elizabeth St / Edward St (Bus Terminal),7235,2019-01-01 00:20:00,Bay St / College St (West Side) - SMART,283,Annual Member
4,4581282,281,7041,2019-01-01 00:19:00,Edward St / Yonge St,7257,2019-01-01 00:24:00,Dundas St W / St. Patrick St,1799,Annual Member


In [74]:
df_bike_share_trips_raw.shape

(19259368, 10)

In [75]:
# Check to see if total rows are as expected
i_total = 0

for i_df in year_dfs:
    i_total += i_df.shape[0]
    
print(i_total)

19259368


In [76]:
# Check - should be zero
i_total - df_bike_share_trips_raw.shape[0]

0

In [77]:
df_bike_share_trips_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19259368 entries, 0 to 257114
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Trip Id             string        
 1   Trip  Duration      Int64         
 2   Start Station Id    string        
 3   Start Time          datetime64[ns]
 4   Start Station Name  string        
 5   End Station Id      string        
 6   End Time            datetime64[ns]
 7   End Station Name    string        
 8   Bike Id             string        
 9   User Type           string        
dtypes: Int64(1), datetime64[ns](2), string(7)
memory usage: 1.6 GB


In [78]:
df_bike_share_trips_raw.isna().sum()

Trip Id                    0
Trip  Duration            16
Start Station Id           0
Start Time                 0
Start Station Name    783014
End Station Id          7947
End Time                   0
End Station Name      791926
Bike Id                  276
User Type                  0
dtype: int64

In [79]:
# Write to CSV 
df_bike_share_trips_raw.to_csv("data/2019_to_2023_bike_ridership_raw.csv",
                               index = False) # don't include index as a separate column)

# This step is done to have a file that serves as a "check point"

##### Findings:
- Yearly dataframes successfully combined into one, spanning from 2019 to 2023
- Exported compiled dataframe to csv for next workbook (1B Data Cleaning) to import.

---