Download ChromeDriver 97.0.4692.71 from https://chromedriver.chromium.org/downloads. This release requires Google Chrome version 97.0.4692.71.

Download the bikeshare data from here: https://s3.amazonaws.com/capitalbikeshare-data/index.html

Install Python libraries: selenium, pandas

In [1]:
# Uncomment to install the dependencies into the running kernel's environment:
# %pip install selenium pandas

In [2]:
import os

In [3]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [4]:
# Path to the Chrome webdriver on local system.
# USERNAME is not guaranteed to be set on Linux (POSIX shells typically export
# USER instead), so fall back to USER rather than building "/home/None/...".
user_name = os.getenv("USERNAME") or os.getenv("USER")
if user_name:
    webdriver_path = f"/home/{user_name}/chromedriver_linux64/chromedriver"
else:
    # Neither variable is available: resolve the current user's home directory.
    webdriver_path = f"{os.path.expanduser('~')}/chromedriver_linux64/chromedriver"

# Default Chrome options (nothing customised)
options = Options()

# Service object that points Selenium at the ChromeDriver binary
driver_service_object = Service(executable_path=webdriver_path)

In [5]:
# Start a Chrome session using the ChromeDriver service and options configured above
driver = webdriver.Chrome(service=driver_service_object, options=options)

## Selenium Webdriver

Open the index page that lists all the data files; their metadata is scraped from it below

In [6]:
# Load the Capital Bikeshare S3 index page in the browser controlled by `driver`
driver.get("https://s3.amazonaws.com/capitalbikeshare-data/index.html")

## Web Scraping

In [7]:
from selenium.webdriver.common.by import By

In [8]:
# Imported here so this cell stays self-contained; only needed for the wait below.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Single definition of the listing-table locator (previously duplicated).
TABLE_XPATH = "//table[@class='hide-while-loading table table-striped']"

# The table carries a `hide-while-loading` class, which suggests its rows are
# rendered after the initial page load. Wait until the first data row is
# visible so that "Run All" does not scrape an empty or hidden table
# (`.text` is empty for elements that are not displayed).
WebDriverWait(driver, 30).until(
    EC.visibility_of_element_located((By.XPATH, f"{TABLE_XPATH}/tbody/tr"))
)

container = driver.find_element(By.XPATH, './/div[@class="container"]')

# NOTE: an XPath that starts with "//" is evaluated against the whole document
# even when called on an element, so these lookups are not restricted to
# `container`. The original behaviour is kept on purpose.
table_id = container.find_element(By.XPATH, f"{TABLE_XPATH}/tbody")

header = container.find_element(By.XPATH, f"{TABLE_XPATH}/thead")
# Column titles, in display order, taken from the <th> cells of the header
headers = [h.text for h in header.find_elements(By.CSS_SELECTOR, "th")]

In [9]:
import pandas as pd

In [10]:
# Scrape metadata: build one single-row DataFrame per table row and collect
# them in `mylists` (they are concatenated in a later cell).
mylists = []
for row in table_id.find_elements(By.CSS_SELECTOR, "tr"):
    cells = row.find_elements(By.TAG_NAME, "td")

    # Text of every cell of this row, in column order
    mylist = [cell.text for cell in cells]

    # The download link is read from the first column only
    zip_file_urls = [
        first_cell.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
        for first_cell in cells[:1]
    ]

    # Pair each header with the cell text at the same position
    df_single_row = pd.DataFrame.from_records([dict(zip(headers, mylist))])
    df_single_row["zip_file_url"] = zip_file_urls
    mylists.append(df_single_row)

## Data Processing

Combine the data

In [11]:
# Stack the single-row frames into one table; ignore_index=True renumbers the rows 0..n-1
df = pd.concat(mylists, ignore_index=True)

In [12]:
# Preview the first three rows of the combined metadata
df.head(3)

Unnamed: 0,Name,Date Modified,Size,Type,zip_file_url
0,2010-capitalbikeshare-tripdata.zip,"Mar 15th 2018, 06:33:31 pm",2.41 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...
1,2011-capitalbikeshare-tripdata.zip,"Mar 15th 2018, 06:45:30 pm",25.33 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...
2,2012-capitalbikeshare-tripdata.zip,"Mar 15th 2018, 06:55:27 pm",43.46 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...


Get data from 2021

In [13]:
# Keep only the rows whose file name begins with "2021"
is_2021 = df["Name"].str.slice(0, 4).eq("2021")

# Add the parsed "Date Modified" timestamp as a new column on a fresh copy
df = df.loc[is_2021].assign(
    date_modified_eod=lambda frame: pd.to_datetime(frame["Date Modified"])
)

In [14]:
# Preview the first three 2021 rows, including the parsed timestamp column
df.head(3)

Unnamed: 0,Name,Date Modified,Size,Type,zip_file_url,date_modified_eod
44,202101-capitalbikeshare-tripdata.zip,"Feb 4th 2021, 04:55:29 pm",3.61 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...,2021-02-04 16:55:29
45,202102-capitalbikeshare-tripdata.zip,"Mar 9th 2021, 07:07:41 pm",2.78 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...,2021-03-09 19:07:41
46,202103-capitalbikeshare-tripdata.zip,"Apr 8th 2021, 10:31:40 am",5.88 MB,ZIP file,https://s3.amazonaws.com/capitalbikeshare-data...,2021-04-08 10:31:40


In [15]:
# End the WebDriver session; the scraped values are already stored as plain strings in `df`
driver.quit()

Download the data

In [18]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

In [19]:
%%time
for _, row in df.iterrows():
    zipurl = row["zip_file_url"]
    # Extract without saving
    with urlopen(zipurl) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(f'data/raw')

CPU times: user 1.87 s, sys: 541 ms, total: 2.41 s
Wall time: 11.3 s
