Download ChromeDriver 97.0.4692.71 from https://chromedriver.chromium.org/downloads. This version requires Google Chrome version 97.0.4692.71.

Download the bikeshare data from here: https://s3.amazonaws.com/capitalbikeshare-data/index.html

Install Python libraries: selenium, pandas

In [1]:
# !pip3 install selenium pandas

In [3]:
import os

In [None]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [5]:
# Path to the Chrome webdriver on local system.
# os.getenv() returns None for an unset variable, which would silently
# produce "/home/None/..."; fall back to "USER" when "USERNAME" is not set.
user_name = os.getenv("USERNAME") or os.getenv("USER")
webdriver_path = f"/home/{user_name}/chromedriver_linux64/chromedriver"

# Create ChromeDriver service object and a default set of Chrome options
driver_service_object = Service(webdriver_path)
options = Options()

In [None]:
# Start Chrome with the service and options objects built in the setup cell
# (`driver_service_object` and `options`)
driver = webdriver.Chrome(service=driver_service_object, options=options)

## Selenium Webdriver

Open the data index page, which lists the metadata for all the files

In [None]:
# Load the Capital Bikeshare bucket index page in the browser session
bikeshare_index_url = "https://s3.amazonaws.com/capitalbikeshare-data/index.html"
driver.get(bikeshare_index_url)

## Web Scraping

In [None]:
from selenium.webdriver.common.by import By

In [None]:
# XPath of the file-listing table. NOTE: an XPath that starts with "//" is
# evaluated against the whole document, even when it is run on an element.
listing_table_xpath = "//table[@class='hide-while-loading table table-striped']"

container = driver.find_element(By.XPATH, './/div[@class="container"]')

# Table body (data rows) and table head (column titles)
table_id = container.find_element(By.XPATH, listing_table_xpath + "/tbody")
header = container.find_element(By.XPATH, listing_table_xpath + "/thead")

# Column titles, in the order they appear in the table head
header_cells = header.find_elements(By.CSS_SELECTOR, "th")
headers = [header_cell.text for header_cell in header_cells]

In [None]:
import pandas as pd

In [None]:
# Scrape metadata
mylists = []
for row in table_id.find_elements(By.CSS_SELECTOR, "tr"):
    mylist = []
    zip_file_urls = []
    col_idx = 0
    for cell in row.find_elements(By.TAG_NAME, "td"):
        text = cell.text
        if col_idx == 0:
            data_zip_url = cell.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            zip_file_urls.append(data_zip_url)
        # Append contents of single row to empty list
        mylist.append(text)
        col_idx += 1
    df_single_row = pd.DataFrame.from_records([{h: r for h, r in zip(headers, mylist)}])
    df_single_row["zip_file_url"] = zip_file_urls
    mylists.append(df_single_row)

## Data Processing

Combine the data

In [None]:
# Stack the per-row frames collected in `mylists` into a single table;
# ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat(mylists, ignore_index=True)

Get data from 2021

In [6]:
# Keep only the 2021 files and the scraped columns used below; every other
# scraped column is dropped
cols_to_keep = ["Name", "Date Modified", "zip_file_url"]
cols_to_drop = [col for col in df.columns if col not in cols_to_keep]
df = df[df["Name"].str.startswith("2021")].drop(columns=cols_to_drop)

# Datetime operations
# The part of the file name before "-capital" is YYYYMM; appending "01"
# gives the first day of that month
df["train_data_last_date"] = pd.to_datetime(
    df["Name"].str.split("-capital", expand=True)[0] + "01"
)
# Roll forward to the last day of the month, then move to 23:00
df["train_data_last_date"] = (
    df["train_data_last_date"] + pd.offsets.MonthEnd(0) + pd.DateOffset(hours=23)
)

# Parse the modification timestamps once and reuse them below
date_modified = pd.to_datetime(df["Date Modified"])

# Midnight at the end of the day on which the file was modified
df["date_modified_eod"] = date_modified.dt.normalize() + pd.DateOffset(hours=24)

# Time between the last hour of training data and the file's modification
# (stored as a Timedelta, not a whole number of days)
df["days_diff"] = date_modified - df["train_data_last_date"]
df = df.sort_values(by="train_data_last_date")

In [7]:
# Display the filtered and sorted metadata table
df

Unnamed: 0,Name,Date Modified,zip_file_url,train_data_last_date,date_modified_eod,days_diff
44,202101-capitalbikeshare-tripdata.zip,"Feb 4th 2021, 04:55:29 pm",https://s3.amazonaws.com/capitalbikeshare-data/202101-capitalbikeshare-tripdata.zip,2021-01-31 23:00:00,2021-02-05,3 days 17:55:29
45,202102-capitalbikeshare-tripdata.zip,"Mar 9th 2021, 07:07:41 pm",https://s3.amazonaws.com/capitalbikeshare-data/202102-capitalbikeshare-tripdata.zip,2021-02-28 23:00:00,2021-03-10,8 days 20:07:41
46,202103-capitalbikeshare-tripdata.zip,"Apr 8th 2021, 10:31:40 am",https://s3.amazonaws.com/capitalbikeshare-data/202103-capitalbikeshare-tripdata.zip,2021-03-31 23:00:00,2021-04-09,7 days 11:31:40
47,202104-capitalbikeshare-tripdata.zip,"May 7th 2021, 10:55:16 am",https://s3.amazonaws.com/capitalbikeshare-data/202104-capitalbikeshare-tripdata.zip,2021-04-30 23:00:00,2021-05-08,6 days 11:55:16
48,202105-capitalbikeshare-tripdata.zip,"Jun 11th 2021, 01:15:06 pm",https://s3.amazonaws.com/capitalbikeshare-data/202105-capitalbikeshare-tripdata.zip,2021-05-31 23:00:00,2021-06-12,10 days 14:15:06
49,202106-capitalbikeshare-tripdata.zip,"Jul 15th 2021, 07:24:36 pm",https://s3.amazonaws.com/capitalbikeshare-data/202106-capitalbikeshare-tripdata.zip,2021-06-30 23:00:00,2021-07-16,14 days 20:24:36
50,202107-capitalbikeshare-tripdata.zip,"Aug 14th 2021, 02:10:46 am",https://s3.amazonaws.com/capitalbikeshare-data/202107-capitalbikeshare-tripdata.zip,2021-07-31 23:00:00,2021-08-15,13 days 03:10:46
51,202108-capitalbikeshare-tripdata.zip,"Sep 8th 2021, 02:38:08 pm",https://s3.amazonaws.com/capitalbikeshare-data/202108-capitalbikeshare-tripdata.zip,2021-08-31 23:00:00,2021-09-09,7 days 15:38:08
52,202109-capitalbikeshare-tripdata.zip,"Oct 4th 2021, 01:25:31 pm",https://s3.amazonaws.com/capitalbikeshare-data/202109-capitalbikeshare-tripdata.zip,2021-09-30 23:00:00,2021-10-05,3 days 14:25:31
53,202110-capitalbikeshare-tripdata.zip,"Nov 5th 2021, 09:25:23 am",https://s3.amazonaws.com/capitalbikeshare-data/202110-capitalbikeshare-tripdata.zip,2021-10-31 23:00:00,2021-11-06,4 days 10:25:23


CPU times: user 299 ms, sys: 0 ns, total: 299 ms
Wall time: 5.42 s


- name of file with training data
- ...
- date and time for the start of the last hour for which training data is available
- ...

https://github.com/edesz/miscellaneous/blob/master/wcd_projects/diploma_related/project3/bikeshare-idea/0_live_timeline.ipynb

In [None]:
# Shut down the browser session
driver.quit()