# DATA 202 Homework 6: Data Wrangling


In [None]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
import datetime

# Source Data

## Capital Bikeshare Rides Data

Download the [2011 trip data](https://s3.amazonaws.com/capitalbikeshare-data/2011-capitalbikeshare-tripdata.zip) from [Capital Bikeshare](https://www.capitalbikeshare.com/system-data). You don't need to unzip the ZIP file; pandas can read the CSV directly from it:

In [None]:
# Read the 2011 trip CSV straight out of the ZIP archive, then summarize its columns.
rides_archive = "2011-capitalbikeshare-tripdata.zip"
rides = pd.read_csv(rides_archive)
rides.info()

In [None]:
# Show the number of rides with thousands separators, then preview the first rows.
print(f"{len(rides):,d}")
rides.head()

Let's remove some columns we don't need, to save memory.

In [None]:
# Drop the two station-name columns in place; they are not used later.
rides.drop(columns=["Start station", "End station"], inplace=True)

## Holidays

The following code gets us a table of federal holidays. Please run it without changing it.

In [None]:
# Run this code unchanged.
# Builds a lookup table with one row per US federal holiday between
# 2011-01-01 and 2015-12-31. `date` holds plain `datetime.date` objects
# (via `.date`), and `is_holiday` is True on every row.
holidays = pd.DataFrame({
    'date': USFederalHolidayCalendar().holidays(datetime.date(2011,1,1), datetime.date(2015,12,31)).date,
    'is_holiday': True})
# Preview the first few holidays.
holidays.head()

## Weather Data
Our main goal will be to get the hourly temperature data.

The original wranglers used a weather data source that does not seem to provide downloadable data anymore. But we can use the US government's records. They're in a cumbersome format, which will provide us an excuse to practice some **data cleaning**!

The first challenge is finding the data. Here's how we solved this hard problem:

NOAA's [Integrated Surface Database](https://www.ncdc.noaa.gov/data-access/land-based-station-data/land-based-datasets) provides weather data from all over the country. But how to use it? There's a "Find a Station" tool, but it's confusing how to use the results. https://www.ncdc.noaa.gov/data-access/land-based-station-data/station-metadata has a link to a [station list file](ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt). Searching that, it looks like the code for Reagan Airport is 724050 13743. So the file is
https://www.ncei.noaa.gov/data/global-hourly/access/2011/72405013743.csv

Poking around in that site revealed two documents that look very important:
- https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf
- https://www.ncei.noaa.gov/data/global-hourly/doc/CSV_HELP.pdf



In [None]:
# Run this to load the file directly from the NOAA website.
# You may want to make a local copy and read it in from there instead.
weather_url = "https://www.ncei.noaa.gov/data/global-hourly/access/2011/72405013743.csv"
weather = pd.read_csv(weather_url)

In [None]:
# Report how many weather observations were loaded, then preview the first rows.
weather_row_count = len(weather)
print(weather_row_count)
weather.head()

In [None]:
# Build `weather1`: mean air temperature (deg C) indexed by (Date, Hour),
# ready to be merged onto the hourly ride counts.

# Start from the parsed observation timestamps.
# NOTE(review): NOAA ISD timestamps are presumably UTC while the ride times
# look like local time -- confirm before relying on the hour-level match.
weather1 = pd.to_datetime(weather['DATE']).to_frame()

# Split up date and hour for the eventual merge.
# `Date` is a midnight-normalized datetime64; `Hour` is the hour of day.
weather1["Date"] = weather1["DATE"].dt.normalize()
weather1["Hour"] = weather1["DATE"].dt.hour

# TMP looks like "+0123,1": a signed temperature in tenths of a degree C,
# a comma, then a quality code (a digit or a letter such as "A").
# Only the part before the comma is the measurement. Treating the comma as a
# decimal point would fold the quality code into the value.
tmp_tenths = pd.to_numeric(weather["TMP"].str.split(",").str[0], errors="coerce")
# +9999 is the ISD "missing" sentinel, not a real reading.
tmp_tenths = tmp_tenths.mask(tmp_tenths == 9999)
weather1["temp_C"] = tmp_tenths / 10

# Several observations can fall within one hour, so average them.
# Only temp_C is kept, because averaging the raw timestamps has no meaning.
# Hours in which every reading is missing stay NaN.
weather1 = weather1.groupby(["Date", "Hour"])[["temp_C"]].mean()

# Data Wrangling

## 1. Extract `date` and `hour`

In [None]:
# Parse the textual start timestamps once, store them, and show the first value.
start_times = pd.to_datetime(rides['Start date'])
rides['start'] = start_times
start_times.iloc[0]

In [None]:
# Derive the calendar date and the hour of day from each ride's start time.
start_parts = rides['start'].dt
rides['date'] = start_parts.date
rides['hour'] = start_parts.hour

## 2. Filter to include only rides by Members
Hint for the grouping in step 3 below: you'll end up with a Series with a hierarchical index; remember that the "get out of jail card" is `.to_frame(name="NAME_GOES_HERE").reset_index()`.

In [None]:
# Keep only the rides whose "Member type" is exactly "Member".
is_member = rides["Member type"].eq("Member")
rides = rides.loc[is_member]

## 3. Combine all rows for a single hour of the same day into one

In [None]:
# Collapse to one row per (date, hour); the group size is the number of rides in that hour.
hourly_counts = rides.groupby(['date', 'hour']).size()
rides = hourly_counts.reset_index(name="rides")

## 4. Add "is_holiday" column

In [None]:
# Attach the holiday flag with a LEFT join so every ride-hour row is kept.
# The default inner join would discard all rows whose date is not a holiday.
# Dates with no match in `holidays` get NaN in `is_holiday`.
rides = pd.merge(rides, holidays, on="date", how="left")

In [None]:
# Any date without a match in the holiday table has NaN here; mark it as a
# real boolean False. The string "False" would be truthy and would leave the
# column with object dtype.
rides["is_holiday"] = rides["is_holiday"].fillna(False).astype(bool)

## 5. Add day of week column

In [None]:
# Convert the dates to datetime64 and record the weekday number (Monday=0 ... Sunday=6).
ride_dates = pd.to_datetime(rides['date'])
rides['date'] = ride_dates
rides['day_of_week'] = ride_dates.dt.weekday

## 6. Add is_weekend and is_workingday column

In [None]:
#Saturday and Sunday are days 5 and 6
rides["is_weekend"] = rides["day_of_week"] >= 5
# A working day is a weekday that is not a federal holiday. Without the
# holiday check this column would just be the negation of is_weekend.
# .eq(True) counts only a genuine boolean True as a holiday, so NaN or any
# non-boolean placeholder is treated as "not a holiday".
rides["is_workingday"] = ~rides["is_weekend"] & ~rides["is_holiday"].eq(True)

## 7. Add temperature column

In [None]:
# Left-join the hourly temperatures onto the ride counts, matching the
# (date, hour) columns against weather1's (Date, Hour) keys.
merged_data = rides.merge(
    weather1,
    how="left",
    left_on=["date", "hour"],
    right_on=["Date", "Hour"],
)

In [None]:
# Preview the first five rows of the combined table.
merged_data.head(n=5)

## Yay, we're done!

In [None]:
# Sanity checks on the final table: enough hourly rows, all expected columns
# present, and no missing values anywhere.
required_columns = ['date', 'hour', 'is_holiday', 'temp_C', 'rides']
assert len(merged_data) > 365 * 23
for column in required_columns:
    assert column in merged_data.columns
assert merged_data.notna().all().all()

# Metadata

In [None]:
import requests

In [None]:
# Download the station metadata feed and decode it as JSON.
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() turns an HTTP error response into an exception instead
# of letting .json() fail (or succeed on an error payload) later on.
resp = requests.get(
    "https://gbfs.capitalbikeshare.com/gbfs/en/station_information.json",
    timeout=30,
)
resp.raise_for_status()
resp_json = resp.json()

In [None]:
# One row per entry in the feed's data -> stations list.
station_records = resp_json['data']['stations']
stations = pd.DataFrame(station_records)

In [None]:
# Show the station row(s) whose short_name is "31620".
stations[stations['short_name'] == "31620"]

In [None]:
import folium

In [None]:
# Display a base map with default tiles at a fixed center point.
map_center = [42.961111, -85.655556]
folium.Map(location=map_center, zoom_start=13)

In [None]:
# Map with one marker per station; hovering over a marker shows the station name.
stations_center = [38.9, -77.]
m = folium.Map(location=stations_center)

for row in stations.itertuples():
    marker = folium.Marker(location=[row.lat, row.lon], tooltip=row.name)
    marker.add_to(m)

m

In [None]:
# Display a high-contrast, mostly monochrome base map at a fixed center point.
# The built-in 'Stamen Toner' tile set is no longer bundled with recent folium
# releases, so 'CartoDB positron' is used instead. It is a built-in light
# monochrome style that is also available in older folium versions.
folium.Map(
    location=[42.961111, -85.655556],
    tiles='CartoDB positron',
    zoom_start=13
)