In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
import datetime

In [2]:
rides = pd.read_csv("2011-capitalbikeshare-tripdata.zip")
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226767 entries, 0 to 1226766
Data columns (total 9 columns):
Duration                1226767 non-null int64
Start date              1226767 non-null object
End date                1226767 non-null object
Start station number    1226767 non-null int64
Start station           1226767 non-null object
End station number      1226767 non-null int64
End station             1226767 non-null object
Bike number             1226767 non-null object
Member type             1226767 non-null object
dtypes: int64(3), object(6)
memory usage: 84.2+ MB


In [3]:
rides.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual


In [4]:
del rides["Start station"]
del rides["End station"]

In [5]:
rides['Member type'].value_counts()

Member     979814
Casual     246949
Unknown         4
Name: Member type, dtype: int64

In [6]:
# your code here
rides = rides[rides['Member type'] == "Member"]
print('{:,d} Member rides'.format(len(rides)))

979,814 Member rides


In [7]:
rides['start'] = pd.to_datetime(rides['Start date'])
rides['start'].iloc[0]

Timestamp('2011-01-01 00:01:29')

In [8]:
rides['start'].tail(10).dt.strftime("%Y-%m-%d")

1226757    2011-12-31
1226758    2011-12-31
1226759    2011-12-31
1226760    2011-12-31
1226761    2011-12-31
1226762    2011-12-31
1226763    2011-12-31
1226764    2011-12-31
1226765    2011-12-31
1226766    2011-12-31
Name: start, dtype: object

In [9]:
rides['date'] = rides['start'].dt.date#strftime("%Y-%m-%d")
rides['hour'] = rides['start'].dt.hour

In [10]:
# Here's something that might help with missing hours.
# pd.crosstab(index=rides["date"], columns=rides["hour"])

In [11]:
# your code here. Use groupby and size. You'll end up with a Series with a hierarchical index; remember that the "get out of jail card" is .to_frame(name="NAME_GOES_HERE").reset_index(). 
grouped_as_index = rides.groupby(['date', 'hour']).size()
rides_by_hour = grouped_as_index.to_frame(name="rides").reset_index()
rides_by_hour.head()

Unnamed: 0,date,hour,rides
0,2011-01-01,0,13
1,2011-01-01,1,30
2,2011-01-01,2,26
3,2011-01-01,3,9
4,2011-01-01,4,1


In [12]:
len(rides_by_hour)

8623

In [13]:
assert len(rides_by_hour) > 365 * 23
assert 'date' in rides_by_hour.columns
assert 'hour' in rides_by_hour.columns
assert 'rides' in rides_by_hour.columns

# Exercise 2: Compute rides by day.
The resulting table should have 365 rows.

In [14]:
# your code here
rides_by_day = rides.groupby('date').size().to_frame('rides').reset_index()

In [15]:
assert len(rides_by_day) == 365
assert 'date' in rides_by_day.columns
assert 'rides' in rides_by_day.columns

# Exercise 3: Mark holidays.

The following code gets us a table of federal holidays. Please run it without changing it.

In [16]:
# Run this code unchanged.
holidays = pd.DataFrame({
    'date': USFederalHolidayCalendar().holidays(datetime.date(2011,1,1), datetime.date(2015,12,31)).date,
    'is_holiday': True})
holidays.head()

Unnamed: 0,date,is_holiday
0,2011-01-17,True
1,2011-02-21,True
2,2011-05-30,True
3,2011-07-04,True
4,2011-09-05,True


## 3.1: `join`
Join the `rides_by_hour` table with the `holidays` table to get a new table that has a column for whether the hour occurred on a holiday.

# Weather Data
Our main goal will be to get the hourly temperature data.

The original wranglers used a weather data source that does not seem to provide downloadable data anymore. But we can use the US government's records. They're in a cumbersome format, which will provide us an excuse to practice some **data cleaning**!

First challenge is where to find the data. Here's how we solved this hard problem:

There's a "Find a Station" tool, but it's confusing how to use the results. https://www.ncdc.noaa.gov/data-access/land-based-station-data/station-metadata has a link to a [station list file](ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt). Searching that, it looks like the code for Reagan Airport is 724050 13743. So the file is
https://www.ncei.noaa.gov/data/global-hourly/access/2011/72405013743.csv

Poking around in that site revealed two documents that look very important:
- https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf
- https://www.ncei.noaa.gov/data/global-hourly/doc/CSV_HELP.pdf



In [17]:
# Run this to load the file directly from the NOAA website.
# You may want to make a local copy and read it in from there instead.
weather = pd.read_csv("https://www.ncei.noaa.gov/data/global-hourly/access/2011/72405013743.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
print(len(weather))
weather.head()

14558


Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,...,OC1,OD1,OE1,OE2,OE3,RH1,RH2,RH3,REM,EQD
0,72405013743,2011-01-01T00:00:00,4,38.8472,-77.03454,3.0,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",FM-12,KDCA,V020,...,,,,,,,,,SYN092AAXX 01004 72405 32966 21704 10056 2101...,
1,72405013743,2011-01-01T00:52:00,7,38.8472,-77.03454,3.0,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",FM-15,KDCA,V030,...,,,,,,,,,MET10012/31/10 19:52:03 METAR KDCA 010052Z 000...,
2,72405013743,2011-01-01T01:52:00,7,38.8472,-77.03454,3.0,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",FM-15,KDCA,V030,...,,,,,,,,,MET10012/31/10 20:52:03 METAR KDCA 010152Z 000...,
3,72405013743,2011-01-01T02:52:00,7,38.8472,-77.03454,3.0,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",FM-15,KDCA,V030,...,,,,,,,,,MET10612/31/10 21:52:03 METAR KDCA 010252Z 180...,
4,72405013743,2011-01-01T03:00:00,4,38.8472,-77.03454,3.0,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",FM-12,KDCA,V020,...,,,,,,,,,SYN076AAXX 01034 72405 32966 41803 10039 2100...,


In [19]:
weather['timestamp'] = pd.to_datetime(weather['DATE'])
weather['timestamp'].head()

0   2011-01-01 00:00:00
1   2011-01-01 00:52:00
2   2011-01-01 01:52:00
3   2011-01-01 02:52:00
4   2011-01-01 03:00:00
Name: timestamp, dtype: datetime64[ns]

Looking at the CSV help document, we see:

> AIR-TEMPERATURE-OBSERVATION air temperature is abbreviated as the column header TMP

So we'll look at the TMP column:

In [21]:
weather['TMP'].head()

0    +0056,1
1    +0056,5
2    +0050,5
3    +0039,5
4    +0039,1
Name: TMP, dtype: object

**Huh? What's that format?**  See pages 10 and 11 of "isd-format-document.pdf".

In [22]:
weather[['temp_raw', 'temp_quality_code']] = weather['TMP'].str.split(',', n=1, expand=True)
weather[['temp_raw', 'temp_quality_code']].head()

Unnamed: 0,temp_raw,temp_quality_code
0,56,1
1,56,5
2,50,5
3,39,5
4,39,1


Note that there are some negative temperatures; we can't just truncate that `+`.

In [23]:
weather['temp_raw'].str[0].value_counts()

+    13889
-      669
Name: temp_raw, dtype: int64

What are those "quality codes"?? Let's look them up.

In [24]:
weather['temp_quality_code'].value_counts(dropna=False)

5    11266
1     2906
9      378
A        7
6        1
Name: temp_quality_code, dtype: int64

Meanings:

* 5: Passed QC
* 1: Passed QC
* 9: Outside of limits
* A: suspect but good

Let's treat 5, 1, and A as ok.

In [25]:
weather['temp_is_valid'] = weather['temp_quality_code'].isin(["5", "1", "A"])
weather = weather[weather['temp_is_valid']].copy()

In [26]:
weather['temp_C'] = pd.to_numeric(weather['temp_raw']) / 10.

In [27]:
weather['date'] = weather['timestamp'].dt.date
weather['hour'] = weather['timestamp'].dt.hour

In [28]:
hourly_temp_data = weather.groupby(['date', 'hour'])['temp_C'].mean().reset_index()
hourly_temp_data.head()

Unnamed: 0,date,hour,temp_C
0,2011-01-01,0,5.6
1,2011-01-01,1,5.0
2,2011-01-01,2,3.9
3,2011-01-01,3,3.05
4,2011-01-01,4,3.3


In [29]:
rides_by_hour_with_weather = pd.merge(
    rides_by_hour,
    hourly_temp_data,
    on=['date', 'hour'],
    how='outer')

# For hours with no rides, there was no entry in the rides_by_hour table.
rides_by_hour_with_weather['rides'] = rides_by_hour_with_weather['rides'].fillna(0.)

rides_by_hour_with_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 4 columns):
date      8760 non-null object
hour      8760 non-null int64
rides     8760 non-null float64
temp_C    8759 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 342.2+ KB


In [30]:
assert len(rides_by_hour_with_weather.dropna()) == len(hourly_temp_data)

In [31]:
# your code here
rides_by_hour_with_holidays = pd.merge(
    rides_by_hour_with_weather,
    holidays,
    left_on='date',
    right_on='date',
    how='left')
print(len(rides_by_hour_with_holidays))
rides_by_hour_with_holidays.head()

8760


Unnamed: 0,date,hour,rides,temp_C,is_holiday
0,2011-01-01,0,13.0,5.6,
1,2011-01-01,1,30.0,5.0,
2,2011-01-01,2,26.0,3.9,
3,2011-01-01,3,9.0,3.05,
4,2011-01-01,4,1.0,3.3,


In [32]:
assert len(rides_by_hour_with_holidays) == len(rides_by_hour_with_weather), "Oops, lengths inconsistent!"
assert 'is_holiday' in rides_by_hour_with_holidays.columns

In [33]:
rides_by_hour_with_holidays['is_holiday'] = rides_by_hour_with_holidays['is_holiday'].fillna(False)
rides_by_hour_with_holidays.head()

Unnamed: 0,date,hour,rides,temp_C,is_holiday
0,2011-01-01,0,13.0,5.6,False
1,2011-01-01,1,30.0,5.0,False
2,2011-01-01,2,26.0,3.9,False
3,2011-01-01,3,9.0,3.05,False
4,2011-01-01,4,1.0,3.3,False


In [34]:
assert not any(rides_by_hour_with_holidays['is_holiday'].isna())

In [35]:
rides_by_hour_with_holidays['day_of_week'] = [date.weekday() for date in rides_by_hour_with_holidays['date']]
rides_by_hour_with_holidays.sample(n=10)

Unnamed: 0,date,hour,rides,temp_C,is_holiday,day_of_week
589,2011-01-27,20,54.0,2.2,False,3
8643,2011-01-18,6,0.0,-0.9,False,1
6707,2011-10-12,22,40.0,18.38,False,2
4447,2011-07-09,21,151.0,32.5,False,5
2715,2011-04-28,15,115.0,21.8,False,3
2526,2011-04-20,18,381.0,27.8,False,2
1798,2011-03-21,1,6.0,8.3,False,0
6065,2011-09-16,3,4.0,13.05,False,4
6037,2011-09-14,23,62.0,26.1,False,2
5280,2011-08-13,15,206.0,27.5,False,5


In [36]:
# Monday is 0. So Sunday is 6, Saturday is 5
rides_by_hour_with_holidays['is_weekend'] = rides_by_hour_with_holidays['day_of_week'].isin([5, 6])
rides_by_hour_with_holidays['is_workingday'] = (
    ~rides_by_hour_with_holidays['is_holiday'] & ~rides_by_hour_with_holidays['is_weekend'])

In [37]:
rides_by_hour_with_holidays.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 8 columns):
date             8760 non-null object
hour             8760 non-null int64
rides            8760 non-null float64
temp_C           8759 non-null float64
is_holiday       8760 non-null bool
day_of_week      8760 non-null int64
is_weekend       8760 non-null bool
is_workingday    8760 non-null bool
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 436.3+ KB


In [38]:
rides_by_hour_with_holidays[rides_by_hour_with_holidays.temp_C.isna()]

Unnamed: 0,date,hour,rides,temp_C,is_holiday,day_of_week,is_weekend,is_workingday
2906,2011-05-06,14,125.0,,False,4,False,True


Oops, a missing temperature value.

Let's impute it by interpolation. (`fillna(method='ffill')` would have been perfectly fine too.)

In [39]:
rides_by_hour_with_holidays['temp_C'] = rides_by_hour_with_holidays.temp_C.interpolate(method="linear")

In [40]:
rides_by_hour_with_holidays.iloc[2900:2910]

Unnamed: 0,date,hour,rides,temp_C,is_holiday,day_of_week,is_weekend,is_workingday
2900,2011-05-06,8,405.0,8.9,False,4,False,True
2901,2011-05-06,9,159.0,8.6,False,4,False,True
2902,2011-05-06,10,94.0,8.9,False,4,False,True
2903,2011-05-06,11,119.0,10.6,False,4,False,True
2904,2011-05-06,12,193.0,11.15,False,4,False,True
2905,2011-05-06,13,179.0,14.4,False,4,False,True
2906,2011-05-06,14,125.0,16.35,False,4,False,True
2907,2011-05-06,15,170.0,18.3,False,4,False,True
2908,2011-05-06,16,260.0,19.4,False,4,False,True
2909,2011-05-06,17,461.0,20.0,False,4,False,True


That looks reasonable.

In [41]:
rides_by_hour_with_holidays.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 8 columns):
date             8760 non-null object
hour             8760 non-null int64
rides            8760 non-null float64
temp_C           8760 non-null float64
is_holiday       8760 non-null bool
day_of_week      8760 non-null int64
is_weekend       8760 non-null bool
is_workingday    8760 non-null bool
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 436.3+ KB


In [42]:
merged_data = rides_by_hour_with_holidays
merged_data.iloc[::1000]

Unnamed: 0,date,hour,rides,temp_C,is_holiday,day_of_week,is_weekend,is_workingday
0,2011-01-01,0,13.0,5.6,False,5,True,False
1000,2011-02-14,12,97.0,9.15,False,0,False,True
2000,2011-03-29,16,119.0,8.3,False,1,False,True
3000,2011-05-10,12,166.0,14.45,False,1,False,True
4000,2011-06-21,6,105.0,21.95,False,1,False,True
5000,2011-08-01,22,113.0,30.0,False,0,False,True
6000,2011-09-13,10,92.0,18.9,False,1,False,True
7000,2011-10-25,4,4.0,12.8,False,1,False,True
8000,2011-12-05,21,160.0,13.9,False,0,False,True


In [43]:
assert len(merged_data) > 365 * 23
assert 'date' in merged_data.columns
assert 'hour' in merged_data.columns
assert 'is_holiday' in merged_data.columns
assert 'temp_C' in merged_data.columns
assert 'rides' in merged_data.columns
assert len(merged_data.dropna()) == len(merged_data)