In [1]:
# From Lecture 6
import numpy as np
from datascience import *
# Read the raw CSV into a Table.
full = Table.read_table('NC-EST2014-AGESEX-RES.csv')
# Keep SEX, AGE and the column at position 8 (chosen by index, not by label).
# NOTE(review): index 8 is presumably the most recent population estimate in the
# file -- confirm against the CSV header.
partial = full.select(['SEX', 'AGE', 8])
# Give the third selected column (index 2) the readable label 'Population'.
census = partial.relabeled(2, 'Population')
# Apply NumberFormatter to column 2; as the cell's last expression the table is displayed.
census.set_format([2], NumberFormatter)

SEX,AGE,Population
0,0,3948350
0,1,3962123
0,2,3957772
0,3,4005190
0,4,4003448
0,5,4004858
0,6,4134352
0,7,4154000
0,8,4119524
0,9,4106832


In [2]:
# Split the table by SEX code. The mf() helper defined later in this notebook
# maps 0 -> 'Total', 1 -> 'Male', 2 -> 'Female'.
males = census.where('SEX', 1)
females = census.where('SEX', 2)

In [3]:
# Female rows, largest population first. The AGE 999 row at the top of the output is
# far larger than any single age, so it appears to be an all-ages total -- confirm.
females.sort('Population', descending=True)

SEX,AGE,Population
2,999,161920569
2,54,2331961
2,53,2316131
2,24,2301237
2,23,2298701
2,51,2283261
2,52,2282051
2,50,2279824
2,55,2263367
2,56,2254749


In [4]:
# Male rows, largest population first. The AGE 999 row at the top of the output is
# far larger than any single age, so it appears to be an all-ages total -- confirm.
males.sort('Population', descending=True)

SEX,AGE,Population
1,999,156936487
1,23,2399883
1,24,2391398
1,22,2367842
1,21,2310734
1,25,2295836
1,20,2269570
1,54,2242828
1,26,2240026
1,19,2220790


## Apply

In [5]:
def mf(code):
    """Translate a numeric SEX code into a readable label.

    Returns 'Total' for 0, 'Male' for 1 and 'Female' for 2. A code that
    equals none of these yields None.
    """
    labels = ((0, 'Total'), (1, 'Male'), (2, 'Female'))
    for known_code, label in labels:
        if code == known_code:
            return label
    return None
    
mf(2)

'Female'

In [6]:
# Run mf on every value of the SEX column and attach the labels as a new 'MF' column.
census_mf = census.with_column('MF', census.apply(mf, 'SEX'))
census_mf

SEX,AGE,Population,MF
0,0,3948350,Total
0,1,3962123,Total
0,2,3957772,Total
0,3,4005190,Total
0,4,4003448,Total
0,5,4004858,Total
0,6,4134352,Total
0,7,4154000,Total
0,8,4119524,Total
0,9,4106832,Total


In [7]:
# One row per AGE and one column per MF label; each cell holds the summed Population.
pivoted = census_mf.pivot('MF', 'AGE', 'Population', sum)
# In the displayed result column 2 is the male population and column 3 the total,
# so this ratio is the male share of each age group.
fraction = pivoted.with_column('Male Percentage', pivoted.column(2)/pivoted.column(3))
# Number-format the three population columns, percent-format the ratio, then display.
fraction.set_format([1, 2, 3], NumberFormatter)
fraction.set_format(4, PercentFormatter).show()

AGE,Female Population,Male Population,Total Population,Male Percentage
0,1930493,2017857,3948350,51.11%
1,1938870,2023253,3962123,51.06%
2,1935270,2022502,3957772,51.10%
3,1956572,2048618,4005190,51.15%
4,1959950,2043498,4003448,51.04%
5,1961391,2043467,4004858,51.02%
6,2024024,2110328,4134352,51.04%
7,2031760,2122240,4154000,51.09%
8,2014402,2105122,4119524,51.10%
9,2009560,2097272,4106832,51.07%


## Bikes

In [8]:
# Download data (it may take minutes)
# The existence check skips the download on re-runs once the station CSV is present.
# NOTE(review): only the station file is checked; the trip and weather CSVs read in
# later cells presumably come from the same archive -- confirm.
import os
if not os.path.exists('201508_station_data.csv'):
    # Fetch the archive, unpack it, then remove the status CSV and the archive itself.
    # `&&` stops the chain at the first command that fails.
    !wget https://s3.amazonaws.com/babs-open-data/babs_open_data_year_2.zip && \
        unzip babs_open_data_year_2.zip && \
        rm 201508_status_data.csv babs_open_data_year_2.zip

In [9]:
trips = Table.read_table('201508_trip_data.csv')
trips

Trip ID,Duration,Start Date,Start Station,Start Terminal,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139
913459,1036,8/31/2015 23:11,San Antonio Shopping Center,31,8/31/2015 23:28,Mountain View City Hall,27,35,Subscriber,95032
913455,307,8/31/2015 23:13,Post at Kearny,47,8/31/2015 23:18,2nd at South Park,64,468,Subscriber,94107
913454,409,8/31/2015 23:10,San Jose City Hall,10,8/31/2015 23:17,San Salvador at 1st,8,68,Subscriber,95113
913453,789,8/31/2015 23:09,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,487,Customer,9069
913452,293,8/31/2015 23:07,Yerba Buena Center of the Arts (3rd @ Howard),68,8/31/2015 23:12,San Francisco Caltrain (Townsend at 4th),70,538,Subscriber,94118
913451,896,8/31/2015 23:07,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,363,Customer,92562
913450,255,8/31/2015 22:16,Embarcadero at Sansome,60,8/31/2015 22:20,Steuart at Market,74,470,Subscriber,94111
913449,126,8/31/2015 22:12,Beale at Market,56,8/31/2015 22:15,Temporary Transbay Terminal (Howard at Beale),55,439,Subscriber,94130
913448,932,8/31/2015 21:57,Post at Kearny,47,8/31/2015 22:12,South Van Ness at Market,66,472,Subscriber,94702


In [10]:
# Column 3 of trips is 'Start Station'; keep just that column.
starts = trips.select(3)
starts

Start Station
Harry Bridges Plaza (Ferry Building)
San Antonio Shopping Center
Post at Kearny
San Jose City Hall
Embarcadero at Folsom
Yerba Buena Center of the Arts (3rd @ Howard)
Embarcadero at Folsom
Embarcadero at Sansome
Beale at Market
Post at Kearny


## Group

In [11]:
# Count the trips that begin at each start station (adds a 'count' column).
starts.group(0)

Start Station,count
2nd at Folsom,7999
2nd at South Park,9469
2nd at Townsend,14026
5th at Howard,7708
Adobe on Almaden,562
Arena Green / SAP Center,647
Beale at Market,8359
Broadway St at Battery St,7676
California Ave Caltrain Station,400
Castro Street and El Camino Real,1230


In [12]:
# Same per-station counts, ordered by the count column with the busiest stations first.
starts.group(0).sort(1, descending=True)

Start Station,count
San Francisco Caltrain (Townsend at 4th),26304
San Francisco Caltrain 2 (330 Townsend),21758
Harry Bridges Plaza (Ferry Building),17255
Temporary Transbay Terminal (Howard at Beale),14436
Embarcadero at Sansome,14158
2nd at Townsend,14026
Townsend at 7th,13752
Steuart at Market,13687
Market at 10th,11885
Market at Sansome,11431


In [13]:
# Columns 3 and 8 of trips are 'Start Station' and 'Bike #'.
trips.select([3, 8])

Start Station,Bike #
Harry Bridges Plaza (Ferry Building),288
San Antonio Shopping Center,35
Post at Kearny,468
San Jose City Hall,68
Embarcadero at Folsom,487
Yerba Buena Center of the Arts (3rd @ Howard),538
Embarcadero at Folsom,363
Embarcadero at Sansome,470
Beale at Market,439
Post at Kearny,472


In [17]:
# For every bike, gather the start stations of all of its trips into one list.
bikes = trips.select([3, 8]).group('Bike #', list)
bikes

Bike #,Start Station list
9,"['San Jose Diridon Caltrain Station', 'MLK Library', 'SJ ..."
10,"['San Antonio Shopping Center', 'San Antonio Caltrain St ..."
11,"['St James Park', 'St James Park', 'Paseo de San Antonio ..."
12,"['Mountain View City Hall', 'Mountain View City Hall', ' ..."
13,"['Mountain View Caltrain Station', 'Castro Street and El ..."
14,"['San Jose Diridon Caltrain Station', 'Santa Clara at Al ..."
15,"['Redwood City Public Library', 'Redwood City Public Lib ..."
16,"['Temporary Transbay Terminal (Howard at Beale)', 'Golde ..."
17,"['San Jose Diridon Caltrain Station', 'St James Park', ' ..."
18,"['San Jose Civic Center', 'MLK Library', 'San Jose Dirid ..."


In [18]:
# Full start-station list for the bike in row 3; item 1 is the list column.
bikes.row(3).item(1)

['Mountain View City Hall',
 'Mountain View City Hall',
 'Mountain View Caltrain Station',
 'Castro Street and El Camino Real',
 'Mountain View Caltrain Station',
 'Evelyn Park and Ride',
 'Mountain View Caltrain Station',
 'Rengstorff Avenue / California Street',
 'University and Emerson',
 'University and Emerson',
 'University and Emerson',
 'University and Emerson',
 'University and Emerson',
 'Cowper at University',
 'Palo Alto Caltrain Station',
 'California Ave Caltrain Station',
 'University and Emerson',
 'University and Emerson',
 'California Ave Caltrain Station',
 'Palo Alto Caltrain Station',
 'Park at Olive',
 'Palo Alto Caltrain Station',
 'San Antonio Shopping Center',
 'San Antonio Caltrain Station',
 'San Antonio Shopping Center',
 'Palo Alto Caltrain Station',
 'Cowper at University',
 'Palo Alto Caltrain Station',
 'Cowper at University',
 'Palo Alto Caltrain Station',
 'Cowper at University',
 'Cowper at University',
 'Palo Alto Caltrain Station',
 'Palo Alto Caltr

## Groups

In [19]:
# Columns 3, 6 and 1 of trips: 'Start Station', 'End Station' and 'Duration', in that order.
duration = trips.select([3, 6, 1])
duration

Start Station,End Station,Duration
Harry Bridges Plaza (Ferry Building),San Francisco Caltrain (Townsend at 4th),765
San Antonio Shopping Center,Mountain View City Hall,1036
Post at Kearny,2nd at South Park,307
San Jose City Hall,San Salvador at 1st,409
Embarcadero at Folsom,Embarcadero at Sansome,789
Yerba Buena Center of the Arts (3rd @ Howard),San Francisco Caltrain (Townsend at 4th),293
Embarcadero at Folsom,Embarcadero at Sansome,896
Embarcadero at Sansome,Steuart at Market,255
Beale at Market,Temporary Transbay Terminal (Howard at Beale),126
Post at Kearny,South Van Ness at Market,932


In [20]:
# Minimum Duration for every (Start Station, End Station) pair.
shortest = duration.groups([0, 1], min)
shortest

Start Station,End Station,Duration min
2nd at Folsom,2nd at Folsom,61
2nd at Folsom,2nd at South Park,61
2nd at Folsom,2nd at Townsend,137
2nd at Folsom,5th at Howard,215
2nd at Folsom,Beale at Market,219
2nd at Folsom,Broadway St at Battery St,351
2nd at Folsom,Civic Center BART (7th at Market),456
2nd at Folsom,Clay at Battery,272
2nd at Folsom,Commercial at Montgomery,275
2nd at Folsom,Davis at Jackson,396


In [21]:
# Shortest trips starting at Civic Center BART, ordered by the minimum duration (column 2).
shortest.where(0, 'Civic Center BART (7th at Market)').sort(2)

Start Station,End Station,Duration min
Civic Center BART (7th at Market),Civic Center BART (7th at Market),60
Civic Center BART (7th at Market),Powell Street BART,97
Civic Center BART (7th at Market),Market at 10th,104
Civic Center BART (7th at Market),Golden Gate at Polk,141
Civic Center BART (7th at Market),San Francisco City Hall,151
Civic Center BART (7th at Market),Market at 4th,164
Civic Center BART (7th at Market),5th at Howard,179
Civic Center BART (7th at Market),South Van Ness at Market,199
Civic Center BART (7th at Market),Market at Sansome,254
Civic Center BART (7th at Market),Powell at Post (Union Square),254


In [22]:
# Load the station table and keep the rows whose landmark is 'San Francisco'.
stations = Table.read_table('201508_station_data.csv')
sf = stations.where('landmark', 'San Francisco')
# Build map markers from the lat, long and name columns of those stations.
Marker.map_table(sf.select(['lat', 'long', 'name']))

In [23]:
weather = Table.read_table('201508_weather_data.csv')
weather

PDT,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,Max Sea Level PressureIn,Mean Sea Level PressureIn,Min Sea Level PressureIn,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees,Zip
9/1/2014,83,70,57,58,56,52,86,64,42,29.86,29.82,29.76,10,10,8,16,7,20.0,0,0,,290,94107
9/2/2014,72,66,60,58,57,55,84,73,61,29.87,29.82,29.79,10,10,7,21,8,,0,5,,290,94107
9/3/2014,76,69,61,57,56,55,84,69,53,29.81,29.76,29.72,10,10,10,21,8,24.0,0,4,,276,94107
9/4/2014,74,68,61,57,57,56,84,71,57,29.81,29.76,29.72,10,10,8,22,8,25.0,0,5,,301,94107
9/5/2014,72,66,60,57,56,54,84,71,57,29.92,29.87,29.81,10,9,7,18,8,32.0,0,4,,309,94107
9/6/2014,72,66,60,55,54,52,84,69,53,29.98,29.93,29.89,10,9,7,17,9,30.0,0,4,,290,94107
9/7/2014,72,66,60,54,53,45,78,66,53,29.92,29.87,29.82,10,10,10,18,10,28.0,0,3,,293,94107
9/8/2014,68,64,59,52,51,50,72,64,55,29.96,29.9,29.86,10,10,10,18,12,22.0,0,4,,272,94107
9/9/2014,72,65,57,56,53,52,87,72,57,30.01,29.96,29.91,10,10,10,17,7,21.0,0,5,,270,94107
9/10/2014,82,68,54,59,56,51,93,66,39,29.96,29.9,29.83,10,9,5,16,5,20.0,0,1,,283,94107


In [24]:
# Weather rows for zip code 94107 only, reduced to columns 0 and 1
# ('PDT' and 'Max TemperatureF') and sorted by temperature, coldest first.
temp = weather.where('Zip', 94107).select([0, 1]).sort(1)
temp

PDT,Max TemperatureF
1/2/2015,50
1/3/2015,53
1/4/2015,54
12/27/2014,55
12/28/2014,55
12/29/2014,55
12/30/2014,55
12/12/2014,56
12/15/2014,56
1/1/2015,56


In [25]:
# Shorter labels: column 0 becomes 'Date' (the key used by later joins), column 1 'Temp'.
temp = temp.relabeled(0, 'Date').relabeled(1, 'Temp')
temp

Date,Temp
1/2/2015,50
1/3/2015,53
1/4/2015,54
12/27/2014,55
12/28/2014,55
12/29/2014,55
12/30/2014,55
12/12/2014,56
12/15/2014,56
1/1/2015,56


In [26]:
trips.column(2)

array(['8/31/2015 23:26', '8/31/2015 23:11', '8/31/2015 23:13', ...,
       '9/1/2014 0:05', '9/1/2014 0:05', '9/1/2014 0:05'], dtype=object)

In [27]:
'8/31/2015 23:26'.split()

['8/31/2015', '23:26']

In [28]:
np.array('8/31/2015 23:26'.split()).item(0)

'8/31/2015'

In [29]:
def day(time):
    """Return the date part of a 'date time' timestamp string.

    time: a string such as '8/31/2015 23:26', where the date and the clock
        time are separated by whitespace.
    Returns the first whitespace-delimited token, e.g. '8/31/2015'.
    Raises IndexError if `time` is empty or contains only whitespace.
    """
    # Index the split list directly. Wrapping it in a NumPy array only to call
    # .item(0) allocated an array for every row without changing the result.
    return time.split()[0]

# Run day on every value of column 2 ('Start Date') to get one date string per trip.
dates = trips.apply(day, 2)
dates

array(['8/31/2015', '8/31/2015', '8/31/2015', ..., '9/1/2014', '9/1/2014',
       '9/1/2014'], 
      dtype='<U10')

In [30]:
# Pair each trip's start station with the date extracted from its start timestamp.
trip_dates = trips.select('Start Station').with_column('Date', dates)
trip_dates

Start Station,Date
Harry Bridges Plaza (Ferry Building),8/31/2015
San Antonio Shopping Center,8/31/2015
Post at Kearny,8/31/2015
San Jose City Hall,8/31/2015
Embarcadero at Folsom,8/31/2015
Yerba Buena Center of the Arts (3rd @ Howard),8/31/2015
Embarcadero at Folsom,8/31/2015
Embarcadero at Sansome,8/31/2015
Beale at Market,8/31/2015
Post at Kearny,8/31/2015


In [31]:
# Number of trips on each date, sorted by the count column (quietest days first).
by_date = trip_dates.group('Date').sort(1)
by_date

Date,count
12/11/2014,107
11/30/2014,121
11/27/2014,133
2/8/2015,141
12/25/2014,150
1/1/2015,181
1/4/2015,206
4/5/2015,216
12/27/2014,231
11/23/2014,235


In [32]:
# Rename the count column (index 1) to 'Trips'.
by_date = by_date.relabeled(1, 'Trips')
by_date

Date,Trips
12/11/2014,107
11/30/2014,121
11/27/2014,133
2/8/2015,141
12/25/2014,150
1/1/2015,181
1/4/2015,206
4/5/2015,216
12/27/2014,231
11/23/2014,235


## Join

In [33]:
# Join the daily trip counts with the daily temperatures on their shared 'Date' column,
# giving Date, Trips and Temp side by side.
both = by_date.join('Date', temp)
both

Date,Trips,Temp
1/1/2015,181,56
1/10/2015,362,60
1/11/2015,327,57
1/12/2015,1344,57
1/13/2015,1368,66
1/14/2015,1296,59
1/15/2015,1229,59
1/16/2015,1082,57
1/17/2015,319,57
1/18/2015,275,65


In [34]:
# Average daily trip count over the days whose Temp is below 65.
np.average(both.where(both.column('Temp') < 65).column('Trips'))

847.41525423728808

In [35]:
# Average daily trip count over the days whose Temp is 65 or higher.
np.average(both.where(both.column('Temp') >= 65).column('Trips'))

1028.9757085020242

## Pivot

In [36]:
# Distinct values in the ' Events' column (the label is looked up with a leading space).
set(weather.column(' Events'))

{nan, 'Fog', 'Fog-Rain', 'Rain-Thunderstorm', 'Rain'}

In [37]:
def raining(event):
    """Classify a weather event label as 'Wet' or 'Dry'.

    'Rain', 'Fog-Rain' and 'Rain-Thunderstorm' are 'Wet'. Every other value,
    including 'Fog' and a missing (nan) entry, is 'Dry'.
    """
    wet_events = ('Rain', 'Fog-Rain', 'Rain-Thunderstorm')
    is_wet = any(event == wet for wet in wet_events)
    return 'Wet' if is_wet else 'Dry'
    
def temperature(k, threshold=65):
    """Label a temperature reading as "Hot" or "Cold".

    k: the temperature value to classify.
    threshold: cutoff at or above which a reading is "Hot". Defaults to 65,
        the value previously hard-coded here, so single-argument calls
        behave as before.
    Returns "Hot" when k >= threshold, otherwise "Cold".
    """
    if k >= threshold:
        return "Hot"
    return "Cold"
    
# Use only the zip code 94107 readings, the same filter applied when `temp` was built.
# `weather` carries a Zip column, so without this filter a date can appear once per
# zip code and a later join on 'Date' would no longer pair each day with one reading.
weather_94107 = weather.where('Zip', 94107)
# One row per day: the date, a Wet/Dry label from the event column and a
# Hot/Cold label from the daily maximum temperature.
climate = Table().with_columns([
        'Date', weather_94107.column(0),
        'Rain', weather_94107.apply(raining, ' Events'),
        'Temp', weather_94107.apply(temperature, 'Max TemperatureF'),
    ])

climate

Date,Rain,Temp
9/1/2014,Dry,Hot
9/2/2014,Dry,Hot
9/3/2014,Dry,Hot
9/4/2014,Dry,Hot
9/5/2014,Dry,Hot
9/6/2014,Dry,Hot
9/7/2014,Dry,Hot
9/8/2014,Dry,Hot
9/9/2014,Dry,Hot
9/10/2014,Dry,Hot


In [38]:
# Attach the Rain/Temp labels to each day's trip count, then average Trips for every
# Temp (rows) by Rain (columns) combination.
by_date.join('Date', climate).pivot('Rain', 'Temp', 'Trips', np.average)

Temp,Dry Trips,Wet Trips
Cold,925.563,628.097
Hot,1022.26,1083.67
