In [126]:

%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict

plt.style.use('default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [127]:
station = pd.read_csv('station.csv', low_memory=False)
trip_test = pd.read_csv('trip_test.csv', low_memory=False)
trip_train = pd.read_csv('trip_train.csv', low_memory=False)
weather = pd.read_csv('weather.csv', low_memory=False)


In [128]:
trip_train.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,6/9/2014 8:42,Market at Sansome,77,6/9/2014 8:47,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,1/26/2015 16:55,San Francisco Caltrain 2 (330 Townsend),69,1/26/2015 17:07,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,8/29/2015 15:09,Market at 10th,67,8/29/2015 15:14,Powell Street BART,39,607,Subscriber,94709


In [129]:
station.head()

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [130]:
del weather['zip_code']
weather.head()

Unnamed: 0,date,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,min_humidity,...,max_visibility_miles,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees
0,8/29/2013,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,57.0,...,10.0,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0
1,8/30/2013,78.0,69.0,60.0,61.0,58.0,56.0,90.0,70.0,50.0,...,10.0,10.0,7.0,29.0,13.0,35.0,0,2.0,,291.0
2,8/31/2013,71.0,64.0,57.0,57.0,56.0,54.0,93.0,75.0,57.0,...,10.0,10.0,10.0,26.0,15.0,31.0,0,4.0,,284.0
3,9/1/2013,74.0,66.0,58.0,60.0,56.0,53.0,87.0,68.0,49.0,...,10.0,10.0,10.0,25.0,13.0,29.0,0,4.0,,284.0
4,9/2/2013,75.0,69.0,62.0,61.0,60.0,58.0,93.0,77.0,61.0,...,10.0,10.0,6.0,23.0,12.0,30.0,0,6.0,,277.0


In [135]:
weather.columns


Index(['date', 'max_temperature_f', 'mean_temperature_f', 'min_temperature_f',
       'max_dew_point_f', 'mean_dew_point_f', 'min_dew_point_f',
       'max_humidity', 'mean_humidity', 'min_humidity',
       'max_sea_level_pressure_inches', 'mean_sea_level_pressure_inches',
       'min_sea_level_pressure_inches', 'max_visibility_miles',
       'mean_visibility_miles', 'min_visibility_miles', 'max_wind_Speed_mph',
       'mean_wind_speed_mph', 'max_gust_speed_mph', 'precipitation_inches',
       'cloud_cover', 'events', 'wind_dir_degrees'],
      dtype='object')

In [136]:
trip_train = pd.merge(trip_train, station[['id', 'lat', 'long']], left_on='start_station_id', right_on= 'id' ,how='left')


In [137]:
trip_train.drop('id_y', axis=1, inplace=True)
trip_train.rename(columns={'id_x' : 'id', 'lat': 'start_station_lat','long': 'start_station_long'}, inplace=True)

In [138]:
trip_train = pd.merge(trip_train, station[['id', 'lat', 'long']], left_on='end_station_id', right_on= 'id' ,how='left')
trip_train

Unnamed: 0,id_x,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,start_station_lat,start_station_long,lat_x,long_x,id_x.1,lat_y,long_y,id_y,lat,long
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,37.795392,-122.394203,37.804770,-122.403234,50,37.795392,-122.394203,60,37.804770,-122.403234
1,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,37.776619,-122.417385,37.795425,-122.404767,67,37.776619,-122.417385,46,37.795425,-122.404767
2,316176,334,6/9/2014 8:42,Market at Sansome,77,6/9/2014 8:47,2nd at Folsom,62,281,Subscriber,...,37.789625,-122.400811,37.785299,-122.396236,77,37.789625,-122.400811,62,37.785299,-122.396236
3,618874,666,1/26/2015 16:55,San Francisco Caltrain 2 (330 Townsend),69,1/26/2015 17:07,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,...,37.776600,-122.395470,37.789756,-122.394643,69,37.776600,-122.395470,55,37.789756,-122.394643
4,910977,318,8/29/2015 15:09,Market at 10th,67,8/29/2015 15:14,Powell Street BART,39,607,Subscriber,...,37.776619,-122.417385,37.783871,-122.408433,67,37.776619,-122.417385,39,37.783871,-122.408433
5,522083,337,10/30/2014 7:00,Townsend at 7th,65,10/30/2014 7:06,San Francisco Caltrain (Townsend at 4th),70,370,Subscriber,...,37.771058,-122.402717,37.776617,-122.395260,65,37.771058,-122.402717,70,37.776617,-122.395260
6,880809,394,8/7/2015 17:11,2nd at South Park,64,8/7/2015 17:17,2nd at Townsend,61,443,Subscriber,...,37.782259,-122.392738,37.780526,-122.390288,64,37.782259,-122.392738,61,37.780526,-122.390288
7,488938,766,10/8/2014 14:10,Powell at Post (Union Square),71,10/8/2014 14:23,San Francisco Caltrain (Townsend at 4th),70,485,Subscriber,...,37.788446,-122.408499,37.776617,-122.395260,71,37.788446,-122.408499,70,37.776617,-122.395260
8,899522,531,8/21/2015 7:53,2nd at Folsom,62,8/21/2015 8:02,San Francisco Caltrain (Townsend at 4th),70,603,Subscriber,...,37.785299,-122.396236,37.776617,-122.395260,62,37.785299,-122.396236,70,37.776617,-122.395260
9,737380,267,4/23/2015 6:36,Market at 4th,76,4/23/2015 6:40,Mechanics Plaza (Market at Battery),75,86,Customer,...,37.786305,-122.404966,37.791300,-122.399051,76,37.786305,-122.404966,75,37.791300,-122.399051


In [139]:
trip_train.drop('id_y', axis=1, inplace=True)
trip_train.rename(columns={'id_x' : 'id', 'lat': 'end_station_lat','long': 'end_station_long'}, inplace=True)
trip_train

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code,start_station_lat,start_station_long,lat_x,long_x,id.1,lat_y,long_y,end_station_lat,end_station_long
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,94602,37.795392,-122.394203,37.804770,-122.403234,50,37.795392,-122.394203,37.804770,-122.403234
1,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,94133,37.776619,-122.417385,37.795425,-122.404767,67,37.776619,-122.417385,37.795425,-122.404767
2,316176,334,6/9/2014 8:42,Market at Sansome,77,6/9/2014 8:47,2nd at Folsom,62,281,Subscriber,94107,37.789625,-122.400811,37.785299,-122.396236,77,37.789625,-122.400811,37.785299,-122.396236
3,618874,666,1/26/2015 16:55,San Francisco Caltrain 2 (330 Townsend),69,1/26/2015 17:07,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602,37.776600,-122.395470,37.789756,-122.394643,69,37.776600,-122.395470,37.789756,-122.394643
4,910977,318,8/29/2015 15:09,Market at 10th,67,8/29/2015 15:14,Powell Street BART,39,607,Subscriber,94709,37.776619,-122.417385,37.783871,-122.408433,67,37.776619,-122.417385,37.783871,-122.408433
5,522083,337,10/30/2014 7:00,Townsend at 7th,65,10/30/2014 7:06,San Francisco Caltrain (Townsend at 4th),70,370,Subscriber,94107,37.771058,-122.402717,37.776617,-122.395260,65,37.771058,-122.402717,37.776617,-122.395260
6,880809,394,8/7/2015 17:11,2nd at South Park,64,8/7/2015 17:17,2nd at Townsend,61,443,Subscriber,94107,37.782259,-122.392738,37.780526,-122.390288,64,37.782259,-122.392738,37.780526,-122.390288
7,488938,766,10/8/2014 14:10,Powell at Post (Union Square),71,10/8/2014 14:23,San Francisco Caltrain (Townsend at 4th),70,485,Subscriber,94107,37.788446,-122.408499,37.776617,-122.395260,71,37.788446,-122.408499,37.776617,-122.395260
8,899522,531,8/21/2015 7:53,2nd at Folsom,62,8/21/2015 8:02,San Francisco Caltrain (Townsend at 4th),70,603,Subscriber,94549,37.785299,-122.396236,37.776617,-122.395260,62,37.785299,-122.396236,37.776617,-122.395260
9,737380,267,4/23/2015 6:36,Market at 4th,76,4/23/2015 6:40,Mechanics Plaza (Market at Battery),75,86,Customer,21202,37.786305,-122.404966,37.791300,-122.399051,76,37.786305,-122.404966,37.791300,-122.399051


In [140]:
fechasStart = trip_train['start_date'].map(lambda x: (x.split(' ')[0]))
trip_train['startDate'] =  fechasStart
fechasEnd = trip_train['end_date'].map(lambda x: (x.split(' ')[0]))
trip_train['endDate'] =  fechasEnd

In [141]:
trip_train[trip_train['startDate'] != trip_train['endDate']]

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,start_station_long,lat_x,long_x,id.1,lat_y,long_y,end_station_lat,end_station_long,startDate,endDate
62,246208,64803,4/12/2014 17:23,Grant Avenue at Columbus Avenue,73,4/13/2014 11:23,2nd at Townsend,61,364,Customer,...,-122.407245,37.780526,-122.390288,73,37.798522,-122.407245,37.780526,-122.390288,4/12/2014,4/13/2014
410,402371,59644,8/10/2014 23:10,Yerba Buena Center of the Arts (3rd @ Howard),68,8/11/2014 15:44,Golden Gate at Polk,59,222,Customer,...,-122.401014,37.781332,-122.418603,68,37.784878,-122.401014,37.781332,-122.418603,8/10/2014,8/11/2014
984,566595,15091,12/4/2014 22:46,San Francisco City Hall,58,12/5/2014 2:57,Market at 10th,67,387,Customer,...,-122.418235,37.776619,-122.417385,58,37.778650,-122.418235,37.776619,-122.417385,12/4/2014,12/5/2014
1245,187216,30023,2/17/2014 23:27,Harry Bridges Plaza (Ferry Building),50,2/18/2014 7:47,Embarcadero at Bryant,54,584,Subscriber,...,-122.394203,37.787152,-122.388013,50,37.795392,-122.394203,37.787152,-122.388013,2/17/2014,2/18/2014
1331,658718,241766,2/26/2015 2:42,Santa Clara at Almaden,4,2/28/2015 21:52,San Jose Diridon Caltrain Station,2,46,Customer,...,-121.894902,37.329732,-121.901782,4,37.333988,-121.894902,37.329732,-121.901782,2/26/2015,2/28/2015
1397,32352,534,9/23/2013 23:57,Embarcadero at Sansome,60,9/24/2013 0:06,Mechanics Plaza (Market at Battery),75,290,Subscriber,...,-122.403234,37.791300,-122.399051,60,37.804770,-122.403234,37.791300,-122.399051,9/23/2013,9/24/2013
1654,760403,68367,5/9/2015 23:08,Davis at Jackson,42,5/10/2015 18:08,Mechanics Plaza (Market at Battery),75,523,Customer,...,-122.398436,37.791300,-122.399051,42,37.797280,-122.398436,37.791300,-122.399051,5/9/2015,5/10/2015
1664,825527,46016,6/28/2015 13:30,South Van Ness at Market,66,6/29/2015 2:17,Townsend at 7th,65,502,Customer,...,-122.418954,37.771058,-122.402717,66,37.774814,-122.418954,37.771058,-122.402717,6/28/2015,6/29/2015
1675,635323,71356,2/8/2015 13:47,Embarcadero at Vallejo,48,2/9/2015 9:36,2nd at Townsend,61,442,Customer,...,-122.398525,37.780526,-122.390288,48,37.799953,-122.398525,37.780526,-122.390288,2/8/2015,2/9/2015
1935,535575,458,11/7/2014 23:53,Mountain View Caltrain Station,28,11/8/2014 0:01,Castro Street and El Camino Real,32,102,Subscriber,...,-122.076713,37.385956,-122.083678,28,37.394358,-122.076713,37.385956,-122.083678,11/7/2014,11/8/2014


In [142]:
trip_train = pd.merge(trip_train, weather, left_on='startDate', right_on= 'date' ,how='left')
trip_train

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,...,max_visibility_miles,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,10.0,10.0,10.0,17.0,6.0,29.0,0,2.0,,272.0
1,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,10.0,10.0,10.0,14.0,3.0,20.0,0,1.0,,309.0
2,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,20.0,13.0,10.0,17.0,6.0,,0,0.0,,349.0
3,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,10.0,10.0,10.0,22.0,5.0,25.0,0,0.0,,356.0
4,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,...,10.0,10.0,10.0,23.0,6.0,29.0,0,3.0,,313.0
5,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,10.0,10.0,10.0,21.0,10.0,24.0,0,6.0,,316.0
6,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,10.0,10.0,10.0,14.0,5.0,,0,4.0,,345.0
7,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,20.0,13.0,7.0,15.0,8.0,,0,5.0,,352.0
8,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,10.0,10.0,10.0,16.0,5.0,21.0,0,1.0,,339.0
9,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,...,10.0,10.0,10.0,15.0,5.0,18.0,0,4.0,,294.0


In [144]:
del trip_train['endDate']
del trip_train['endDate']

Index(['id', 'duration', 'start_date', 'start_station_name',
       'start_station_id', 'end_date', 'end_station_name', 'end_station_id',
       'bike_id', 'subscription_type', 'zip_code', 'start_station_lat',
       'start_station_long', 'lat_x', 'long_x', 'id', 'lat_y', 'long_y',
       'end_station_lat', 'end_station_long', 'startDate', 'endDate', 'date',
       'max_temperature_f', 'mean_temperature_f', 'min_temperature_f',
       'max_dew_point_f', 'mean_dew_point_f', 'min_dew_point_f',
       'max_humidity', 'mean_humidity', 'min_humidity',
       'max_sea_level_pressure_inches', 'mean_sea_level_pressure_inches',
       'min_sea_level_pressure_inches', 'max_visibility_miles',
       'mean_visibility_miles', 'min_visibility_miles', 'max_wind_Speed_mph',
       'mean_wind_speed_mph', 'max_gust_speed_mph', 'precipitation_inches',
       'cloud_cover', 'events', 'wind_dir_degrees'],
      dtype='object')

In [4]:
#separamos en dos campos nuevos. fecha en formato yyyymmdd y hora del dia en minutos
fechasStart = trip_train['start_date'].map(lambda x: (int(dt.datetime.strptime(x.split(' ')[0], "%m/%d/%Y").strftime("%Y%m%d"))))
horasStart = trip_train['start_date'].map(lambda x: (x.split(' ')[1]))
horasStart = horasStart.map(lambda x: (int(x.split(':')[0]) * 60 + int(x.split(':')[1])))

fechasEnd = trip_train['end_date'].map(lambda x: (int(dt.datetime.strptime(x.split(' ')[0], "%m/%d/%Y").strftime("%Y%m%d"))))
horasEnd = trip_train['end_date'].map(lambda x: (x.split(' ')[1]))
horasEnd = horasEnd.map(lambda x: (int(x.split(':')[0]) * 60 + int(x.split(':')[1])))

In [5]:
del trip_train['start_date']
del trip_train['end_date']
trip_train.head()

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,Market at 10th,67,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,Market at Sansome,77,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,San Francisco Caltrain 2 (330 Townsend),69,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,Market at 10th,67,Powell Street BART,39,607,Subscriber,94709


In [6]:
trip_train['start_date'] = fechasStart
trip_train['end_date'] = fechasEnd
trip_train['start_hour'] = horasStart
trip_train['end_hour'] = horasEnd

trip_train.head()


Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,zip_code,start_date,end_date,start_hour,end_hour
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,187,Subscriber,94602,20150827,20150827,516,523
1,384043,636,Market at 10th,67,Washington at Kearny,46,417,Subscriber,94133,20140728,20140728,1326,1337
2,316176,334,Market at Sansome,77,2nd at Folsom,62,281,Subscriber,94107,20140609,20140609,522,527
3,618874,666,San Francisco Caltrain 2 (330 Townsend),69,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602,20150126,20150126,1015,1027
4,910977,318,Market at 10th,67,Powell Street BART,39,607,Subscriber,94709,20150829,20150829,909,914


In [7]:
trip_train.dropna(axis = 1, inplace = True)


In [8]:

trip_train['duration'] = trip_train[pd.to_numeric(trip_train.duration, errors='coerce').notnull()]['duration']
d = defaultdict(preprocessing.LabelEncoder)


fit = trip_train.apply(lambda x: d[x.name].fit_transform(x))


fit.apply(lambda x: d[x.name].inverse_transform(x))

trip_train = trip_train.apply(lambda x: d[x.name].transform(x))
trip_train.head()

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,start_date,end_date,start_hour,end_hour
0,546342,336,24,40,18,48,168,1,728,728,513,520
1,230234,576,28,55,72,36,395,1,333,333,1323,1334
2,189853,274,30,65,0,50,260,1,284,284,519,524
3,368447,606,52,57,68,43,612,1,515,515,1012,1024
4,548459,258,28,55,40,32,585,1,730,730,906,911


In [9]:
trip_train['is_true'] = np.random.uniform(0,1,len(trip_train)) <= 0.9
df = trip_train
train, test = df[df['is_true']==True], df[df['is_true']==False]


In [10]:
del train['is_true']
del test['is_true']
train.head()


Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,start_date,end_date,start_hour,end_hour
0,546342,336,24,40,18,48,168,1,728,728,513,520
1,230234,576,28,55,72,36,395,1,333,333,1323,1334
2,189853,274,30,65,0,50,260,1,284,284,519,524
3,368447,606,52,57,68,43,612,1,515,515,1012,1024
4,548459,258,28,55,40,32,585,1,730,730,906,911


In [11]:
features = df.columns[3:4]

features

Index(['start_station_id'], dtype='object')

In [14]:
clf = RandomForestClassifier(n_jobs=-1, cv = 20)

clf.fit(train[features], train['duration'])

TypeError: __init__() got an unexpected keyword argument 'cv'