In [1]:

%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict

plt.style.use('default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [2]:
station = pd.read_csv('station.csv', low_memory=False)
trip_test = pd.read_csv('trip_test.csv', low_memory=False)
trip_train = pd.read_csv('trip_train.csv', low_memory=False)
weather = pd.read_csv('weather.csv', low_memory=False)
status = pd.read_csv('status.csv', low_memory=True)


In [3]:
trip_train.head()

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,8/27/2015 8:43,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,7/28/2014 22:06,Market at 10th,67,7/28/2014 22:17,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,6/9/2014 8:42,Market at Sansome,77,6/9/2014 8:47,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,1/26/2015 16:55,San Francisco Caltrain 2 (330 Townsend),69,1/26/2015 17:07,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,8/29/2015 15:09,Market at 10th,67,8/29/2015 15:14,Powell Street BART,39,607,Subscriber,94709


In [4]:
#separamos en dos campos nuevos. fecha en formato yyyymmdd y hora del dia en minutos
fechasStart = trip_train['start_date'].map(lambda x: (int(dt.datetime.strptime(x.split(' ')[0], "%m/%d/%Y").strftime("%Y%m%d"))))
horasStart = trip_train['start_date'].map(lambda x: (x.split(' ')[1]))
horasStart = horasStart.map(lambda x: (int(x.split(':')[0]) * 60 + int(x.split(':')[1])))

fechasEnd = trip_train['end_date'].map(lambda x: (int(dt.datetime.strptime(x.split(' ')[0], "%m/%d/%Y").strftime("%Y%m%d"))))
horasEnd = trip_train['end_date'].map(lambda x: (x.split(' ')[1]))
horasEnd = horasEnd.map(lambda x: (int(x.split(':')[0]) * 60 + int(x.split(':')[1])))

In [5]:
del trip_train['start_date']
del trip_train['end_date']
trip_train.head()

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,Market at 10th,67,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,Market at Sansome,77,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,San Francisco Caltrain 2 (330 Townsend),69,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,Market at 10th,67,Powell Street BART,39,607,Subscriber,94709


In [6]:
trip_train['start_date'] = fechasStart
trip_train['end_date'] = fechasEnd
trip_train['start_hour'] = horasStart
trip_train['end_hour'] = horasEnd

trip_train.head()


Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,zip_code,start_date,end_date,start_hour,end_hour
0,907649,396,Harry Bridges Plaza (Ferry Building),50,Embarcadero at Sansome,60,187,Subscriber,94602,20150827,20150827,516,523
1,384043,636,Market at 10th,67,Washington at Kearny,46,417,Subscriber,94133,20140728,20140728,1326,1337
2,316176,334,Market at Sansome,77,2nd at Folsom,62,281,Subscriber,94107,20140609,20140609,522,527
3,618874,666,San Francisco Caltrain 2 (330 Townsend),69,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602,20150126,20150126,1015,1027
4,910977,318,Market at 10th,67,Powell Street BART,39,607,Subscriber,94709,20150829,20150829,909,914


In [7]:
trip_train.dropna(axis = 1, inplace = True)


In [8]:

trip_train['duration'] = trip_train[pd.to_numeric(trip_train.duration, errors='coerce').notnull()]['duration']
d = defaultdict(preprocessing.LabelEncoder)


fit = trip_train.apply(lambda x: d[x.name].fit_transform(x))


fit.apply(lambda x: d[x.name].inverse_transform(x))

trip_train = trip_train.apply(lambda x: d[x.name].transform(x))
trip_train.head()

Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,start_date,end_date,start_hour,end_hour
0,546342,336,24,40,18,48,168,1,728,728,513,520
1,230234,576,28,55,72,36,395,1,333,333,1323,1334
2,189853,274,30,65,0,50,260,1,284,284,519,524
3,368447,606,52,57,68,43,612,1,515,515,1012,1024
4,548459,258,28,55,40,32,585,1,730,730,906,911


In [9]:
trip_train['is_true'] = np.random.uniform(0,1,len(trip_train)) <= 0.9
df = trip_train
train, test = df[df['is_true']==True], df[df['is_true']==False]


In [10]:
del train['is_true']
del test['is_true']
train.head()


Unnamed: 0,id,duration,start_station_name,start_station_id,end_station_name,end_station_id,bike_id,subscription_type,start_date,end_date,start_hour,end_hour
0,546342,336,24,40,18,48,168,1,728,728,513,520
1,230234,576,28,55,72,36,395,1,333,333,1323,1334
2,189853,274,30,65,0,50,260,1,284,284,519,524
3,368447,606,52,57,68,43,612,1,515,515,1012,1024
4,548459,258,28,55,40,32,585,1,730,730,906,911


In [11]:
features = df.columns[3:4]

features

Index(['start_station_id'], dtype='object')

In [14]:
clf = RandomForestClassifier(n_jobs=-1, cv = 20)

clf.fit(train[features], train['duration'])

TypeError: __init__() got an unexpected keyword argument 'cv'