In [148]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [149]:
dfTrain = pd.read_csv('../../../DataSet/trip_train.csv', 
                      parse_dates=['start_date', 'end_date'], 
                      infer_datetime_format=True)

dfTest = pd.read_csv('../../../DataSet/trip_test.csv', 
                     parse_dates=['start_date', 'end_date'], 
                     infer_datetime_format=True)

dfWeather = pd.read_csv('../../../DataSet/weather.csv', parse_dates=['date'], infer_datetime_format=True)

dfStation = pd.read_csv('../../../DataSet/station.csv', parse_dates=['installation_date'], infer_datetime_format=True)

In [150]:

#Descarto las rows que no me brindan datos acerca de las precipitaciones
def lala(dfPrecipitations):
    return dfPrecipitations.events.isnull() & (dfPrecipitations.precipitation_inches != 0)

dfPrecipitations = dfWeather[['date', 'zip_code', 'events', 'precipitation_inches']]
dfPrecipitations.rename(columns={'date':'start_date'}, inplace=True)

dfPrecipitations.events = dfPrecipitations.events.apply(lambda x: x.capitalize() if pd.notnull(x) else 'Clear')
dfPrecipitations[dfPrecipitations.events == 'Clear'].precipitation_inches.unique()

array(['0', '0.01', 'T', '0.08', '0.05', '0.03', nan, '0.02'], dtype=object)

Como se puede ver, en los que quedan categorizados como 'Clear' las precipitaciones son muy bajas y, en su gran mayoría, el valor es 0. Por lo que tomo como correcta la categorización.

In [151]:
#Convierto a 0,1,2,3,4 de acuerdo al tipo de evento
events = dfPrecipitations.events.unique()
dfPrecipitations.events = dfPrecipitations.events.astype('category', categories=events).cat.codes
dfPrecipitations.events.unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [152]:
''' Utilizada como parámetro para función DataFrame.apply
    Para cada row, devuelve el zip code correspondiente a la ciudad.'''
def cityNameToZipCode(row) :

    if row.city == 'San Francisco' :
        return 94107

    if row.city == 'Redwood City' :
        return 94063

    if row.city == 'Palo Alto' :
        return 94301

    if row.city == 'Mountain View' :
        return 94041

    if row.city == 'San Jose' :
        return 95113

dfStation.rename(columns={'id':'start_station_id'}, inplace=True)
#Agrego columna de events a dfTrain
dfTrain = dfTrain.merge(dfStation[['start_station_id', 'city']], on=['start_station_id'])
dfTrain.zip_code = dfTrain.apply(cityNameToZipCode, axis=1)
dfTrain.start_date = pd.to_datetime(dfTrain.start_date.dt.date)
dfTrain = dfTrain.merge(dfPrecipitations, on=['start_date', 'zip_code'])
#Agrego columna de events a dfTest
dfTest = dfTest.merge(dfStation[['start_station_id', 'city']], on=['start_station_id'])
dfTest.zip_code = dfTest.apply(cityNameToZipCode, axis=1)
dfTest.start_date = pd.to_datetime(dfTest.start_date.dt.date)
dfTest = dfTest.merge(dfPrecipitations, on=['start_date', 'zip_code'])

In [153]:
# Convierto a los SUSCRIBER en un 0
# Convierto a los CUSTOMER en un 1
subscriptionTypes = dfTrain.subscription_type.unique()
print subscriptionTypes

# Reemplazo por 0 y 1
dfTrain.subscription_type = dfTrain.subscription_type.astype('category', categories=subscriptionTypes).cat.codes
dfTest.subscription_type = dfTest.subscription_type.astype('category', categories=subscriptionTypes).cat.codes

['Subscriber' 'Customer']


In [154]:
# GENERO TARGET, TRAIN, TEST Y TESTIDS (testids para el output)
target = dfTrain.duration
testIds = dfTest['id']


trainDateData = {'start_month':dfTrain.start_date.dt.month,
                 'start_dayOfWeek':dfTrain.start_date.dt.dayofweek, 
                 'start_hourOfDay':dfTrain.start_date.dt.hour}

testDateData = {'start_month':dfTest.start_date.dt.month,
                'start_dayOfWeek':dfTest.start_date.dt.dayofweek, 
                'start_hourOfDay':dfTest.start_date.dt.hour}

# Agrego columnas con la informacion de fechas a los dataframes
dfTrain = dfTrain[['start_station_id', 'subscription_type', 'events']].join(pd.DataFrame(trainDateData), how='outer')
dfTest = dfTest[['start_station_id', 'subscription_type', 'events']].join(pd.DataFrame(testDateData), how='outer')



In [155]:
rf = RandomForestRegressor(n_estimators=3, n_jobs=-1)

print("Volcando puntos...")
rf.fit(dfTrain, target)

print("Prediciendo...")
predictions = rf.predict(dfTest)



Volcando puntos...
Prediciendo...


In [156]:
zip(predictions, testIds)

[(2537.1545222854434, 504737),
 (891.10169082125594, 505036),
 (2537.1545222854434, 504958),
 (2537.1545222854434, 505161),
 (2537.1545222854434, 504906),
 (891.10169082125594, 504720),
 (2537.1545222854434, 505126),
 (2537.1545222854434, 505048),
 (891.10169082125594, 504616),
 (2537.1545222854434, 505015),
 (641.65604765604769, 505067),
 (575.08297755883962, 504667),
 (4787.0206133735546, 505142),
 (575.08297755883962, 504878),
 (498.7955165692008, 504669),
 (4069.2614379084966, 504819),
 (498.7955165692008, 504695),
 (579.32702020202021, 505019),
 (579.32702020202021, 504692),
 (704.47821138211384, 504843),
 (1670.7210732714138, 504783),
 (546.14281305114639, 504825),
 (546.14281305114639, 504760),
 (546.14281305114639, 504911),
 (1690.4045400238949, 504590),
 (1690.4045400238949, 504595),
 (1690.4045400238949, 504972),
 (1690.4045400238949, 504659),
 (706.80303030303037, 504604),
 (2110.6724242424243, 504872),
 (1625.9279609279608, 505011),
 (567.41364626658742, 504689),
 (2884.339