In [1]:
# Integer code assigned to each city name.
cities = {
    'San Francisco': 1,
    'San Jose': 2,
    'Mountain View': 3,
    'Palo Alto': 4,
    'Redwood City': 5,
}


def ciudades(ciudad):
    """Return the integer code stored in `cities` for the city name `ciudad`.

    Raises KeyError when `ciudad` is not one of the keys of `cities`.
    """
    codigo = cities[ciudad]
    return codigo

# Convert the subscription type into a numeric flag
def sub_type(sub):
    """Return 1 when `sub` equals 'Customer', and 0 for any other value."""
    return 1 if sub == 'Customer' else 0

# Integer code assigned to each weather event label.
events = {
    'Normal': 0,
    'Fog': 1,
    'Rain': 2,
    'Fog-Rain': 3,
    'Rain-Thunderstorm': 4,
}


def eventos(event):
    """Return the integer code stored in `events` for the label `event`.

    Raises KeyError when `event` is not one of the keys of `events`.
    """
    codigo = events[event]
    return codigo

In [2]:
import time
# Wall-clock timestamp taken before any work is done; the last cell subtracts
# it from a second reading to report the notebook's total run time.
start = time.time()

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the raw trip records; low_memory=False makes pandas read the file in
# one pass instead of inferring dtypes chunk by chunk.
train = pd.read_csv('./data/train.csv', low_memory=False)

In [5]:
# Short aliases for the verbose trip column names.
nombres = {'subscription_type':  'sub_type',
           'start_station_name': 'ss_name',
           'end_station_name':   'es_name',
           'start_station_id':   'sid',
           'end_station_id':     'es_id',
           'start_date':         's_date',
           'end_date':           'e_date',
           'zip_code':           'z_code',
           'bike_id':            'b_id'}

train.rename(columns=nombres, inplace=True)

# Discard the columns that are not used as features. The trip's own 'id' is
# dropped so that the start-station id ('sid') can take over the name 'id',
# which is the key used later to merge with the stations table.
train.drop(['ss_name', 'es_name', 'z_code', 'id','e_date','es_id'], axis=1, inplace=True)
train.rename(columns={'sid':'id'}, inplace=True)

train['s_date'] = pd.to_datetime(train['s_date'], format='%m/%d/%Y %H:%M')

# Calendar features of the trip start time. The vectorized .dt accessor
# yields the same values as mapping a Python lambda over every row, without
# the per-element call overhead.
train['year'] = train['s_date'].dt.year
train['month'] = train['s_date'].dt.month
train['day'] = train['s_date'].dt.day
train['weekday'] = train['s_date'].dt.weekday
train['hour'] = train['s_date'].dt.hour
train['minute'] = train['s_date'].dt.minute

# Start timestamp truncated to midnight; the key used later to merge with the
# weather table.
train['date'] = train['s_date'].dt.normalize()

# One-hot encode the subscription type, then drop the raw label column.
train = train.merge(pd.get_dummies(train.sub_type), left_index=True, right_index=True)
train.drop('sub_type', axis=1, inplace=True)

In [6]:
# Keep only trips whose duration does not exceed the cutoff
# (21600 would be 6 hours if `duration` is in seconds — TODO confirm units).
max_duration = 21600
within_limit = train.duration <= max_duration
train = train[within_limit]

In [7]:
# Load the station table (one row per station — presumably; verify against the file).
stations = pd.read_csv('./data/station.csv', low_memory=False)

In [8]:
# One-hot encode the station's city and attach the indicator columns by row index.
city_dummies = pd.get_dummies(stations.city)
stations = stations.merge(city_dummies, left_index=True, right_index=True)

# Remove the descriptive columns, including the raw city label that the
# indicator columns now replace.
unused_columns = ['name','lat','long','installation_date','city']
stations.drop(unused_columns, axis=1, inplace=True)

In [9]:
# Load the raw weather observations.
weather = pd.read_csv('./data/weather.csv', low_memory=False)

In [10]:
# Short aliases for the verbose weather column names.
rename = {'max_temperature_f': 'max_temp',
          'mean_temperature_f': 'mean_temp',
          'min_temperature_f': 'min_temp',
          'max_dew_point_f': 'max_dew',
          'mean_dew_point_f': 'mean_dew',
          'min_dew_point_f': 'min_dew',
          'max_humidity': 'max_hum',
          'mean_humidity': 'mean_hum',
          'min_humidity': 'min_hum',
          'max_sea_level_pressure_inches': 'max_slp',
          'mean_sea_level_pressure_inches': 'mean_slp',
          'min_sea_level_pressure_inches': 'min_slp',
          'max_visibility_miles': 'max_vis',
          'mean_visibility_miles': 'mean_vis',
          'min_visibility_miles': 'min_vis',
          'max_wind_Speed_mph': 'max_wind',
          'mean_wind_speed_mph': 'mean_wind',
          'max_gust_speed_mph': 'max_gust',
          'precipitation_inches': 'precipitation',
          'cloud_cover': 'cloud',
          'wind_dir_degrees': 'wind_dir',
          'zip_code': 'z_code'
         }

weather.rename(columns=rename, inplace=True)

weather['date'] = pd.to_datetime(weather['date'], format='%m/%d/%Y')

# Unify the lowercase 'rain' spelling with 'Rain' and label rows that have no
# recorded event as 'Normal', so the one-hot encoding below gets one column
# per distinct event.
weather.loc[weather.events == 'rain', 'events'] = 'Rain'
weather.loc[weather.events.isnull(), 'events'] = 'Normal'
weather = weather.merge(pd.get_dummies(weather.events), left_index=True, right_index=True)
# Ordinal alternative (disabled): weather.events = weather.events.map(eventos)

# 'T' entries (presumably "trace" amounts — TODO confirm) are replaced by 0 so
# the column can be converted to float.
weather.loc[weather.precipitation == 'T', 'precipitation'] = 0
weather.precipitation = weather.precipitation.astype(np.float64)

# Remove every row that has a missing value in any column. This is what the
# previous column-by-column drop loop did, expressed as a single call.
weather.dropna(inplace=True)

In [11]:
# Attach the station attributes to every trip, matching the trip's 'id' (its
# start-station id after the earlier rename) against the station 'id'.
# NOTE(review): how='right' keeps every station row, including any station
# without trips — confirm this is intended rather than an inner join.
df = pd.merge(train, stations, how='right', on='id')

In [12]:
# Dimensions of the trips/stations merge result.
df.shape

(547395, 19)

In [13]:
# Attach the weather columns to every trip, matching on calendar date only.
# NOTE(review): the recorded shapes show 547,395 rows before this merge and
# 2,035,506 after it, so each trip appears to match several weather rows for
# the same date. The weather table also carries a 'z_code' column, so these are
# presumably one row per zip code — confirm whether the join should also match
# on location. how='right' additionally keeps weather rows that match no trip.
df = pd.merge(df, weather, how='right', on='date')

In [14]:
# Dimensions after the weather merge; compare the row count with the shape
# displayed before the merge to check for unintended row multiplication.
df.shape

(2035506, 47)

In [15]:
# Remove the start timestamp, the date merge key and the raw events label;
# calendar features and one-hot event columns derived from them remain in df.
df.drop(['s_date','date','events'], axis=1, inplace=True)

In [16]:
# Number of missing values per column in the merged frame.
df.isnull().sum()

duration             0
id                   0
b_id                 0
year                 0
month                0
day                  0
weekday              0
hour                 0
minute               0
Customer             0
Subscriber           0
dock_count           0
Mountain View        0
Palo Alto            0
Redwood City         0
San Francisco        0
San Jose             0
max_temp             0
mean_temp            0
min_temp             0
max_dew              0
mean_dew             0
min_dew              0
max_hum              0
mean_hum             0
min_hum              0
max_slp              0
mean_slp             0
min_slp              0
max_vis              0
mean_vis             0
min_vis              0
max_wind             0
mean_wind            0
max_gust             0
precipitation        0
cloud                0
wind_dir             0
z_code               0
Fog                  0
Fog-Rain             0
Normal               0
Rain                 0
Rain-Thunde

In [17]:
from sklearn.preprocessing import normalize
# Target vector: the trip duration.
Y = df.duration.values
# Scale the remaining columns. `df` is rebound to normalize()'s return value,
# so from this point on it is no longer the merged DataFrame.
# NOTE(review): per scikit-learn's documentation normalize() defaults to
# axis=1, i.e. it rescales each row (sample) to unit norm rather than each
# column (feature) — confirm that this is the intended scaling.
df = normalize(df.drop(['duration'], axis=1))

In [18]:
#from pandas.tseries.holiday import USFederalHolidayCalendar
#from pandas.tseries.offsets import CustomBusinessDay

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.ensemble import * 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score
# KFold is taken from model_selection, which this cell already depends on;
# the sklearn.cross_validation module is deprecated and absent from later
# scikit-learn releases. Note that model_selection.KFold is constructed with
# n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge




In [19]:
# Restrict the experiment to the first rows of the feature matrix and the
# matching entries of the target vector.
sample_size = 1000
x, y = df[:sample_size], Y[:sample_size]

In [20]:
# Hold out 20% of the sample for evaluation; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [21]:
# Candidate models, keyed by an unfitted estimator instance; each value is a
# label padded to a fixed width so the printed scores line up.
# NOTE(review): LogisticRegression is listed here although scikit-learn
# documents it as a classifier — confirm it belongs in a regression comparison.
regs = {AdaBoostRegressor():'AdaBoost         ',
BaggingRegressor():'Bagging          ',
ExtraTreesRegressor():'ExtraTrees       ',
GradientBoostingRegressor():'GrandientBoosting',
RandomForestRegressor():'RandomForest     ',
DecisionTreeRegressor():'DecisionTree     ',
ARDRegression():'ARD              ',
HuberRegressor():'Huber            ',
LinearRegression():'Linear           ',
LogisticRegression():'Logistic         ',
PassiveAggressiveRegressor():'PassiveAggressive',
RANSACRegressor():'RANSAC           ',
SGDRegressor():'SGD              ',
TheilSenRegressor():'TheilSen         ',
KernelRidge():'KernelRidge      ',
GaussianProcessRegressor():'GaussianProcess  '}

# Fit every candidate on the training split and report its mean squared error
# on the held-out split. Iterating in label order makes the report order the
# same on every run; plain iteration over a dict keyed by estimator instances
# gives an arbitrary order.
for reg, label in sorted(regs.items(), key=lambda item: item[1]):
    reg.fit(X_train, y_train)
    # mean_squared_error takes (y_true, y_pred); the value is symmetric, the
    # order only follows the documented signature.
    error = mse(y_test, reg.predict(X_test))
    # Single-argument print call: same text under Python 2, valid under Python 3.
    print('%s %s' % (label, error))

GrandientBoosting 3558784.77579
RANSAC            200404640936.0
RandomForest      3481778.1345
ARD               3689569.58531
SGD               3690041.98287
PassiveAggressive 3749869.89779
DecisionTree      2092393.92
TheilSen          3564378.39385
AdaBoost          3729020.82413
KernelRidge       3689734.05241
Bagging           2905468.28285
Linear            3512080.5393
GaussianProcess   3515935.05244
ExtraTrees        2718297.32515
Logistic          3722314.815
Huber             3735942.7178


In [22]:
# Total wall-clock seconds elapsed since `start` was recorded in the timing
# cell at the top of the notebook.
end = time.time()
# Parenthesized single-argument form prints the same text under Python 2 and
# is also valid under Python 3, where the print statement is a SyntaxError.
print(end - start)

39.0715739727
