In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import time
start = time.time()

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
train = pd.read_csv('./data/train.csv', low_memory=False)
test = pd.read_csv('./data/test.csv', low_memory=False)

In [5]:
nombres = {'subscription_type':  'sub_type',
           'start_station_name': 'ss_name',
           'end_station_name':   'es_name',
           'start_station_id':   'sid',
           'end_station_id':     'es_id',
           'start_date':         's_date',
           'end_date':           'e_date',
           'zip_code':           'z_code',
           'bike_id':            'b_id'}

train.rename(columns=nombres, inplace=True)
train.drop(['ss_name', 'es_name', 'z_code', 'id','e_date','es_id'], axis=1, inplace=True)
train.rename(columns={'sid':'id'}, inplace=True)
train['s_date'] = pd.to_datetime(train['s_date'], format='%m/%d/%Y %H:%M')
train['year'] = train['s_date'].map(lambda x:x.year)
train['month'] = train['s_date'].map(lambda x:x.month)
train['day'] = train['s_date'].map(lambda x:x.day)
train['weekday'] = train['s_date'].map(lambda x: x.weekday())
train['hour'] = train['s_date'].map(lambda x:x.hour)
train['minute'] = train['s_date'].map(lambda x:x.minute)
train['date'] = pd.DatetimeIndex(train['s_date']).normalize()
train = train.merge(pd.get_dummies(train.sub_type), left_index=True, right_index=True)
train.drop('sub_type', axis=1, inplace=True)

test.rename(columns=nombres, inplace=True)
test.drop(['ss_name', 'es_name', 'z_code', 'e_date','es_id'], axis=1, inplace=True)
test.rename(columns={'id':'id_final'}, inplace=True)
test.rename(columns={'sid':'id'}, inplace=True)
test['s_date'] = pd.to_datetime(test['s_date'], format='%m/%d/%Y %H:%M')
test['year'] = test['s_date'].map(lambda x:x.year)
test['month'] = test['s_date'].map(lambda x:x.month)
test['day'] = test['s_date'].map(lambda x:x.day)
test['weekday'] = test['s_date'].map(lambda x: x.weekday())
test['hour'] = test['s_date'].map(lambda x:x.hour)
test['minute'] = test['s_date'].map(lambda x:x.minute)
test['date'] = pd.DatetimeIndex(test['s_date']).normalize()
test = test.merge(pd.get_dummies(test.sub_type), left_index=True, right_index=True)
test.drop('sub_type', axis=1, inplace=True)

In [6]:
test.shape

(119998, 13)

In [7]:
train = train[train.duration <= 21600]

In [8]:
stations = pd.read_csv('./data/station.csv', low_memory=False)

In [9]:
stations.drop(['name','lat','long','installation_date'], axis=1, inplace=True)

In [10]:
weather = pd.read_csv('./data/weather.csv', low_memory=False)

In [11]:
def zip_ciudad(zip_code):
    if zip_code == 95113: return 'San Jose'
    elif zip_code == 94301: return 'Palo Alto'
    elif zip_code == 94107: return 'San Francisco'
    elif zip_code == 94063: return 'Redwood City'
    else: return 'Mountain View'

In [12]:
rename = {'max_temperature_f': 'max_temp', 
          'mean_temperature_f': 'mean_temp', 
          'min_temperature_f': 'min_temp', 
          'max_dew_point_f': 'max_dew', 
          'mean_dew_point_f': 'mean_dew', 
          'min_dew_point_f': 'min_dew', 
          'max_humidity': 'max_hum', 
          'mean_humidity': 'mean_hum', 
          'min_humidity': 'min_hum', 
          'max_sea_level_pressure_inches': 'max_slp', 
          'mean_sea_level_pressure_inches': 'mean_slp', 
          'min_sea_level_pressure_inches': 'min_slp', 
          'max_visibility_miles': 'max_vis', 
          'mean_visibility_miles': 'mean_vis', 
          'min_visibility_miles': 'min_vis', 
          'max_wind_Speed_mph': 'max_wind', 
          'mean_wind_speed_mph': 'mean_wind', 
          'max_gust_speed_mph': 'max_gust', 
          'precipitation_inches': 'precipitation', 
          'cloud_cover': 'cloud', 
          'wind_dir_degrees': 'wind_dir', 
          'zip_code': 'z_code'
         }

weather.rename(columns=rename, inplace=True)

weather['date'] = pd.to_datetime(weather['date'], format='%m/%d/%Y')
weather.loc[weather.events == 'rain', 'events'] = 'Rain'
weather.loc[weather.events.isnull(), 'events'] = 'Normal'
weather = weather.merge(pd.get_dummies(weather.events), left_index=True, right_index=True)

weather.drop(['precipitation'], axis=1, inplace=True)

weather['city'] = weather['z_code'].map(zip_ciudad)

for i in weather:
    weather.drop(weather.loc[weather[i].isnull()].index, inplace=True)

In [82]:
df = pd.merge(train, stations, how='inner', on='id')
test_final = pd.merge(test, stations, how='inner', on='id')

In [83]:
df = pd.merge(df, weather, how='inner', on=['date','city'])
test_final = pd.merge(test_final, weather, how='outer', on=['date','city'])

In [84]:
df = df.merge(pd.get_dummies(df.city), left_index=True, right_index=True)
test_final = test_final.merge(pd.get_dummies(test_final.city), left_index=True, right_index=True)

In [85]:
df.drop(['s_date','date','events','z_code','city'], axis=1, inplace=True)
test_final.drop(['s_date','date','events','z_code','city'], axis=1, inplace=True)

In [101]:
test_final.drop(test_final[test_final.id_final.isnull()].index, inplace=True)

In [103]:
test_final = test_final.fillna(test_final.mean())

In [104]:
test_final.isnull().sum()

id_final             0
id                   0
b_id                 0
year                 0
month                0
day                  0
weekday              0
hour                 0
minute               0
Customer             0
Subscriber           0
dock_count           0
max_temp             0
mean_temp            0
min_temp             0
max_dew              0
mean_dew             0
min_dew              0
max_hum              0
mean_hum             0
min_hum              0
max_slp              0
mean_slp             0
min_slp              0
max_vis              0
mean_vis             0
min_vis              0
max_wind             0
mean_wind            0
max_gust             0
cloud                0
wind_dir             0
Fog                  0
Fog-Rain             0
Normal               0
Rain                 0
Rain-Thunderstorm    0
Mountain View        0
Palo Alto            0
Redwood City         0
San Francisco        0
San Jose             0
dtype: int64

In [46]:
df.isnull().sum()

duration             0
id                   0
b_id                 0
year                 0
month                0
day                  0
weekday              0
hour                 0
minute               0
Customer             0
Subscriber           0
dock_count           0
max_temp             0
mean_temp            0
min_temp             0
max_dew              0
mean_dew             0
min_dew              0
max_hum              0
mean_hum             0
min_hum              0
max_slp              0
mean_slp             0
min_slp              0
max_vis              0
mean_vis             0
min_vis              0
max_wind             0
mean_wind            0
max_gust             0
cloud                0
wind_dir             0
Fog                  0
Fog-Rain             0
Normal               0
Rain                 0
Rain-Thunderstorm    0
Mountain View        0
Palo Alto            0
Redwood City         0
San Francisco        0
San Jose             0
dtype: int64

In [105]:
test_final.tail(2)

Unnamed: 0,id_final,id,b_id,year,month,day,weekday,hour,minute,Customer,...,Fog,Fog-Rain,Normal,Rain,Rain-Thunderstorm,Mountain View,Palo Alto,Redwood City,San Francisco,San Jose
119996,426726.0,23.0,689.0,2014.0,8.0,27.0,2.0,9.0,0.0,0.0,...,0.064891,0.010142,0.813209,0.109584,0.002174,0,0,1,0,0
119997,644534.0,23.0,643.0,2015.0,2.0,14.0,5.0,15.0,24.0,1.0,...,0.064891,0.010142,0.813209,0.109584,0.002174,0,0,1,0,0


In [106]:
from sklearn.preprocessing import normalize
Y = df.duration.values
df = normalize(df.drop(['duration'], axis=1))
ids_final = test_final.id_final.values
test_final = normalize(test_final.drop(['id_final'], axis=1))

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import *
from sklearn.ensemble import * 
from sklearn.tree import *
from sklearn.neural_network import *
from sklearn.linear_model import *
from sklearn.metrics import mean_squared_error as mse
from sklearn.gaussian_process import *
from sklearn.kernel_ridge import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.model_selection import *

In [None]:
def scores_de_predictores_CV_5():
    for reg in regs:
        print regs[reg],np.mean(cross_val_score(reg, x, y, cv=5, scoring="neg_mean_squared_error", n_jobs=1) * -1)

In [None]:
regs = {AdaBoostRegressor():         'AdaBoost         ',
        BaggingRegressor():          'Bagging          ',
        ExtraTreesRegressor():       'ExtraTrees       ',
        GradientBoostingRegressor(): 'GrandientBoosting',
        RandomForestRegressor():     'RandomForest     ',
        DecisionTreeRegressor():     'DecisionTree     ',
        HuberRegressor():            'Huber            ',
        LinearRegression():          'Linear           ',
        LogisticRegression():        'Logistic         ',
        PassiveAggressiveRegressor():'PassiveAggressive',
        RANSACRegressor():           'RANSAC           ',
        SGDRegressor():              'SGD              ',
        TheilSenRegressor():         'TheilSen         ',
        KernelRidge():               'KernelRidge      ',
        GaussianProcessRegressor():  'GaussianProcess  ',
        KNeighborsRegressor():       'KNN              ',
        RadiusNeighborsRegressor():  'RNN              ',
        MLPRegressor():              'MLP              ',
        PLSRegression():             'PLS              ',
        SVR():                       'SVR              ',
        ElasticNet():                'ElasticNet       ',
        BayesianRidge():             'BayesRidge       ',
        LarsCV():                    'LarsCV           ',
        RidgeCV():                   'RidgeCV          '
       }

def plots_de_predictores_3000():
    x = df[:3000]
    y = Y[:3000]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    plt.figure(figsize=(40, 40));
    i = 1
    for reg in regs:
        s1 = time.time()
        reg.fit(X_train, y_train)
        s2 = time.time()
        pred = reg.predict(X_test)
        score = mse(pred, y_test)
        plt.subplot(6, 4, i)
        plt.title(regs[reg] + str(s2-s1))
        plt.plot(pred)
        plt.plot(y_test)
        i += 1
    print 'fin'

#plots_de_predictores_3000()
#scores_de_predictores_CV_5()

In [None]:
#Seleccionamos los mejores algoritmos y realizamos CV con 100000 registros.
def cv_mejores_estimadores_100000():
    x = df[:100000]
    y = Y[:100000]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    regs = {
            BaggingRegressor():          'Bagging          ',
            ExtraTreesRegressor():       'ExtraTrees       ',
            DecisionTreeRegressor():     'DecisionTree     ',
           }

    scores_de_predictores_CV_5()

In [None]:
#GridSearch de ExtraTrees.
def gs_extra_trees_5000():
    x = df[:5000]
    y = Y[:5000]
    params = {"n_estimators": list(range(10,31))}
    ext = ExtraTreesRegressor()

    clf = GridSearchCV(ext, param_grid=params)
    clf.fit(x, y)
    print "Mejores parametros:"
    clf.best_params_

In [None]:
#Comparacion de ExtraTrees con y sin parametros con 100000 registros.
def comparacion_extra_trees_100000():
    x = df[:100000]
    y = Y[:100000]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    regs = {
            ExtraTreesRegressor():       'ExtraTrees sin parametros       ',
            ExtraTreesRegressor(n_estimators=20):    'ExtraTrees con parametros     ',
           }

    scores_de_predictores_CV_5()

In [None]:
def estimacion_final():
    X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0)
    
    #est = ExtraTreesRegressor(n_estimators=29)
    est = ExtraTreesRegressor()
    est.fit(X_train, y_train)
    predicciones = est.predict(test_final)
    return predicciones

predicciones = estimacion_final()

In [51]:
end = time.time()
print end-start

415.811530113


In [53]:
df.shape

(530864, 41)

In [54]:
test_final.shape

(120297, 41)

In [None]:
ids_final[0]

In [52]:
f=open('submission.csv','w')
f.write('id,duration\n')
for i in range(len(ids_final)):
    f.write(str(ids_final[i])+','+'%.f'%predicciones[i]+'\n')
f.close()

f=open('sumission.csv','w')
for i in predicciones:
    f.write('%.f'%(,i)+',\n')
f.close()