In [1]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import warnings
from collections import Counter

warnings.filterwarnings('ignore')

In [2]:
np.random.seed(1987)
N = 100000 # number of sample rows in plots
t0 = dt.datetime.now()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#weather = pd.read_csv('weather_data_nyc_centralpark_2016.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.date
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)
train['store_and_fwd_flag'] = 1 * (train.store_and_fwd_flag.values == 'Y')
test['store_and_fwd_flag'] = 1 * (test.store_and_fwd_flag.values == 'Y')
train['check_trip_duration'] = (train['dropoff_datetime'] - train['pickup_datetime']).map(lambda x: x.total_seconds())
duration_difference = train[np.abs(train['check_trip_duration'].values  - train['trip_duration'].values) > 1]
train['log_trip_duration'] = np.log(train['trip_duration'].values + 1)
print('Trip_duration and datetimes are ok.') if len(duration_difference[['pickup_datetime', 'dropoff_datetime', 'trip_duration', 'check_trip_duration']]) == 0 else print('Ooops.')

Trip_duration and datetimes are ok.


In [4]:
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords)
train['pickup_pca0'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 0]
train['pickup_pca1'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 1]
train['dropoff_pca0'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
train['dropoff_pca1'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 1]
test['pickup_pca0'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 0]
test['pickup_pca1'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 1]
test['dropoff_pca0'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
test['dropoff_pca1'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 1]

In [5]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    a = haversine_array(lat1, lng1, lat1, lng2)
    b = haversine_array(lat1, lng1, lat2, lng1)
    return a + b

def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

train.loc[:, 'distance_haversine'] = haversine_array(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'direction'] = bearing_array(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'pca_manhattan'] = np.abs(train['dropoff_pca1'] - train['pickup_pca1']) + np.abs(train['dropoff_pca0'] - train['pickup_pca0'])

test.loc[:, 'distance_haversine'] = haversine_array(test['pickup_latitude'].values, test['pickup_longitude'].values, test['dropoff_latitude'].values, test['dropoff_longitude'].values)
test.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(test['pickup_latitude'].values, test['pickup_longitude'].values, test['dropoff_latitude'].values, test['dropoff_longitude'].values)
test.loc[:, 'direction'] = bearing_array(test['pickup_latitude'].values, test['pickup_longitude'].values, test['dropoff_latitude'].values, test['dropoff_longitude'].values)
test.loc[:, 'pca_manhattan'] = np.abs(test['dropoff_pca1'] - test['pickup_pca1']) + np.abs(test['dropoff_pca0'] - test['pickup_pca0'])

train.loc[:, 'center_latitude'] = (train['pickup_latitude'].values + train['dropoff_latitude'].values) / 2
train.loc[:, 'center_longitude'] = (train['pickup_longitude'].values + train['dropoff_longitude'].values) / 2
test.loc[:, 'center_latitude'] = (test['pickup_latitude'].values + test['dropoff_latitude'].values) / 2
test.loc[:, 'center_longitude'] = (test['pickup_longitude'].values + test['dropoff_longitude'].values) / 2

In [6]:
train.loc[:, 'pickup_weekday'] = train['pickup_datetime'].dt.weekday
train.loc[:, 'pickup_hour_weekofyear'] = train['pickup_datetime'].dt.weekofyear
train.loc[:, 'pickup_hour'] = train['pickup_datetime'].dt.hour
train.loc[:, 'pickup_minute'] = train['pickup_datetime'].dt.minute
train.loc[:, 'pickup_dt'] = (train['pickup_datetime'] - train['pickup_datetime'].min()).dt.total_seconds()
train.loc[:, 'pickup_week_hour'] = train['pickup_weekday'] * 24 + train['pickup_hour']

test.loc[:, 'pickup_weekday'] = test['pickup_datetime'].dt.weekday
test.loc[:, 'pickup_hour_weekofyear'] = test['pickup_datetime'].dt.weekofyear
test.loc[:, 'pickup_hour'] = test['pickup_datetime'].dt.hour
test.loc[:, 'pickup_minute'] = test['pickup_datetime'].dt.minute
test.loc[:, 'pickup_dt'] = (test['pickup_datetime'] - train['pickup_datetime'].min()).dt.total_seconds()
test.loc[:, 'pickup_week_hour'] = test['pickup_weekday'] * 24 + test['pickup_hour']

In [7]:
train.loc[:, 'avg_speed_h'] = 1000 * train['distance_haversine'] / train['trip_duration']
train.loc[:, 'avg_speed_m'] = 1000 * train['distance_dummy_manhattan'] / train['trip_duration']
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 3)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 3)
# Average speed for regions
gby_cols = ['pickup_lat_bin', 'pickup_long_bin']
coord_speed = train.groupby(gby_cols).mean()[['avg_speed_h']].reset_index()
coord_count = train.groupby(gby_cols).count()[['id']].reset_index()
coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
coord_stats = coord_stats[coord_stats['id'] > 100]

In [8]:
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 3)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 3)
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 2)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 2)
train.loc[:, 'center_lat_bin'] = np.round(train['center_latitude'], 2)
train.loc[:, 'center_long_bin'] = np.round(train['center_longitude'], 2)
train.loc[:, 'pickup_dt_bin'] = (train['pickup_dt'] // (3 * 3600))
test.loc[:, 'pickup_lat_bin'] = np.round(test['pickup_latitude'], 2)
test.loc[:, 'pickup_long_bin'] = np.round(test['pickup_longitude'], 2)
test.loc[:, 'center_lat_bin'] = np.round(test['center_latitude'], 2)
test.loc[:, 'center_long_bin'] = np.round(test['center_longitude'], 2)
test.loc[:, 'pickup_dt_bin'] = (test['pickup_dt'] // (3 * 3600))

In [9]:
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

train.loc[:, 'pickup_cluster'] = kmeans.predict(train[['pickup_latitude', 'pickup_longitude']])
train.loc[:, 'dropoff_cluster'] = kmeans.predict(train[['dropoff_latitude', 'dropoff_longitude']])
test.loc[:, 'pickup_cluster'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']])
test.loc[:, 'dropoff_cluster'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']])
t1 = dt.datetime.now()
print('Time till clustering: %i seconds' % (t1 - t0).seconds)

Time till clustering: 60 seconds


In [10]:
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    gby = train.groupby(gby_col).mean()[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']]
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    train = pd.merge(train, gby, how='left', left_on=gby_col, right_index=True)
    test = pd.merge(test, gby, how='left', left_on=gby_col, right_index=True)

for gby_cols in [['center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'pickup_cluster'],  ['pickup_hour', 'dropoff_cluster'],
                 ['pickup_cluster', 'dropoff_cluster']]:
    coord_speed = train.groupby(gby_cols).mean()[['avg_speed_h']].reset_index()
    coord_count = train.groupby(gby_cols).count()[['id']].reset_index()
    coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
    coord_stats = coord_stats[coord_stats['id'] > 100]
    coord_stats.columns = gby_cols + ['avg_speed_h_%s' % '_'.join(gby_cols), 'cnt_%s' %  '_'.join(gby_cols)]
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)
    test = pd.merge(test, coord_stats, how='left', on=gby_cols)

In [11]:
group_freq = '60min'
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

train['dropoff_cluster_count'] = train[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)
test['dropoff_cluster_count'] = test[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)

In [12]:
# Count how many trips are going from each cluster over time
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'pickup_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('pickup_cluster').rolling('240min').mean() \
    .drop('pickup_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'pickup_cluster_count'})

train['pickup_cluster_count'] = train[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0)
test['pickup_cluster_count'] = test[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0)

In [13]:
train = train.fillna(train.median())
test = test.fillna(test.median())
fr1 = pd.read_csv('fastest_routes_train_part_1.csv')
fr2 = pd.read_csv('fastest_routes_train_part_2.csv')
test_street_info = pd.read_csv('fastest_routes_test.csv')
train_street_info = pd.concat((fr1, fr2))

In [14]:
def dicc(z):
    return dict(zip(z[0],z[1]))

In [15]:
a=train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()
b=train_street_info['distance_per_step'].apply(lambda x: x.split("|")).values.tolist()
c=train_street_info['travel_time_per_step'].apply(lambda x: x.split("|")).values.tolist()
u = list(zip(a,b))
v = list(zip(a,c))
train_dist_dict = np.apply_along_axis(dicc,1,u)
train_time_dict = np.apply_along_axis(dicc,1,v)
del a
del b
del c
del u 
del v

In [16]:
a=test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()
b=test_street_info['distance_per_step'].apply(lambda x: x.split("|")).values.tolist()
c=test_street_info['travel_time_per_step'].apply(lambda x: x.split("|")).values.tolist()
u = list(zip(a,b))
v = list(zip(a,c))
test_dist_dict = np.apply_along_axis(dicc,1,u)
test_time_dict = np.apply_along_axis(dicc,1,v)
del a
del b
del c
del u
del v

In [17]:
train_street_info['num_left_turns'] = [x.count('left') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_left_turns'] = [x.count('left') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_slightleft_turns'] = [x.count('slight left') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_slightleft_turns'] = [x.count('slight left') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_sharpleft_turns'] = [x.count('sharp left') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_sharpleft_turns'] = [x.count('sharp left') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]

train_street_info['num_uturns'] = [x.count('uturn') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_uturns'] = [x.count('uturn') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_none'] = [x.count('none') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_none'] = [x.count('none') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_straight'] = [x.count('straight') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_straight'] = [x.count('straight') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]

train_street_info['num_right_turns'] = [x.count('right') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_right_turns'] = [x.count('right') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_slightright_turns'] = [x.count('slight right') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_slightright_turns'] = [x.count('slight right') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['num_sharpright_turns'] = [x.count('sharp right') for x in train_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['num_sharpright_turns'] = [x.count('sharp right') for x in test_street_info['step_direction'].apply(lambda x: x.split("|")).values.tolist()]

train_street_info['end_of_road_count'] = [x.count('end of road') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['end_of_road_count'] = [x.count('end of road') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['new_name_count'] = [x.count('new name') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['new_name_count'] = [x.count('new name') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['rotary_count'] = [x.count('rotary') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['rotary_count'] = [x.count('rotary') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['merge_count'] = [x.count('merge') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['merge_count'] = [x.count('merge') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['continue_count'] = [x.count('continue') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['continue_count'] = [x.count('continue') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['onramp_count'] = [x.count('on ramp') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['onramp_count'] = [x.count('on ramp') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['offramp_count'] = [x.count('off ramp') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['offramp_count'] = [x.count('off ramp') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['fork_count'] = [x.count('fork') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['fork_count'] = [x.count('fork') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['turn_count'] = [x.count('turn') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['turn_count'] = [x.count('turn') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['roundabout_count'] = [x.count('roundabout') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['roundabout_count'] = [x.count('roudabout') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
train_street_info['roundabout_turn_count'] = [x.count('roundabout turn') for x in train_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['roundabout_turn_count'] = [x.count('roundabout turn') for x in test_street_info['step_maneuvers'].apply(lambda x: x.split("|")).values.tolist()]

train_street_info['arrive_tm'] = [d['arrive'] if 'arrive' in d else 0 for d in train_time_dict]
test_street_info['arrive_tm'] = [d['arrive'] if 'arrive' in d else 0 for d in test_time_dict]
train_street_info['depart_tm'] = [d['depart'] if 'depart' in d else 0 for d in train_time_dict]
test_street_info['depart_tm'] = [d['depart'] if 'depart' in d else 0 for d in test_time_dict]
train_street_info['end_of_road_tm'] = [d['end of road'] if 'end of road' in d else 0 for d in train_time_dict]
test_street_info['end_of_road_tm'] = [d['end of road'] if 'end of road' in d else 0 for d in test_time_dict]
train_street_info['new_name_tm'] = [d['new name'] if 'new name' in d else 0 for d in train_time_dict]
test_street_info['new_name_tm'] = [d['new name'] if 'new name' in d else 0 for d in test_time_dict]
train_street_info['rotary_tm'] = [d['rotary'] if 'rotary' in d else 0 for d in train_time_dict ]
test_street_info['rotary_tm'] = [d['rotary'] if 'rotary' in d else 0 for d in test_time_dict]
train_street_info['merge_tm'] = [d['merge'] if 'merge' in d else 0 for d in train_time_dict]
test_street_info['merge_tm'] = [d['merge'] if 'merge' in d else 0 for d in test_time_dict]
train_street_info['continue_tm'] = [d['continue'] if 'continue' in d else 0 for d in train_time_dict]
test_street_info['continue_tm'] = [d['continue'] if 'continue' in d else 0 for d in test_time_dict]
train_street_info['onramp_tm'] = [d['on ramp'] if 'on ramp' in d else 0 for d in train_time_dict]
test_street_info['onramp_tm'] = [d['on ramp'] if 'on ramp' in d else 0 for d in test_time_dict]
train_street_info['offramp_tm'] = [d['off ramp'] if 'off ramp' in d else 0 for d in train_time_dict]
test_street_info['offramp_tm'] = [d['off ramp'] if 'off ramp' in d else 0 for d in test_time_dict]
train_street_info['fork_tm'] = [d['fork'] if 'fork' in d else 0 for d in train_time_dict]
test_street_info['fork_tm'] = [d['fork'] if 'fork' in d else 0 for d in test_time_dict]
train_street_info['turn_tm'] = [d['turn'] if 'turn' in d else 0 for d in train_time_dict]
test_street_info['turn_tm'] = [d['turn'] if 'turn' in d else 0 for d in test_time_dict]
train_street_info['roundabout_tm'] = [d['roundabout'] if 'roundabout' in d else 0 for d in train_time_dict]
test_street_info['roundabout_tm'] = [d['roundabout'] if 'roundabout' in d else 0 for d in test_time_dict]
train_street_info['roundabout_turn_tm'] = [d['roundabout turn'] if 'roudabout turn' in d else 0 for d in train_time_dict]
test_street_info['roundabout_turn_tm'] = [d['roundabout turn'] if 'roundabout turn' in d else 0 for d in test_time_dict]

train_street_info['arrive_ds'] = [d['arrive'] if 'arrive' in d else 0 for d in train_dist_dict]
test_street_info['arrive_ds'] = [d['arrive'] if 'arrive' in d else 0 for d in test_dist_dict]
train_street_info['depart_ds'] = [d['depart'] if 'depart' in d else 0 for d in train_dist_dict]
test_street_info['depart_ds'] = [d['depart'] if 'depart' in d else 0 for d in test_dist_dict]
train_street_info['end_of_road_ds'] = [d['end of road'] if 'end of road' in d else 0 for d in train_dist_dict]
test_street_info['end_of_road_ds'] = [d['end of road'] if 'end of road' in d else 0 for d in test_dist_dict]
train_street_info['new_name_ds'] = [d['new name'] if 'new name' in d else 0 for d in train_dist_dict]
test_street_info['new_name_ds'] = [d['new name'] if 'new name' in d else 0 for d in test_dist_dict]
train_street_info['rotary_ds'] = [d['rotary'] if 'rotary' in d else 0 for d in train_dist_dict]
test_street_info['rotary_ds'] = [d['rotary'] if 'rotary' in d else 0 for d in test_dist_dict]
train_street_info['merge_ds'] = [d['merge'] if 'merge' in d else 0 for d in train_dist_dict]
test_street_info['merge_ds'] = [d['merge'] if 'merge' in d else 0 for d in test_dist_dict]
train_street_info['continue_ds'] = [d['continue'] if 'continue' in d else 0 for d in train_dist_dict]
test_street_info['continue_ds'] = [d['continue'] if 'continue' in d else 0 for d in test_dist_dict]
train_street_info['onramp_ds'] = [d['on ramp'] if 'on ramp' in d else 0 for d in train_dist_dict]
test_street_info['onramp_ds'] = [d['on ramp'] if 'on ramp' in d else 0 for d in test_dist_dict]
train_street_info['offramp_ds'] = [d['off ramp'] if 'off ramp' in d else 0 for d in train_dist_dict]
test_street_info['offramp_ds'] = [d['off ramp'] if 'off ramp' in d else 0 for d in test_dist_dict]
train_street_info['fork_ds'] = [d['fork'] if 'fork' in d else 0 for d in train_dist_dict]
test_street_info['fork_ds'] = [d['fork'] if 'fork' in d else 0 for d in test_dist_dict]
train_street_info['turn_ds'] = [d['turn'] if 'turn' in d else 0 for d in train_dist_dict]
test_street_info['turn_ds'] = [d['turn'] if 'turn' in d else 0 for d in test_dist_dict]
train_street_info['roundabout_ds'] = [d['roundabout']if 'roundabout' in d else 0 for d in train_dist_dict]
test_street_info['roundabout_ds'] = [d['roundabout'] if 'roudabout' in d else 0 for d in test_dist_dict]
train_street_info['roundabout_turn_ds'] = [d['roundabout turn'] if 'roudabout turn' in d else 0 for d in train_dist_dict]
test_street_info['roundabout_turn_ds'] = [d['roundabout turn'] if 'roundabout turn' in d else 0 for d in test_dist_dict]

train_street_info['street_count'] = [len(x) for x in train_street_info['street_for_each_step'].apply(lambda x: x.split("|")).values.tolist()]
test_street_info['street_count'] = [len(x) for x in test_street_info['street_for_each_step'].apply(lambda x: x.split("|")).values.tolist()]

#train_street_info['start_eq_end'] = [train_street_info['starting_street'] == train_street_info['end_street']]
#test_street_info['start_eq_end'] = [test_street_info['starting_street'] == test_street_info['end_street']]

#usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps']
dropcols=['starting_street','end_street','street_for_each_step','distance_per_step',
          'travel_time_per_step','step_maneuvers','step_direction','step_location_list']
trainusecols = [a for a in train_street_info.columns.tolist() if a not in dropcols]
testusecols = [a for a in test_street_info.columns.tolist() if a not in dropcols]

train = train.merge(train_street_info[trainusecols], how='left', on='id')
test = test.merge(test_street_info[testusecols], how='left', on='id')
train_street_info.head()
#train_street_info
#test_street_info
#del test_dist_dict
#del test_time_dict
#del train_dist_dict
#del train_time_dict

Unnamed: 0,id,starting_street,end_street,total_distance,total_travel_time,number_of_steps,street_for_each_step,distance_per_step,travel_time_per_step,step_maneuvers,...,rotary_ds,merge_ds,continue_ds,onramp_ds,offramp_ds,fork_ds,turn_ds,roundabout_ds,roundabout_turn_ds,street_count
0,id2875421,Columbus Circle,East 65th Street,2009.1,164.9,5,Columbus Circle|Central Park West|65th Street ...,0|576.4|885.6|547.1|0,0|61.1|60.1|43.7|0,depart|rotary|turn|new name|arrive,...,576.4,0,0.0,0,0,0,885.6,0,0,5
1,id2377394,2nd Avenue,Washington Square West,2513.2,332.0,6,2nd Avenue|East 13th Street|5th Avenue|Washing...,877.3|836.5|496.1|164.2|139.1|0,111.7|109|69.9|25.8|15.6|0,depart|turn|turn|end of road|continue|arrive,...,0.0,0,139.1,0,0,0,496.1,0,0,6
2,id3504673,Greenwich Street,Broadway,1779.4,235.8,4,Greenwich Street|Park Place|Broadway|Broadway,644.2|379.9|755.3|0,80.5|50.8|104.5|0,depart|turn|end of road|arrive,...,0.0,0,0.0,0,0,0,379.9,0,0,4
3,id2181028,Broadway,West 81st Street,1614.9,140.1,5,Broadway|West 86th Street|Columbus Avenue|West...,617|427.4|412.2|158.3|0,56|36|37.8|10.3|0,depart|turn|turn|turn|arrive,...,0.0,0,0.0,0,0,0,158.3,0,0,5
4,id0801584,Lexington Avenue,West 31st Street,1393.5,189.4,5,Lexington Avenue|East 27th Street|Madison Aven...,18.9|311.9|313.3|749.4|0,6.3|42.9|48.4|91.8|0,depart|turn|turn|turn|arrive,...,0.0,0,0.0,0,0,0,749.4,0,0,5


In [18]:
print(np.setdiff1d(train.columns, test.columns))

['avg_speed_h' 'avg_speed_m' 'check_trip_duration' 'dropoff_datetime'
 'log_trip_duration' 'trip_duration']


In [19]:
del train_street_info
del test_street_info
del test_dist_dict
del test_time_dict
del train_dist_dict
del train_time_dict

In [20]:
#feature_names = list(train.columns)
print(np.setdiff1d(train.columns, test.columns))
# do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime', 
#                            'trip_duration', 'check_trip_duration','pickup_date', 'avg_speed_h', 
#                            'avg_speed_m', 'pickup_lat_bin', 'pickup_long_bin',
#                            'center_lat_bin', 'center_long_bin', 'pickup_dt_bin', 
#                            'pickup_datetime_group']
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime', 
                           'trip_duration', 'check_trip_duration','pickup_date', 'avg_speed_h', 
                           'avg_speed_m', 'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin', 'pickup_dt_bin', 
                           'pickup_datetime_group']

feature_names = [f for f in train.columns if f not in do_not_use_for_training]
#print(feature_names)
print('We have %i features.' % len(feature_names))
train[feature_names].count()
y = np.log(train['trip_duration'].values + 1)
#y = np.log(train['trip_duration'].loc[y_is_within_cut].values + 1)

t1 = dt.datetime.now()
print('Feature extraction time: %i seconds' % (t1 - t0).seconds)

['avg_speed_h' 'avg_speed_m' 'check_trip_duration' 'dropoff_datetime'
 'log_trip_duration' 'trip_duration']
We have 106 features.
Feature extraction time: 668 seconds


In [22]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,rotary_ds,merge_ds,continue_ds,onramp_ds,offramp_ds,fork_ds,turn_ds,roundabout_ds,roundabout_turn_ds,street_count
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,576.4,0.0,0.0,0.0,0.0,0.0,885.6,0,0.0,5.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,0.0,0.0,139.1,0.0,0.0,0.0,496.1,0,0.0,6.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,0.0,7463.4,0.0,219.9,217.9,198.4,63.8,0,0.0,16.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,0.0,0.0,0.0,0.0,0.0,0.0,379.9,0,0.0,4.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,0.0,0.0,0.0,0.0,0.0,0.0,158.3,0,0.0,5.0


In [None]:
#train = train.fillna(train.median())
#test = test.fillna(test.median())
# train['pickup_dt'] = (train['pickup_dt'] -train['pickup_dt'].mean())/train['pickup_dt'].std()
# test['pickup_dt'] = (test['pickup_dt'] - test['pickup_dt'].mean())/test['pickup_dt'].std()

# train['pickup_longitude'] = (train['pickup_longitude'] -train['pickup_longitude'].mean())/train['pickup_longitude'].std()
# test['pickup_longitude'] = (test['pickup_longitude'] - test['pickup_longitude'].mean())/test['pickup_longitude'].std()
# train['pickup_latitude'] = (train['pickup_latitude'] -train['pickup_latitude'].mean())/train['pickup_latitude'].std()
# test['pickup_latitude'] = (test['pickup_latitude'] - test['pickup_latitude'].mean())/test['pickup_latitude'].std()
# train['dropoff_longitude'] = (train['dropoff_longitude'] -train['dropoff_longitude'].mean())/train['dropoff_longitude'].std()
# test['dropoff_longitude'] = (test['dropoff_longitude'] - test['dropoff_longitude'].mean())/test['dropoff_longitude'].std()
# train['dropoff_latitude'] = (train['dropoff_latitude'] -train['dropoff_latitude'].mean())/train['dropoff_latitude'].std()
# test['dropoff_latitude'] = (test['dropoff_latitude'] - test['dropoff_latitude'].mean())/test['dropoff_latitude'].std()

In [28]:
Xtr, Xv, ytr, yv = train_test_split(train[feature_names].values, y, test_size=0.02, random_state=1987)
#Xtr, Xv, ytr, yv = train_test_split(train[feature_names].loc[y_is_within_cut,:].values, y, test_size=0.02, random_state=1987)

In [None]:
from keras.models import Model,Sequential
from keras.layers import LSTM, Dropout, Dense, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

def fit(X,y,dims):
    hidden_neurons = 128
    bs = 32

    model = Sequential()  
    #model.add(Dense(hidden_neurons, batch_input_shape=(None, length_of_sequences, in_out_neurons), return_sequences=False))
    #model.add(LSTM(hidden_neurons, input_dim=length_of_sequences, return_sequences=True))
    model.add(Dense(64, activation='relu',input_dim=dims))
    model.add(BatchNormalization())
    model.add(Activation("relu")) 
    model.add(Dropout(0.6))
    model.add(Dense(32))
    model.add(BatchNormalization())
    model.add(Activation("relu")) 
    model.add(Dropout(0.6))
    model.add(Dense(1))
    model.add(Activation("linear"))  
    model.compile(loss="mean_squared_error", optimizer="nadam")

    early_stopping = EarlyStopping(monitor='loss', patience=1)
    #model.load_weights("nn_model.h5") 
    model.fit(X, y, batch_size=bs, epochs=5, validation_split=0.1, callbacks=[early_stopping])  
    
    #model.fit(X_train, y_train, batch_size=bs, nb_epoch=15, validation_data=(X_test, y_test), callbacks=[remote])     
    model_checkpoint = ModelCheckpoint("nn_model.h5", save_best_only=True, save_weights_only=True)
    #predicted = model.predict(X) 
    #dataf =  pd.DataFrame(predicted[:1200])
    #dataf.columns = ["predict"]
    #dataf["input"] = y_test[:1200]
    #dataf.plot(figsize=(15, 5))

    #score = model.evaluate(X_test.as_matrix(), y_test, batch_size=16)
    #score = model.evaluate(X_test, y_test, batch_size=32)
    return model

TypeError: unorderable types: float() >= str()

In [30]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor

rfr = ExtraTreesRegressor(n_estimators=500, max_depth=6, n_jobs=14, random_state=123, verbose=0)
rfr.fit(Xtr,ytr)
y_pred = reg.predict(Xv)
rmse = np.sqrt(mean_squared_error(yv, y_pred))
print("RMSE: %.4f" % rmse)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
model = fit(Xtr,ytr,len(feature_names))
model.evaluate(Xv,yv,batch_size=1024)

In [39]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

lgb_train = lgb.Dataset(Xtr, ytr)
lgb_eval = lgb.Dataset(Xv, yv, reference=lgb_train)

params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'regression',
        'metric' : {'l2'},
        'num_leaves' : 100,
        'max_depth' : 5,
        'learning_rate' : 0.2,
        'feature_fraction' : 0.8,
        'bagging_fraction' : 0.8,
        #'bagging_freq': 5,
        'min_data_in_leaf' : 500,
        #'max_bin': 200,
        'verbose' : 0
}

gbm2 = lgb.train(params,
            lgb_train,
            num_boost_round=5000,
            valid_sets=lgb_eval,
            early_stopping_rounds=30)
y_pred = gbm2.predict(Xv, num_iteration=gbm2.best_iteration)
rmse = np.sqrt(mean_squared_error(yv, y_pred))
print("RMSE: %.4f" % rmse)

[1]	valid_0's l2: 0.485286
Train until valid scores didn't improve in 30 rounds.
[2]	valid_0's l2: 0.385495
[3]	valid_0's l2: 0.318342
[4]	valid_0's l2: 0.273454
[5]	valid_0's l2: 0.243223
[6]	valid_0's l2: 0.222842
[7]	valid_0's l2: 0.208355
[8]	valid_0's l2: 0.198122
[9]	valid_0's l2: 0.190815
[10]	valid_0's l2: 0.185654
[11]	valid_0's l2: 0.181907
[12]	valid_0's l2: 0.178946
[13]	valid_0's l2: 0.177032
[14]	valid_0's l2: 0.175535
[15]	valid_0's l2: 0.174163
[16]	valid_0's l2: 0.173138
[17]	valid_0's l2: 0.171463
[18]	valid_0's l2: 0.170797
[19]	valid_0's l2: 0.169819
[20]	valid_0's l2: 0.169412
[21]	valid_0's l2: 0.168139
[22]	valid_0's l2: 0.167572
[23]	valid_0's l2: 0.167086
[24]	valid_0's l2: 0.166722
[25]	valid_0's l2: 0.166032
[26]	valid_0's l2: 0.165554
[27]	valid_0's l2: 0.165198
[28]	valid_0's l2: 0.164875
[29]	valid_0's l2: 0.164501
[30]	valid_0's l2: 0.164058
[31]	valid_0's l2: 0.163738
[32]	valid_0's l2: 0.163421
[33]	valid_0's l2: 0.16327
[34]	valid_0's l2: 0.163062
[35]

[287]	valid_0's l2: 0.149699
[288]	valid_0's l2: 0.149687
[289]	valid_0's l2: 0.149687
[290]	valid_0's l2: 0.149677
[291]	valid_0's l2: 0.149658
[292]	valid_0's l2: 0.149636
[293]	valid_0's l2: 0.149622
[294]	valid_0's l2: 0.149617
[295]	valid_0's l2: 0.149602
[296]	valid_0's l2: 0.149602
[297]	valid_0's l2: 0.149602
[298]	valid_0's l2: 0.1496
[299]	valid_0's l2: 0.14959
[300]	valid_0's l2: 0.149572
[301]	valid_0's l2: 0.149557
[302]	valid_0's l2: 0.149533
[303]	valid_0's l2: 0.149519
[304]	valid_0's l2: 0.149509
[305]	valid_0's l2: 0.1495
[306]	valid_0's l2: 0.149506
[307]	valid_0's l2: 0.149503
[308]	valid_0's l2: 0.149495
[309]	valid_0's l2: 0.149455
[310]	valid_0's l2: 0.149434
[311]	valid_0's l2: 0.149417
[312]	valid_0's l2: 0.14941
[313]	valid_0's l2: 0.149349
[314]	valid_0's l2: 0.149322
[315]	valid_0's l2: 0.14928
[316]	valid_0's l2: 0.149286
[317]	valid_0's l2: 0.149272
[318]	valid_0's l2: 0.149262
[319]	valid_0's l2: 0.149225
[320]	valid_0's l2: 0.149238
[321]	valid_0's l2: 0

[571]	valid_0's l2: 0.146692
[572]	valid_0's l2: 0.146698
[573]	valid_0's l2: 0.146693
[574]	valid_0's l2: 0.146686
[575]	valid_0's l2: 0.146649
[576]	valid_0's l2: 0.146656
[577]	valid_0's l2: 0.146685
[578]	valid_0's l2: 0.146692
[579]	valid_0's l2: 0.146688
[580]	valid_0's l2: 0.146667
[581]	valid_0's l2: 0.146661
[582]	valid_0's l2: 0.146674
[583]	valid_0's l2: 0.146648
[584]	valid_0's l2: 0.146643
[585]	valid_0's l2: 0.146664
[586]	valid_0's l2: 0.146688
[587]	valid_0's l2: 0.146655
[588]	valid_0's l2: 0.146657
[589]	valid_0's l2: 0.146652
[590]	valid_0's l2: 0.146621
[591]	valid_0's l2: 0.146602
[592]	valid_0's l2: 0.146596
[593]	valid_0's l2: 0.146588
[594]	valid_0's l2: 0.146608
[595]	valid_0's l2: 0.146621
[596]	valid_0's l2: 0.146623
[597]	valid_0's l2: 0.146606
[598]	valid_0's l2: 0.146589
[599]	valid_0's l2: 0.146629
[600]	valid_0's l2: 0.14662
[601]	valid_0's l2: 0.146604
[602]	valid_0's l2: 0.146607
[603]	valid_0's l2: 0.146605
[604]	valid_0's l2: 0.146605
[605]	valid_0's

[855]	valid_0's l2: 0.145136
[856]	valid_0's l2: 0.145135
[857]	valid_0's l2: 0.145134
[858]	valid_0's l2: 0.14513
[859]	valid_0's l2: 0.145122
[860]	valid_0's l2: 0.145132
[861]	valid_0's l2: 0.145116
[862]	valid_0's l2: 0.145128
[863]	valid_0's l2: 0.145145
[864]	valid_0's l2: 0.145144
[865]	valid_0's l2: 0.145163
[866]	valid_0's l2: 0.145153
[867]	valid_0's l2: 0.14516
[868]	valid_0's l2: 0.145149
[869]	valid_0's l2: 0.145154
[870]	valid_0's l2: 0.145123
[871]	valid_0's l2: 0.145126
[872]	valid_0's l2: 0.145121
[873]	valid_0's l2: 0.145115
[874]	valid_0's l2: 0.145115
[875]	valid_0's l2: 0.145114
[876]	valid_0's l2: 0.145115
[877]	valid_0's l2: 0.145099
[878]	valid_0's l2: 0.145101
[879]	valid_0's l2: 0.145071
[880]	valid_0's l2: 0.145087
[881]	valid_0's l2: 0.145085
[882]	valid_0's l2: 0.145071
[883]	valid_0's l2: 0.14506
[884]	valid_0's l2: 0.145064
[885]	valid_0's l2: 0.145099
[886]	valid_0's l2: 0.145058
[887]	valid_0's l2: 0.145056
[888]	valid_0's l2: 0.145054
[889]	valid_0's l

In [40]:
# yNN = model.predict(Xv)
# newX=pd.DataFrame(yNN)
# yRF = reg.predict(Xv)
# newX=pd.DataFrame(yRF)

yGB = gbm.predict(Xv, num_iteration=gbm.best_iteration)
XX = pd.DataFrame(yGB)
yGB2 = gbm2.predict(Xv, num_iteration=gbm2.best_iteration)
XX2 = pd.DataFrame(yGB2)

res = pd.concat([XX,XX2],axis=1)
res.columns=['gbm','gbm2']
res.head()

Unnamed: 0,gbm,gbm2
0,7.323821,7.334568
1,6.92291,6.937566
2,5.693614,5.634628
3,6.178876,6.057216
4,6.080499,6.059073


In [41]:
from sklearn.linear_model import LinearRegression, Lasso

# rr = Lasso(alpha=0.0001,precompute=True,max_iter=1000,
#              positive=True, random_state=9999, selection='random')
rr = LinearRegression()
rr.fit(res,yv)
print(rr.coef_)

[ 0.79795415  0.20064334]


In [45]:
#RFy = reg.predict(test[feature_names].values)
#GBy2 = gbm2.predict(test[feature_names].values)
# print('Test shape OK.') if test.shape[0] == ytest.shape[0] else print('Oops')
# #test['trip_duration'] = np.exp(ytest) - 1
# #test[['id', 'trip_duration']].to_csv('double-d_submission.csv.gz', index=False, compression='gzip')
#GBy = gbm.predict(test[feature_names].values)
#test['trip_duration_rf'] = rr.coef_[0]*(np.exp(RFy) - 1)
test['trip_duration_gb'] = rr.coef_[0]*(np.exp(GBy) - 1)
test['trip_duration_gb2'] = rr.coef_[1]*(np.exp(GBy2) - 1)
#test['trip_duration'] = np.max(test['trip_duration_rf']+test['trip_duration_gb'],0)
test['trip_duration'] = test['trip_duration_gb2']+test['trip_duration_gb']
#test['trip_duration'] = test['trip_duration_rf']+test['trip_duration_gb']
#test['trip_duration'] = (np.exp(GBy) - 1)

test[['id', 'trip_duration']].to_csv('double-d_gbmmeerg_submission.csv.gz', index=False, compression='gzip')


In [None]:
train['trip_duration'].quantile(.98)

In [None]:
low_y_cut = 0.5
high_y_cut = 8
y_is_above_cut = (train['log_trip_duration'] > high_y_cut)
y_is_below_cut = (train['log_trip_duration']  < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
#model1 = rfr.fit(np.array(train.loc[y_is_within_cut,:].values), y.loc[y_is_within_cut])

In [None]:
train['log_trip_duration'].quantile(.99)

In [None]:
test['trip_duration'].loc()

In [None]:
filt = (test['trip_duration'] < 6)

In [None]:
test['trip_duration'].min()

In [None]:
print(test.loc[filt,:])

In [None]:
test['trip_duration'][217686]

In [None]:
test[['id', 'trip_duration']].to_csv('double-d_avr2_submission.csv.gz', index=False, compression='gzip')

In [None]:
test.columns

In [32]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='median', axis=1)
Xtr = imp.fit_transform(Xtr) 

KeyboardInterrupt: 

In [None]:
print("hello")

In [37]:
np.where(np.isnan(np.array(Xv)))

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [44]:
GBy

array([ 6.57285511,  6.24816425,  5.93532981, ...,  7.43228655,
        7.62990355,  7.06491495])