In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import os
from tqdm import tqdm
from scipy.sparse import csr_matrix, hstack
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

# Show what is available under ../input before any file is read.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Add weather data from: ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ 
From this, use daily maximum and minimum temperatures (encoded as binary 'extreme' days), average wind speed, precipitation, and snow (all as numeric).

In [None]:
# Daily weather table: keep only the date plus the five measurements used later on.
weather_cols = ['DATE', 'AWND', 'PRCP', 'SNOW', 'TMAX', 'TMIN']
nyc_weather = pd.read_csv('../input/nyc-weather/nyc_weather.csv').loc[:, weather_cols].copy()
# Parse the month/day/year strings as UTC timestamps.
nyc_weather = nyc_weather.assign(
    DATE=pd.to_datetime(nyc_weather['DATE'], utc=True, format='%m/%d/%Y')
)
nyc_weather.head()


The plot below shows the distribution of daily high and low temperatures. This highlights the temperatures that will be considered extreme. 

In [None]:
plt.rc('figure', figsize=(15, 8))
# One histogram per temperature column, drawn side by side.
temperature_panels = [('TMAX', 'Max Daily Temperature'), ('TMIN', 'Min Daily Temperature')]
for panel_number, (temperature_col, panel_title) in enumerate(temperature_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.hist(nyc_weather[temperature_col], bins=30)
    plt.xlabel('Temperature (C)')
    plt.ylabel('Frequency Count')
    plt.title(panel_title)
plt.show()

In [None]:
# Summary statistics (count, mean, spread, min/max) for the numeric weather columns.
nyc_weather.describe()

In [None]:
# Bank-holiday calendar; 'Date' is parsed as a UTC timestamp from month/day/two-digit-year text.
# NOTE(review): the file name says 2012-2018 - confirm it covers every pickup year in the data.
holidays = pd.read_csv('../input/us-bank-holidays-20092018/US Bank Holidays 2012-2018.csv')
holidays = holidays.assign(
    Date=pd.to_datetime(holidays['Date'], utc=True, format='%m/%d/%y')
)
holidays.head(12)

The training set has ~55M rows so I take a sample of 15M.  After cleaning up some formatting and changing the data types to improve efficiency, it's time to begin feature engineering.  I break the pickup times down into categorical features, including year, month, and day/hour combinations (e.g. Friday 5pm, Saturday 7am, etc.).  I join the weather data and use wind speed, precipitation and snow as numeric features. I encode the extreme temperature days as binary features. I calculate the distance between pickup and dropoff using the Haversine formula. Lastly, I add the various bank holidays as categorical features. 

*Hiding this code for ease of reading the kernel. 

In [None]:
%%time
#import sample of train and full test
import random

n = sum(1 for line in open('../input/new-york-city-taxi-fare-prediction/train.csv')) - 1 #number of records in file (excludes header)
s = 15000000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list

train = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', skiprows=skip) 
test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv') 
test_id = test.key.values #set this value for final submission


In [None]:
# Preview the first rows of the sampled training data.
train.head()

In [None]:
%%time
#truncate datetime string for efficiency converting to datetime format
train['pickup_datetime'] = train['pickup_datetime'].str.slice(0, 16)
test['pickup_datetime'] = test['pickup_datetime'].str.slice(0, 16)

train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M') 
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'], utc=True,format='%Y-%m-%d %H:%M') 

train_sample = train.dropna()
del train

#get rid of unnecessary memory consuming column
train_sample.drop(labels='key', axis=1, inplace=True)
test.drop(labels='key', axis=1, inplace=True)

#convert data to less memory intensive types
train_sample.loc[:,'passenger_count'] = train_sample.passenger_count.astype(dtype = 'uint8')
train_sample['pickup_longitude'] = train_sample.pickup_longitude.astype(dtype = 'float32')
train_sample['pickup_latitude'] = train_sample.pickup_latitude.astype(dtype = 'float32')
train_sample['dropoff_longitude'] = train_sample.dropoff_longitude.astype(dtype = 'float32')
train_sample['dropoff_latitude'] = train_sample.dropoff_latitude.astype(dtype = 'float32')
train_sample['fare_amount'] = train_sample.fare_amount.astype(dtype = 'float32')

test['pickup_longitude'] = test.pickup_longitude.astype(dtype = 'float32')
test['pickup_latitude'] = test.pickup_latitude.astype(dtype = 'float32')
test['dropoff_longitude'] = test.dropoff_longitude.astype(dtype = 'float32')
test['dropoff_latitude'] = test.dropoff_latitude.astype(dtype = 'float32')

#filter training set to be within full range of test set
train_sample = train_sample.loc[train_sample.pickup_longitude.between(test.pickup_longitude.min(), test.pickup_longitude.max())]
train_sample = train_sample.loc[train_sample.pickup_latitude.between(test.pickup_latitude.min(), test.pickup_latitude.max())]
train_sample = train_sample.loc[train_sample.dropoff_longitude.between(test.dropoff_longitude.min(), test.dropoff_longitude.max())]
train_sample = train_sample.loc[train_sample.dropoff_latitude.between(test.dropoff_latitude.min(), test.dropoff_latitude.max())]

# Convert the pickup timestamp into calendar features to be used as categoricals.
# The vectorised .dt accessors avoid a Python-level call per row, and the narrow
# unsigned dtypes keep memory down.
for frame in (train_sample, test):
    pickup_time = frame['pickup_datetime'].dt
    frame['hour'] = pickup_time.hour.astype('uint8')
    frame['month'] = pickup_time.month.astype('uint8')
    frame['day_of_week'] = pickup_time.dayofweek.astype('uint8')  # Monday == 0
    frame['year'] = pickup_time.year.astype('uint16')


# Join weather data on the calendar day of the pickup.
# NOTE(review): assumes nyc_weather holds one row per DATE; duplicate dates would
# multiply trip rows in the left join - confirm against the CSV.
train_sample['pickup_day'] = train_sample.pickup_datetime.dt.floor('d')
train_sample = train_sample.merge(nyc_weather, how='left', left_on='pickup_day', right_on='DATE')
train_sample = train_sample.drop(columns=['pickup_day', 'DATE'])

test['pickup_day'] = test.pickup_datetime.dt.floor('d')
test = test.merge(nyc_weather, how='left', left_on='pickup_day', right_on='DATE')
test = test.drop(columns=['pickup_day', 'DATE'])

# Downcast each weather measurement from its own column. Looping over the names keeps
# SNOW and TMIN from being overwritten by a different column through a copy-paste slip.
weather_value_cols = ['AWND', 'PRCP', 'SNOW', 'TMAX', 'TMIN']
for col in weather_value_cols:
    train_sample[col] = train_sample[col].astype('float16')
    test[col] = test[col].astype('float16')

# Create binary weather features (uint8: 1 = condition met, 0 = not met or reading missing).
HOT_DAY_TMAX = 30   # daily maximum at or above this counts as a hot day
COLD_DAY_TMIN = 0   # daily minimum at or below this counts as a cold day
# A ">= 0" cut-off is true for every wind reading, so a windy day is instead defined
# relative to the daily distribution in the weather table.
# NOTE(review): the 90th percentile is a reviewer's choice - tune or replace with a fixed speed.
WINDY_DAY_AWND = nyc_weather.AWND.quantile(0.9)

for frame in (train_sample, test):
    # extreme temps
    frame['hot_day'] = (frame.TMAX >= HOT_DAY_TMAX).astype('uint8')
    frame['cold_day'] = (frame.TMIN <= COLD_DAY_TMIN).astype('uint8')
    # rain and snow: any strictly positive amount marks the day
    frame['rainy_day'] = (frame.PRCP > 0).astype('uint8')
    frame['snowy_day'] = (frame.SNOW > 0).astype('uint8')
    # windy days
    frame['windy_day'] = (frame.AWND >= WINDY_DAY_AWND).astype('uint8')


#calculate distance between pickup and dropoff
def degree_to_radion(degree):
    """Convert an angle, or an array/Series of angles, from degrees to radians."""
    radians_per_degree = np.pi / 180
    return degree * radians_per_degree

def calculate_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    """Haversine (great-circle) distance between pickup and dropoff points.

    All four arguments are angles in degrees and may be scalars, NumPy arrays or
    pandas Series of matching shape. The result has the same shape and is expressed
    in the unit of `radius` below (kilometres).
    """
    # Convert to radians exactly once; every trigonometric call below expects radians.
    from_lat = np.radians(pickup_latitude)
    from_long = np.radians(pickup_longitude)
    to_lat = np.radians(dropoff_latitude)
    to_long = np.radians(dropoff_longitude)

    radius = 6371.01  # mean Earth radius in km

    lat_diff = to_lat - from_lat
    long_diff = to_long - from_long

    # The cosines take the latitudes already in radians. Converting them a second time
    # pushes both cosines towards 1 and overstates east-west distances away from the equator.
    a = np.sin(lat_diff / 2)**2 + np.cos(from_lat) * np.cos(to_lat) * np.sin(long_diff / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

# Trip distance for every row, stored as float32 to save memory.
train_sample['distance'] = calculate_distance(
    train_sample.pickup_latitude,
    train_sample.pickup_longitude,
    train_sample.dropoff_latitude,
    train_sample.dropoff_longitude,
)
test['distance'] = calculate_distance(
    test.pickup_latitude,
    test.pickup_longitude,
    test.dropoff_latitude,
    test.dropoff_longitude,
)

for frame in (train_sample, test):
    frame['distance'] = frame['distance'].astype('float32')


# Combine day and hour to make every hour of the week a binary feature.
# Both frames must build the key the same way ("<day_of_week>_<hour>"); if the test keys
# lacked the separator they would never match the categories seen in training.
for frame in (train_sample, test):
    day_hour_key = frame.day_of_week.astype(str) + "_" + frame.hour.astype(str)
    frame['day_hour'] = day_hour_key.astype('category')

# Keep only strictly positive fares (drops zero as well as negative amounts).
train_sample = train_sample[train_sample.fare_amount > 0]

# Holidays: attach the holiday name for each pickup day ('None' when the day is not a holiday).
# NOTE(review): assumes `holidays` has at most one row per Date; duplicates would
# multiply trip rows in the left join - confirm against the CSV.
train_sample['pickup_day'] = train_sample.pickup_datetime.dt.floor('d')
train_sample = train_sample.merge(holidays, left_on='pickup_day', right_on='Date', how='left')
train_sample['Holiday'] = train_sample.Holiday.fillna('None')

test['pickup_day'] = test.pickup_datetime.dt.floor('d')
test = test.merge(holidays, left_on='pickup_day', right_on='Date', how='left')
test['Holiday'] = test.Holiday.fillna('None')

# Fit a single encoder on the names present in either frame and reuse it for both.
# Re-fitting on the test frame would renumber the labels, so the same holiday could
# receive different codes in train and test.
le = LabelEncoder()
le.fit(np.union1d(train_sample.Holiday.unique(), test.Holiday.unique()))

train_sample['holiday'] = le.transform(train_sample.Holiday.values).astype('uint8')
test['holiday'] = le.transform(test.Holiday.values).astype('uint8')

train_sample = train_sample.drop(['Holiday', 'Date', 'pickup_day'], axis=1)
test = test.drop(['Holiday', 'Date', 'pickup_day'], axis=1)



In [None]:
# Preview the training sample after feature engineering.
train_sample.head()

Here, I identify 100 'neighborhoods' throughout the city using K-means clustering. I pool the pickup and dropoff coordinate pairs from both the training sample and the test set, round them to 4 decimal places, and take the unique values. (Note: an earlier kernel fit a 200-cluster model on the full test set and a sample of 25M rows from the training set; the code to load that model is left commented out below.) Rounding and taking unique values makes the neighborhoods more evenly distributed, instead of being concentrated in the most frequent pickup/dropoff points. Without doing this, the cluster centers are much more densely located in Manhattan and even more sparse in the outer boroughs. Rounding to 4 decimal places performed best on the test set. 

With the trained model, I label each pickup/dropoff 'neighborhood' on the training and test set, which I will encode as categorical features.

In [None]:
# Create set of unique locations rounded to 4 decimal places. This prevents the model
# from biasing towards more frequently used pickup/dropoff spots.
LOCATION_SAMPLE_SIZE = 10000000
LOCATION_SAMPLE_SEED = 17  # fixed so the same coordinates are drawn on every run

full_pickups = pd.concat([train_sample[['pickup_longitude','pickup_latitude']],test[['pickup_longitude','pickup_latitude']]], axis = 0)
full_pickups.columns = ['x','y']
full_dropoffs = pd.concat([train_sample[['dropoff_longitude','dropoff_latitude']],test[['dropoff_longitude','dropoff_latitude']]], axis = 0)
full_dropoffs.columns = ['x','y']
full_locs = pd.concat([full_pickups,full_dropoffs], axis = 0)

# Cap the request at the number of available rows: DataFrame.sample raises when asked
# for more rows than exist (sampling is without replacement).
full_locs = full_locs.sample(n=min(LOCATION_SAMPLE_SIZE, len(full_locs)),
                             random_state=LOCATION_SAMPLE_SEED)
full_locs['x'] = full_locs.x.round(4)
full_locs['y'] = full_locs.y.round(4)

# Grouping on both columns leaves one row per distinct (x, y) pair.
full_locs = full_locs.groupby(['x','y']).count().reset_index()
full_locs.info()


The plots below show the resulting clusters of locations and their centers. These are the rounded/unique values, not the training set. 

In [None]:
%%time

X_df = full_locs.copy()
X_kmeans = full_locs.values
del full_locs, full_pickups, full_dropoffs 

num_clusters = 100

#fit the model (done in a previous kernel on a larger set, since the number of rows gets reduced with rounding and taking unique values)
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(X_kmeans)

#load model from previous kernel trained on 25M observations
#with open('../input/taxi-weather-holidays-kmeans-neighborhoods/kmeans_200_round4.pkl', 'rb') as fid:
#    kmeans = pickle.load(fid)

#create labels for graph below
z = kmeans.predict(X_kmeans)

centers = kmeans.cluster_centers_

x_centers = [pair[0] for pair in centers]
y_centers = [pair[1] for pair in centers]
z_centers = np.arange(num_clusters)

#locations plotted with clusters as different shades
plt.subplot(1,2,1)
plt.scatter(X_df['x'], X_df['y'], c=z)
plt.gray()
plt.xlabel('Pickup/Dropoff Longitude')
plt.ylabel('Pickup/Dropoff Latitude')
plt.title('Clusters of NYC locations')
plt.subplot(1,2,2)
#plot of cluster center locations
plt.scatter(x_centers, y_centers, c=z_centers)
plt.gray()
plt.xlabel('Pickup/Dropoff Longitude')
plt.ylabel('Pickup/Dropoff Latitude')
plt.title('Cluster Centers of NYC locations')

plt.show()

In [None]:
# Add cluster labels as new features: one neighborhood id per pickup and per dropoff.
del X_kmeans, X_df
#train_sample = train_sample.sample(10000000)

for frame in (train_sample, test):
    for endpoint in ('pickup', 'dropoff'):
        # Same (longitude, latitude) column order as the matrix the model was fitted on.
        endpoint_coords = np.column_stack([
            frame[endpoint + '_longitude'].values,
            frame[endpoint + '_latitude'].values,
        ])
        frame[endpoint + '_neighborhood'] = kmeans.predict(endpoint_coords)

for frame in (train_sample, test):
    for label_col in ('pickup_neighborhood', 'dropoff_neighborhood'):
        frame[label_col] = frame[label_col].astype('uint8')


In [None]:
# Preview the training sample with the neighborhood labels attached.
train_sample.head()

In [None]:
# Persist the fitted k-means model to the working directory for future use.
with open('kmeans_100_round4_v2.pkl', mode='wb') as model_file:
    pickle.dump(kmeans, model_file)


Before I train the final model, I need to convert the training set into the proper format. Since there are a large number of binary features resulting from the neighborhoods and day/hour combinations, I use sparse matrices.  I impute any missing values with the mean, then split the set 90/10 into training and hold-out portions, in this case just to see how the hold-out score compares to the submission score. 

In [None]:
# Create the final arrays for the model.
# Features tried earlier but not used here: 'hour', 'day_of_week', 'jfk_pickup', 'jfk_dropoff',
# 'lga_pickup', 'lga_dropoff', 'ewr_pickup', 'ewr_dropoff', 'pickup_lat_round', 'pickup_long_round'.
categorical_cols = ['day_hour','month','year','pickup_neighborhood','dropoff_neighborhood','passenger_count','holiday','hot_day','cold_day','rainy_day','snowy_day','windy_day']
# Numeric features tried earlier but not used here: 'delta_lat', 'delta_long', 'pickup_latitude',
# 'pickup_longitude', 'AWND', 'PRCP', 'SNOW', 'TMAX', 'TMIN'.
numerical_cols = ['distance']

SPLIT_SEED = 17  # fixed so the hold-out split (and its score) is reproducible
n_train = train_sample.shape[0]

# One-hot encode the categorical features. Train and test rows are stacked (train first)
# so both share one set of categories; the result is a sparse matrix.
X_cats_full = np.concatenate([train_sample[categorical_cols].values,
                              test[categorical_cols].values], axis=0)
ohe = OneHotEncoder(categories = 'auto')
X_onehot = ohe.fit_transform(X_cats_full)
del X_cats_full

# Stack the numerical columns the same way and convert to sparse so they can be
# combined with the one-hot block.
X_nums_full = np.concatenate([train_sample[numerical_cols].values,
                              test[numerical_cols].values], axis=0)
X_nums_sparse = csr_matrix(X_nums_full)
del X_nums_full

# Combine sparse matrices.
X_full = hstack([X_onehot, X_nums_sparse]).tocsr()

# Impute any missing data (SimpleImputer's default strategy is the column mean).
# tocsr() is a no-op on a CSR result and otherwise restores the row-oriented layout
# that suits the row slices below.
si = SimpleImputer()
X_full_imputed = si.fit_transform(X_full).tocsr()

# The first n_train rows are the training sample; the remainder is the public test set.
X = X_full_imputed[:n_train,:]
X_public = X_full_imputed[n_train:,:]

y = train_sample.fare_amount.values
del X_onehot, X_nums_sparse

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, random_state = SPLIT_SEED)
#del X,y

Finally, train the Light GBM. 

In [None]:
%%time

params = {'objective': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 50,
          'max_depth': 8,
          'learning_rate': 0.5,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.8,
          'min_split_gain': 0.02,
          'min_child_samples': 10, 
          'min_child_weight': 0.02, 
          'lambda_l2': 0.0475,
          'verbosity': -1,
          'data_random_seed': 17,
          'early_stop': 100,
          'verbose_eval': 100,
          'num_rounds': 100} 

d_train = lgb.Dataset(X_train, label=y_train)
d_test = lgb.Dataset(X_test, label=y_test)
watchlist = [d_train, d_test]
num_rounds = 100
verbose_eval = 100
early_stop = 100
model_lgb = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    
pred_test_y_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

print("LGB Loss = " + str(sqrt(mean_squared_error(y_test,pred_test_y_lgb))))


Format the final predictions on the public set for submission, then plot a distribution of the fare amount predictions. 

In [None]:
lgb_public = model_lgb.predict(X_public, num_iteration=model_lgb.best_iteration)

final_pred_public = lgb_public.flatten()

# Clean and format the final submission: any prediction that is not strictly positive
# becomes 0. np.where replaces the per-element np.asscalar() call, which is
# deprecated/removed in current NumPy releases.
test_predictions_lgb = np.where(final_pred_public > 0, final_pred_public, 0.0).astype(float).tolist()
sample = pd.DataFrame({'key': test_id, 'fare_amount': test_predictions_lgb},
                      columns=['key', 'fare_amount'])
sample.to_csv('submission_lgb.csv', index=False)
sample.head()

In [None]:
# Distribution of the submitted fare predictions.
plt.rc('figure', figsize=(10, 10))
plt.hist(test_predictions_lgb, bins = 100)
plt.xlabel('Predicted Price')  # axis label spelling corrected
plt.ylabel('Frequency')
plt.title('Predictions from LGB')
plt.show()