# **NYC Taxi Trip duration**
This notebook is my submission for the New York City Taxi Trip Duration Challenge on Kaggle.com.
The goal is to predict the duration of taxi rides in New York City.

In [177]:
# Importing Python libraries for data analysis, processing, modelling and visualization.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['nyc-taxi-trip-duration', 'weather-data-in-new-york-city-2016']


# Exploratory analysis

In [178]:
# Loading data and checking it, using the id column provided in the dataset.
df_train = pd.read_csv('../input/nyc-taxi-trip-duration/train.csv',index_col=0)
df_test = pd.read_csv('../input/nyc-taxi-trip-duration/test.csv', index_col=0)

In [179]:
df_sample = pd.read_csv('../input/nyc-taxi-trip-duration/sample_submission.csv')
df_sample.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [180]:
# Checking whether ID is unique or not.
len(df_train.index) == df_train.index.nunique()

True

In [181]:
# Checking for null values
df_train.isnull().values.any()

False

In [182]:
# Quick statistical analysis.
df_train.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [183]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458644 entries, id2875421 to id1209952
Data columns (total 10 columns):
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(3)
memory usage: 122.4+ MB


In [184]:
# Changing data type to handle dates in a easier way
#df_train['pickup_datetime'] = pd.to_datetime(df_train['pickup_datetime'])
#df_train['dropoff_datetime'] = pd.to_datetime(df_train['dropoff_datetime'])
#df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'])

In [185]:
# Splitting the dates in several columns to find out if any particulars years/months/days/hours/minutes have a different behavior 
def extract_date_features_train(df, col):
    X = pd.DataFrame()
    """ Extract features from a date. """
    X[col + '_year'] = df_train[col].dt.year
    X[col + '_month'] = df_train[col].dt.month
    X[col + '_day'] = df_train[col].dt.day
    X[col + '_week_of_year'] = df_train[col].dt.week
    X[col + '_day_of_week'] = df_train[col].dt.dayofweek
    X[col + '_day_of_year'] = df_train[col].dt.dayofyear
    X[col + '_hour'] = df_train[col].dt.hour
    X[col + '_minute'] = df_train[col].dt.minute
    X[col + '_days_in_month'] = df_train[col].dt.days_in_month
    X[col + '_is_month_start'] = df_train[col].dt.is_month_start
    X[col + '_is_month_end'] = df_train[col].dt.is_month_end
    return X

In [186]:
def extract_date_features_test(df, col):
    X = pd.DataFrame()
    """ Extract features from a date. """
    X[col + '_year'] = df_test[col].dt.year
    X[col + '_month'] = df_test[col].dt.month
    X[col + '_day'] = df_test[col].dt.day
    X[col + '_week_of_year'] = df_test[col].dt.week
    X[col + '_day_of_week'] = df_test[col].dt.dayofweek
    X[col + '_day_of_year'] = df_test[col].dt.dayofyear
    X[col + '_hour'] = df_test[col].dt.hour
    X[col + '_minute'] = df_test[col].dt.minute
    X[col + '_days_in_month'] = df_test[col].dt.days_in_month
    X[col + '_is_month_start'] = df_test[col].dt.is_month_start
    X[col + '_is_month_end'] = df_test[col].dt.is_month_end
    return X

In [187]:
#pickup_features_train = extract_date_features_train(df_train, 'pickup_datetime')
#pickup_features_test = extract_date_features_test(df_test, 'pickup_datetime')

In [188]:
#dropoff_features_train = extract_date_features_train(df_train, 'dropoff_datetime')

In [189]:
# Adding the dates components to the dataframe
#df_train = pd.concat([df_train, pickup_features_train, dropoff_features_train], axis=1)
#df_test = pd.concat([df_test, pickup_features_test], axis=1)
#df_train.head()

In [190]:
# Creating a fonction to calculate distance between our lat/long pickup and dropoff coordinates.
#from math import sin, cos, sqrt, atan2, radians

#def calculate_distance(longitude1, latitude1, longitude2, latitude2):
    # Radius of Earth in km
 #   R = 6373.0

  #  lat1 = radians(latitude1)
 #    lon1 = radians(longitude1)
 #    lat2 = radians(latitude2)
 #    lon2 = radians(longitude2)

 #    dlon = lon2 - lon1
 #    dlat = lat2 - lat1

 #    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 #    c = 2 * atan2(sqrt(a), sqrt(1 - a))
  #   # Return distance in km
 #    return R * c

# def add_distance(df):
#    return df.apply(lambda row: calculate_distance(
#         row.pickup_longitude,
 #        row.pickup_latitude,
  #       row.dropoff_longitude,
 #        row.dropoff_latitude
#     ), axis=1)

# df_train['distance_km'] = add_distance(df_train)
# df_train.head()

In [191]:
# Adding a weather dataset of NY to check if it affects traffic.
weather = pd.read_csv('../input/weather-data-in-new-york-city-2016/weather_data_nyc_centralpark_2016(1).csv')
weather.head()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,1-1-2016,42,34,38.0,0.0,0.0,0
1,2-1-2016,40,32,36.0,0.0,0.0,0
2,3-1-2016,45,35,40.0,0.0,0.0,0
3,4-1-2016,36,14,25.0,0.0,0.0,0
4,5-1-2016,29,11,20.0,0.0,0.0,0


In [192]:
weather.date = pd.to_datetime(weather.date, dayfirst=True)
weather['day_of_year']= weather.date.dt.dayofyear
weather.head()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth,day_of_year
0,2016-01-01,42,34,38.0,0.0,0.0,0,1
1,2016-01-02,40,32,36.0,0.0,0.0,0,2
2,2016-01-03,45,35,40.0,0.0,0.0,0,3
3,2016-01-04,36,14,25.0,0.0,0.0,0,4
4,2016-01-05,29,11,20.0,0.0,0.0,0,5


In [193]:
#df_train=df_train.merge(weather,left_on='pickup_datetime_day_of_year',right_on='day_of_year')
df_train.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [194]:
#df_test=df_test.merge(weather,left_on='pickup_datetime_day_of_year',right_on='day_of_year')
df_test.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [201]:
df_train = df_train.drop(columns=['pickup_datetime','dropoff_datetime','store_and_fwd_flag'])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458644 entries, id2875421 to id1209952
Data columns (total 7 columns):
vendor_id            1458644 non-null int64
passenger_count      1458644 non-null int64
pickup_longitude     1458644 non-null float64
pickup_latitude      1458644 non-null float64
dropoff_longitude    1458644 non-null float64
dropoff_latitude     1458644 non-null float64
trip_duration        1458644 non-null int64
dtypes: float64(4), int64(3)
memory usage: 89.0+ MB


In [202]:
df_test = df_test.drop(columns=['pickup_datetime','store_and_fwd_flag'])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 625134 entries, id3004672 to id0621643
Data columns (total 6 columns):
vendor_id            625134 non-null int64
passenger_count      625134 non-null int64
pickup_longitude     625134 non-null float64
pickup_latitude      625134 non-null float64
dropoff_longitude    625134 non-null float64
dropoff_latitude     625134 non-null float64
dtypes: float64(4), int64(2)
memory usage: 33.4+ MB


In [203]:
X = df_train
X.head()

Unnamed: 0_level_0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id2875421,2,1,-73.982155,40.767937,-73.96463,40.765602,455
id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,663
id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,2124
id3504673,2,1,-74.01004,40.719971,-74.012268,40.706718,429
id2181028,2,1,-73.973053,40.793209,-73.972923,40.78252,435


In [207]:
y = df_train
y.head()

Unnamed: 0_level_0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id2875421,2,1,-73.982155,40.767937,-73.96463,40.765602,455
id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,663
id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,2124
id3504673,2,1,-74.01004,40.719971,-74.012268,40.706718,429
id2181028,2,1,-73.973053,40.793209,-73.972923,40.78252,435


In [208]:
X.shape,y.shape

((1458644, 7), (1458644, 7))

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X, y)
y_test_pred = rf.predict(X)
rf.score(X, y)



In [167]:
from sklearn.metrics import r2_score
r2_score(y, y_test_pred)

NameError: name 'y_test_pred' is not defined

In [None]:
y.to_csv('to_submit.csv')

In [None]:
#!ls