In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# The packages this notebook relies on are loaded below.

from math import floor  # used later to snap coordinates down to fixed-width bins
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists the files found there.

from subprocess import check_output
print(str(check_output(["ls", "../input"]), "utf8"))

# Any results you write to the current directory are saved as output.

<h3>NYC Taxi Trip</h3>
<ol>
<li>Before moving further, we need to decide what the objective of this problem is.</li>
<li>What questions do we need to ask to reach that objective efficiently?</li>
<li>What does our dataset provide?</li>
</ol>

<ol>
<li>We have to predict the trip duration for a given pickup and drop-off location at a particular time of day.</li>
<li>To predict the trip duration we are provided with pickup_datetime, the pickup location, the drop-off location, vendor_id, passenger_count and the store-and-forward flag (if the network is not working at the time of the trip, the record is stored in a local database and transferred to the main server once the network is working again).</li>
<li>We can extract the binary variables isPeak and isHoliday from pickup_datetime for better analysis.</li>
<li>We can bin the pickup and drop-off locations into discrete categories for better analysis.</li>
</ol>

In [2]:
import pandas as pd;#for dataprocessing.
import matplotlib.pyplot as plt;
from IPython.display import display;
%matplotlib inline

In [3]:
def readData(filename, parse_dates=(2, 3), preview_rows=3):
    """Load a CSV file into a DataFrame and display a short preview of it.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file to read.
    parse_dates : sequence of int or str, optional
        Columns to parse as datetimes, given by position or by name.
        The default ``(2, 3)`` parses the third and fourth columns, which is
        what this function previously hard-coded. Pass a different selection
        for files whose datetime columns sit elsewhere.
    preview_rows : int, optional
        Number of leading rows to display after loading (default 3).
        Pass 0 to skip the preview.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.
    """
    data = pd.read_csv(filename, parse_dates=list(parse_dates))
    if preview_rows:
        display(data.head(preview_rows))
    return data

In [4]:
# Read the training set; readData also shows a short preview of it.
data = readData(filename='../input/train.csv')

In [5]:
# Count the missing entries in every column.
display(data.isna().sum())

<p>Every column is fully populated (no missing values).</p>

In [6]:
# Report the time span covered by the pickups and the number of rows loaded.
print("Avaliable dataset is from", data.pickup_datetime.min(), "to", data.pickup_datetime.max())
print("Total rows in dataset is", data.shape[0])

<p>Using box plots of trip_duration, pickup_longitude, pickup_latitude, dropoff_longitude and dropoff_latitude, we will check whether our dataset contains any outliers.</p>

In [7]:
# Box plot of the raw trip durations to look for outliers.
data.boxplot(column='trip_duration');

<ul>
<li>The range is huge: trip_duration does contain outliers, so let's remove them.</li>
<li>Any trip_duration greater than 20,000 (for now) can be considered an outlier.</li>
</ul>

In [8]:
# Keep only trips below the provisional 20,000 cut-off, then re-draw the box plot.
data = data.loc[data['trip_duration'] < 20000]
data.boxplot(column='trip_duration');

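<ul>
<li>The pandas box plot draws its whiskers at 1.5 IQR (interquartile range) beyond the quartiles and marks every point outside them as an outlier.</li>
<li>For this dataset, taking this as the standard, we can remove any value that lies outside the whiskers (below Q1 - 1.5 IQR or above Q3 + 1.5 IQR); the next cell approximates the upper whisker with a fixed cut-off of 2,000.</li>
</ul>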

In [9]:
# Tighten the cut-off to 2,000 and look at the resulting distribution.
data = data.loc[data['trip_duration'] < 2000]
data.boxplot(column='trip_duration');

In [10]:
# Box plots of both longitude columns, drawn side by side to expose outlying coordinates.
data.boxplot(column=['pickup_longitude', 'dropoff_longitude']);

<p>There is one more point which is surely a data-entry error; let's remove it.</p>

In [11]:
# Discard rows whose pickup longitude is at or below -80.
data = data.loc[data['pickup_longitude'].gt(-80)]

In [12]:
# Re-draw the longitude box plots after the pickup_longitude > -80 filter.
data.boxplot(column=['pickup_longitude', 'dropoff_longitude']);

In [13]:
# Boolean masks selecting longitudes strictly inside (-74, -73.92).
PICKUP_LOWER_LIMIT = data.pickup_longitude.gt(-74)
PICKUP_UPPER_LIMIT = data.pickup_longitude.lt(-73.92)
DROP_OFF_LOWER_LIMIT = data.dropoff_longitude.gt(-74)
DROP_OFF_UPPER_LIMIT = data.dropoff_longitude.lt(-73.92)

# A row survives only when both its pickup and its drop-off pass every bound.
data = data.loc[PICKUP_LOWER_LIMIT & PICKUP_UPPER_LIMIT & DROP_OFF_LOWER_LIMIT & DROP_OFF_UPPER_LIMIT]
data.boxplot(column=['pickup_longitude', 'dropoff_longitude']);

<p>We will perform the same exercise for pickup_latitude and dropoff_latitude.</p>

In [14]:
# Box plots of both latitude columns before any latitude filtering.
data.boxplot(column=['pickup_latitude', 'dropoff_latitude']);

In [15]:
# Boolean masks selecting latitudes strictly inside (40.7, 40.82).
PICKUP_LOWER_LIMIT = data.pickup_latitude.gt(40.7)
PICKUP_UPPER_LIMIT = data.pickup_latitude.lt(40.82)
DROP_OFF_LOWER_LIMIT = data.dropoff_latitude.gt(40.7)
DROP_OFF_UPPER_LIMIT = data.dropoff_latitude.lt(40.82)

# A row survives only when both its pickup and its drop-off pass every bound.
data = data.loc[PICKUP_LOWER_LIMIT & PICKUP_UPPER_LIMIT & DROP_OFF_LOWER_LIMIT & DROP_OFF_UPPER_LIMIT]
data.boxplot(column=['pickup_latitude', 'dropoff_latitude']);

In [16]:
# Number of rows left after all the filters above.
print("Total samples in our dataset are ", data.shape[0])

<p>Only about 0.95 million of the original 1.45 million rows remain (hats off to data cleaning).</p>

In [17]:
# Print the dtype of every column currently in the frame.
print(data.dtypes)

<p>We can classify each pickup time as peak or non-peak and, similarly, each day as a holiday or a non-holiday for further analysis.</p>

In [18]:
# Index the frame by pickup time while also keeping pickup_datetime as a regular column.
data = data.set_index('pickup_datetime', drop=False)

In [19]:
# Hour of day (0-23) at which each trip started.
data['pickup_hour'] = data['pickup_datetime'].dt.hour

In [20]:
# Preview three rows; the frame now carries the pickup_hour column added above.
display(data.head(3));

We consider 7:00-9:59 (morning) and 18:00-21:59 (evening) to be peak time.

In [21]:
# Flag pickups in hours 7-9 (morning) or 18-21 (evening) as 1, all others as 0.
data['isPeak'] = data['pickup_hour'].isin([*range(7, 10), *range(18, 22)]).astype(int)

In [22]:
# Preview two rows, including the isPeak flag added above.
display(data.head(2));

<p>To split the dataset into holidays and non-holidays we can use USFederalHolidayCalendar.</p>

In [23]:
# Extracting date from datetime.

In [24]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# The calendar returns holidays as midnight timestamps, so both the query
# window and the values matched against it must be truncated to midnight.
pickup_day = data['pickup_datetime'].dt.normalize()

us_hol_cal = calendar()
# Normalised bounds keep a holiday that falls on the first day of the data
# inside the window; its midnight stamp precedes the earliest pickup time and
# would otherwise be cut off.
holidays = us_hol_cal.holidays(start=pickup_day.min(), end=pickup_day.max())

# dayofweek counts Monday=0 ... Sunday=6, so the weekend is 5 (Saturday) and
# 6 (Sunday). The previous `> 5` test flagged Sundays only.
data['isWeekend'] = (data['pickup_datetime'].dt.dayofweek >= 5).astype(int)

# Compare calendar days rather than full timestamps; matching raw timestamps
# would only flag trips that start at exactly 00:00:00 on a holiday.
data['isUSHoliday'] = pickup_day.isin(holidays).astype(int)

In [25]:
# A day counts as a holiday when either 0/1 flag is set, i.e. the row-wise maximum of the two.
data['isHoliday'] = data[['isWeekend', 'isUSHoliday']].max(axis=1)

In [26]:
# Preview three rows, including the isWeekend, isUSHoliday and isHoliday flags added above.
display(data.head(3));

In [27]:
# Histograms of the four coordinate columns, used to choose bin widths for categorising them.
data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].hist(figsize=(15, 6));

<p>From the preceding plots, we can say that latitude varies in steps of roughly 0.02 and longitude in steps of roughly 0.01, so we use these values as bin widths.</p>

In [28]:
# Bin widths for the two coordinate axes.
latitude_step = 0.02
longitude_step = 0.01


def latitude_transform(lat):
    """Snap a latitude down to the lower edge of its latitude_step-wide bin."""
    return floor(lat / latitude_step) * latitude_step


def longitude_transform(lon):
    """Snap a longitude down to the lower edge of its longitude_step-wide bin."""
    return floor(lon / longitude_step) * longitude_step


# Binned copies of the raw coordinates; the raw columns are left untouched here.
data['pickup_lat'] = data['pickup_latitude'].map(latitude_transform)
data['dropoff_lat'] = data['dropoff_latitude'].map(latitude_transform)
data['pickup_lon'] = data['pickup_longitude'].map(longitude_transform)
data['dropoff_lon'] = data['dropoff_longitude'].map(longitude_transform)

In [29]:
# Preview two rows, including the binned pickup_lat/dropoff_lat/pickup_lon/dropoff_lon columns.
display(data.head(2));

In [30]:
# Print the dtype of every column now that the derived features have been added.
print(data.dtypes);

In [31]:
# Encode the flag as an integer: 1 where store_and_fwd_flag equals 'Y', 0 otherwise.
data['store_and_fwd'] = data['store_and_fwd_flag'].eq('Y').astype(int)

In [32]:
# Show the last row, including the store_and_fwd column added above.
display(data.tail(1));

In [33]:
# Remove the raw and intermediate columns that the derived features have replaced.
data = data.drop(columns=[
    'pickup_datetime', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'store_and_fwd_flag', 'pickup_hour',
    'isWeekend', 'isUSHoliday',
])

In [34]:
# Print the dtypes of the columns that remain after the drop above.
print(data.dtypes);

In [35]:
# Preview the first five rows of the cleaned frame.
display(data.head(5));

<p>We are successfully able to clean our dataset (Hip hip hurray :-) )</p>