# Import modules and load data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Unzip archive
!unzip -o archive.zip

Archive:  archive.zip
  inflating: uber.csv                


In [3]:
# Create DataFrame
uber = pd.read_csv('uber.csv')
uber

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


# Data Preprocessing

In [4]:
# Drop unnecessary columns
uber.drop(columns=['Unnamed: 0', 'key'], axis=0, inplace=True)
uber.sort_values(by='pickup_datetime', axis=0, inplace=True)
uber.reset_index(inplace=True, drop=True)
uber

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,8.5,2009-01-01 01:15:22 UTC,-73.981918,40.779456,-73.957685,40.771043,2
1,13.0,2009-01-01 01:59:17 UTC,-73.983759,40.721389,-73.994833,40.687179,2
2,10.6,2009-01-01 02:05:03 UTC,-73.956635,40.771254,-73.991528,40.749778,2
3,12.2,2009-01-01 02:09:13 UTC,-73.984605,40.728020,-73.955746,40.776830,1
4,11.0,2009-01-01 02:13:41 UTC,-73.980127,40.737425,-74.009544,40.726025,4
...,...,...,...,...,...,...,...
199995,18.5,2015-06-30 22:57:53 UTC,-73.971703,40.782207,-73.943680,40.827991,2
199996,25.5,2015-06-30 23:16:42 UTC,-74.001099,40.730961,-73.957123,40.806908,2
199997,20.0,2015-06-30 23:31:06 UTC,-73.999962,40.733135,-73.962448,40.773041,4
199998,8.5,2015-06-30 23:33:33 UTC,-73.980988,40.762020,-73.960083,40.770531,1


In [12]:
uber['pickup_datetime'] = pd.to_datetime(uber['pickup_datetime'])
uber

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,8.5,2009-01-01 01:15:22+00:00,-73.981918,40.779456,-73.957685,40.771043,2
1,13.0,2009-01-01 01:59:17+00:00,-73.983759,40.721389,-73.994833,40.687179,2
2,10.6,2009-01-01 02:05:03+00:00,-73.956635,40.771254,-73.991528,40.749778,2
3,12.2,2009-01-01 02:09:13+00:00,-73.984605,40.728020,-73.955746,40.776830,1
4,11.0,2009-01-01 02:13:41+00:00,-73.980127,40.737425,-74.009544,40.726025,4
...,...,...,...,...,...,...,...
199995,18.5,2015-06-30 22:57:53+00:00,-73.971703,40.782207,-73.943680,40.827991,2
199996,25.5,2015-06-30 23:16:42+00:00,-74.001099,40.730961,-73.957123,40.806908,2
199997,20.0,2015-06-30 23:31:06+00:00,-73.999962,40.733135,-73.962448,40.773041,4
199998,8.5,2015-06-30 23:33:33+00:00,-73.980988,40.762020,-73.960083,40.770531,1


In [16]:
uber['pickup_datetime_year'] = uber['pickup_datetime'].dt.year
uber['pickup_datetime_month'] = uber['pickup_datetime'].dt.month
uber['pickup_datetime_day'] = uber['pickup_datetime'].dt.day
uber['pickup_datetime_hour'] = uber['pickup_datetime'].dt.hour
uber

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour
0,8.5,2009-01-01 01:15:22+00:00,-73.981918,40.779456,-73.957685,40.771043,2,2009,1,1,1
1,13.0,2009-01-01 01:59:17+00:00,-73.983759,40.721389,-73.994833,40.687179,2,2009,1,1,1
2,10.6,2009-01-01 02:05:03+00:00,-73.956635,40.771254,-73.991528,40.749778,2,2009,1,1,2
3,12.2,2009-01-01 02:09:13+00:00,-73.984605,40.728020,-73.955746,40.776830,1,2009,1,1,2
4,11.0,2009-01-01 02:13:41+00:00,-73.980127,40.737425,-74.009544,40.726025,4,2009,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
199995,18.5,2015-06-30 22:57:53+00:00,-73.971703,40.782207,-73.943680,40.827991,2,2015,6,30,22
199996,25.5,2015-06-30 23:16:42+00:00,-74.001099,40.730961,-73.957123,40.806908,2,2015,6,30,23
199997,20.0,2015-06-30 23:31:06+00:00,-73.999962,40.733135,-73.962448,40.773041,4,2015,6,30,23
199998,8.5,2015-06-30 23:33:33+00:00,-73.980988,40.762020,-73.960083,40.770531,1,2015,6,30,23


In [17]:
# Compute distance from each coordinates
uber['distance'] = np.sqrt((uber['dropoff_latitude']**2 - uber['pickup_latitude']**2) +
                           uber['dropoff_longitude']**2 - uber['pickup_longitude'])
uber

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,distance
0,8.5,2009-01-01 01:15:22+00:00,-73.981918,40.779456,-73.957685,40.771043,2,2009,1,1,1,74.451561
1,13.0,2009-01-01 01:59:17+00:00,-73.983759,40.721389,-73.994833,40.687179,2,2009,1,1,1,74.474385
2,10.6,2009-01-01 02:05:03+00:00,-73.956635,40.771254,-73.991528,40.749778,2,2009,1,1,2,74.477863
3,12.2,2009-01-01 02:09:13+00:00,-73.984605,40.728020,-73.955746,40.776830,1,2009,1,1,2,74.480972
4,11.0,2009-01-01 02:13:41+00:00,-73.980127,40.737425,-74.009544,40.726025,4,2009,1,1,2,74.501437
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,18.5,2015-06-30 22:57:53+00:00,-73.971703,40.782207,-73.943680,40.827991,2,2015,6,30,22,74.467281
199996,25.5,2015-06-30 23:16:42+00:00,-74.001099,40.730961,-73.957123,40.806908,2,2015,6,30,23,74.497313
199997,20.0,2015-06-30 23:31:06+00:00,-73.999962,40.733135,-73.962448,40.773041,4,2015,6,30,23,74.482859
199998,8.5,2015-06-30 23:33:33+00:00,-73.980988,40.762020,-73.960083,40.770531,1,2015,6,30,23,74.463204
