# NYC Yellow Cab Data Engineering Project
# Exploratory Data Analysis 

The purposes of this EDA are to gain familiarity with the dataset, and to identify data issues that require attention.   

In [3]:
#import packages
import configparser
import gc
import json
import os

import boto3
import pandas as pd
from pandas.io.json import json_normalize


In [7]:
# Load the AWS credentials from the local credentials.cfg file.
config = configparser.ConfigParser()
with open('credentials.cfg') as credentials_file:  # close the handle once parsed
    config.read_file(credentials_file)
AWS_KEY_ID = config.get("AWS", "KEY")
AWS_SECRET = config.get("AWS", "SECRET")

s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=AWS_KEY_ID,
    aws_secret_access_key=AWS_SECRET,
)

# Download data only if it does not exist locally yet. Files are stored under
# data/, which is where the weather cells below read them from.
os.makedirs('data', exist_ok=True)

# Trip-record download, left disabled as in the original notebook; enable when needed.
# for month in [f"{i:02d}" for i in range(1, 13)]:
#     if not os.path.exists(f'data/{month}.csv'):
#         s3.download_file(Filename=f'data/{month}.csv', Bucket='nyc-yellow-cab-project',
#                          Key=f"tripdata/2019/{month}.csv")

if not os.path.exists('data/weather.json'):
    s3.download_file(Filename='data/weather.json', Bucket='nyc-yellow-cab-project',
                     Key="weather/weather.json")

## 1. Inspect trips record data

First inspect trip records data for 2019-01 as an example:

In [8]:
# Inspect trip records for 2019-01.
# Read from data/ like the other monthly files and the weather file in this notebook.
trips01 = pd.read_csv('data/01.csv')

In [9]:
# dataset size: one row per trip record
print(f'number of rows: {len(trips01)}')

number of rows: 7667792


In [48]:
# count the missing values in every column
print(trips01.isna().sum())

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count                0
trip_distance                  0
RatecodeID                     0
store_and_fwd_flag             0
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     4855978
dtype: int64


With the exception of the congestion_surcharge feature, there are no missing values. 

In [33]:
# summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
trips01.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
count,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,7667792.0,2811814.0
mean,1.636775,1.567078,2.801084,1.058371,165.5009,163.7529,1.291776,12.40941,0.3280394,0.4968458,1.827367,0.3169187,0.2993382,15.68222,3.289691e-05
std,0.5398204,1.224431,3.737529,0.6780889,66.3918,70.36445,0.4733229,262.0721,0.5074789,0.05337843,2.501213,2.023665,0.01911711,262.2932,0.009068695
min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-362.0,-60.0,-0.5,-63.5,-70.0,-0.3,-362.8,0.0
25%,1.0,1.0,0.9,1.0,130.0,113.0,1.0,6.0,0.0,0.5,0.0,0.0,0.3,8.19,0.0
50%,2.0,1.0,1.53,1.0,162.0,162.0,1.0,8.5,0.0,0.5,1.43,0.0,0.3,11.27,0.0
75%,2.0,2.0,2.8,1.0,234.0,234.0,2.0,13.5,0.5,0.5,2.33,0.0,0.3,16.56,0.0
max,4.0,9.0,831.8,99.0,265.0,265.0,4.0,623259.9,535.38,60.8,787.25,3288.0,0.6,623261.7,2.5


For the VendorID column, according to the data dictionary, there should only be values of 1 and 2, corresponding to the 2 providers. Yet the value 4 is also present. There is no information available online to explain this anomaly. To take a closer look: 

In [53]:
# share of rows for each VendorID value; dropna=False would also list missing values
trips01.VendorID.value_counts(normalize=True,dropna=False)

2    0.606718
1    0.383263
4    0.010019
Name: VendorID, dtype: float64

Only about 1% of rows have VendorID = 4.

In [13]:
# Show the records whose tpep_pickup_datetime falls outside 2019-01.
# The column is read as text (no date parsing in read_csv), so the bounds are compared as strings.
january_start = '2019-01-01 00:00:00'
january_end = '2019-01-31 23:59:59'
pickup_time = trips01.tpep_pickup_datetime
trips01[(pickup_time < january_start) | (pickup_time > january_end)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
2,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.00,1,N,236,236,1,4.5,0.5,0.5,0.00,0.00,0.3,5.80,
3,2,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.00,1,N,193,193,2,3.5,0.5,0.5,0.00,0.00,0.3,7.55,
4,2,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.00,2,N,193,193,2,52.0,0.0,0.5,0.00,0.00,0.3,55.55,
5,2,2018-11-28 16:25:49,2018-11-28 16:28:26,5,0.00,1,N,193,193,2,3.5,0.5,0.5,0.00,5.76,0.3,13.31,
6,2,2018-11-28 16:29:37,2018-11-28 16:33:43,5,0.00,2,N,193,193,2,52.0,0.0,0.5,0.00,0.00,0.3,55.55,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7665728,2,2019-02-01 00:01:04,2019-02-01 00:07:41,1,0.84,1,N,164,50,1,6.5,0.5,0.5,1.56,0.00,0.3,9.36,0.0
7665964,2,2019-02-01 00:00:35,2019-02-01 00:06:06,1,0.91,1,N,164,161,1,5.5,0.5,0.5,1.70,0.00,0.3,8.50,0.0
7666027,2,2019-02-01 00:01:40,2019-02-01 00:32:02,1,18.12,1,N,132,165,1,50.0,0.5,0.5,0.00,0.00,0.3,51.30,0.0
7666250,2,2019-02-01 00:02:20,2019-02-01 00:14:01,1,2.53,1,N,144,170,1,10.5,0.5,0.5,2.36,0.00,0.3,14.16,0.0


Interestingly, some records have pickup times that are not within that month.

In [56]:
# Create the reference feature list (January's column names, in order) to
# compare with the datasets of the other months.
ref_features = list(trips01.columns)
print(ref_features)  # was print(features_list): an undefined name on a fresh kernel

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge']


In [57]:
# Months still to check (02 through 12) as zero-padded strings; 01 was inspected above.
months = [str(month_number).zfill(2) for month_number in range(2, 13)]
print(months)


['02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


In [83]:
# Run the same checks on every remaining month: row count, column layout against
# the January reference, missing values, summary statistics and the maximum fare.
trips = None
for month in months:

    print("**********************************")
    print(f"Checking dataset for month {month}...\n")

    # Drop the previous month's frame before loading the next one, so that two
    # multi-million-row frames are never held in memory at the same time.
    trips = None
    gc.collect()
    trips = pd.read_csv(f'data/{month}.csv')

    print(f'Number of rows: {trips.shape[0]}\n')

    # Column names must match the reference list exactly, including their order.
    if list(trips.columns) == ref_features:
        print('Features are the same as reference features list.\n')
    else:
        print('***WARNING: Some features are different.***\n')

    print('Check missing values...\n')
    print(trips.isnull().sum())
    print("\n")
    print('Check statistical summary of numerical features...\n')
    print(trips.describe())
    print("\n")
    print(f'Maximum fare is {trips.fare_amount.max()}')
    print("\n\n")

**********************************
Checking dataset for month 02...

Number of rows: 7019375

Features are the same as reference features list.

Check missing values...

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  7.019375e+06     7.019375e+06   7.019375e+06  7.019375e+06   
mean   1.636639e+00     1.571420e+00   2.884923e+00  1.061126e+00   
std    5.248609e-01     1.228251e+00   3.780133e+00  6.375023e-01   
m

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 6310419

Features are the same as reference features list.

Check missing values...

VendorID                 33959
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          33959
trip_distance                0
RatecodeID               33959
store_and_fwd_flag       33959
PULocationID                 0
DOLocationID                 0
payment_type             33959
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  6.276460e+06     6.276460e+06   6.310419e+06  6.276460e+06   
mean   1.647390e+00     1.572045e+00   3.110132e+00  1.061235e+00   
std    4.881620e-01     1.214311e+00   4.065832e+00  7.495861e-01  

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 6073357

Features are the same as reference features list.

Check missing values...

VendorID                 33321
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          33321
trip_distance                0
RatecodeID               33321
store_and_fwd_flag       33321
PULocationID                 0
DOLocationID                 0
payment_type             33321
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  6.040036e+06     6.040036e+06   6.073357e+06  6.040036e+06   
mean   1.644728e+00     1.573908e+00   3.162674e+00  1.064386e+00   
std    4.788811e-01     1.213185e+00   4.124069e+00  7.894866e-01  

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 6567788

Features are the same as reference features list.

Check missing values...

VendorID                 34089
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          34089
trip_distance                0
RatecodeID               34089
store_and_fwd_flag       34089
PULocationID                 0
DOLocationID                 0
payment_type             34089
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         1
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  6.533699e+06     6.533699e+06   6.567788e+06  6.533699e+06   
mean   1.648833e+00     1.549407e+00   3.086974e+00  1.064147e+00   
std    4.773602e-01     1.192190e+00   4.036779e+00  8.400102e-01  

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 7213891

Features are the same as reference features list.

Check missing values...

VendorID                 46723
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          46723
trip_distance                0
RatecodeID               46723
store_and_fwd_flag       46723
PULocationID                 0
DOLocationID                 0
payment_type             46723
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  7.167168e+06     7.167168e+06   7.213891e+06  7.167168e+06   
mean   1.653885e+00     1.538858e+00   3.016725e+00  1.062221e+00   
std    4.757303e-01     1.178510e+00   3.940327e+00  8.380422e-01  

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 6878111

Features are the same as reference features list.

Check missing values...

VendorID                 47491
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          47491
trip_distance                0
RatecodeID               47491
store_and_fwd_flag       47491
PULocationID                 0
DOLocationID                 0
payment_type             47491
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         2
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  6.830620e+06     6.830620e+06   6.878111e+06  6.830620e+06   
mean   1.659567e+00     1.541836e+00   2.928827e+00  1.060229e+00   
std    4.738550e-01     1.176670e+00   8.130716e+00  8.126997e-01  

  interactivity=interactivity, compiler=compiler, result=result)


Number of rows: 6896317

Features are the same as reference features list.

Check missing values...

VendorID                 51018
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          51018
trip_distance                0
RatecodeID               51018
store_and_fwd_flag       51018
PULocationID                 0
DOLocationID                 0
payment_type             51018
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
dtype: int64


Check statistical summary of numerical features...

           VendorID  passenger_count  trip_distance    RatecodeID  \
count  6.845299e+06     6.845299e+06   6.896317e+06  6.845299e+06   
mean   1.666457e+00     1.550877e+00   2.973421e+00  1.065756e+00   
std    4.714787e-01     1.174330e+00   1.643113e+01  9.309869e-01  

A few observations:  

1. Datasets of all months have the same columns.

2. VendorID of 4 is a recurring issue. Here I assume that it means "UNKNOWN" and will use this information in EMR.

3. In some months (e.g. 07, 08), VendorID, passenger_count, RatecodeID, store_and_fwd_flag, and payment_type have missing values. I assume those are erroneous records and they will be removed accordingly.

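4. Starting from month 07 and beyond, pandas emits a warning about mixed data types in column 6. pandas reports zero-based column positions, so this is store_and_fwd_flag rather than RatecodeID (RatecodeID is parsed as a numeric column in the statistical summaries). Those are the erroneous records that will be removed accordingly.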

5. Some fares are unreasonably high (e.g. the max fare is $671123.14 in month 02). However, we will keep these records so that data scientists and analysts can evaluate them.

## 2. Inspect Weather Data

In [14]:
# load the raw weather file into a DataFrame for a first look
weather = pd.read_json('data/weather.json')

In [15]:
# preview the first rows of the raw weather frame
weather.head()

Unnamed: 0,city_name,lat,lon,main,wind,clouds,weather,dt,dt_iso,timezone,rain,snow
0,New York,40.712775,-74.005973,"{'temp': 29.32, 'temp_min': 27.94, 'temp_max':...","{'speed': 3.36, 'deg': 0}",{'all': 1},"[{'id': 800, 'main': 'Clear', 'description': '...",1420070400,2015-01-01 00:00:00 +0000 UTC,-18000,,
1,New York,40.712775,-74.005973,"{'temp': 28.81, 'temp_min': 24.98, 'temp_max':...","{'speed': 4.7, 'deg': 260}",{'all': 1},"[{'id': 800, 'main': 'Clear', 'description': '...",1420074000,2015-01-01 01:00:00 +0000 UTC,-18000,,
2,New York,40.712775,-74.005973,"{'temp': 28.15, 'temp_min': 23, 'temp_max': 30...","{'speed': 3.36, 'deg': 240}",{'all': 1},"[{'id': 800, 'main': 'Clear', 'description': '...",1420077600,2015-01-01 02:00:00 +0000 UTC,-18000,,
3,New York,40.712775,-74.005973,"{'temp': 28.09, 'temp_min': 24.08, 'temp_max':...","{'speed': 4.7, 'deg': 0}",{'all': 1},"[{'id': 800, 'main': 'Clear', 'description': '...",1420081200,2015-01-01 03:00:00 +0000 UTC,-18000,,
4,New York,40.712775,-74.005973,"{'temp': 28.26, 'temp_min': 24.98, 'temp_max':...","{'speed': 7.96, 'deg': 224}",{'all': 1},"[{'id': 800, 'main': 'Clear', 'description': '...",1420084800,2015-01-01 04:00:00 +0000 UTC,-18000,,


First, features such as city_name, lat, and lon are irrelevant.  
Also, some of the data is nested JSON. We will evaluate the columns that are relevant to the ETL pipeline. 

In [16]:
# Load the raw weather records with the json module; the context manager
# closes the file handle once the records are parsed.
with open('data/weather.json') as weather_file:
    weather_list = json.load(weather_file)

In [18]:
# A sample weather record: the first element of weather_list
weather_list[0]

{'city_name': 'New York',
 'lat': 40.712775,
 'lon': -74.005973,
 'main': {'temp': 29.32,
  'temp_min': 27.94,
  'temp_max': 30.92,
  'feels_like': 21.43,
  'pressure': 1025,
  'humidity': 37},
 'wind': {'speed': 3.36, 'deg': 0},
 'clouds': {'all': 1},
 'weather': [{'id': 800,
   'main': 'Clear',
   'description': 'sky is clear',
   'icon': '01n'}],
 'dt': 1420070400,
 'dt_iso': '2015-01-01 00:00:00 +0000 UTC',
 'timezone': -18000}

In [19]:
# Normalize the data, keeping only the relevant columns: one row per entry of
# each record's "weather" list, with the selected top-level and nested fields
# attached to it.
weather_meta = [
    ['dt_iso'],
    ['main', 'temp'],
    ['main', 'temp_min'],
    ['main', 'temp_max'],
    ['main', 'feels_like'],
    ['main', 'pressure'],
    ['main', 'humidity'],
    ['wind', 'speed'],
    ['wind', 'deg'],
    ['clouds', 'all'],
]

weather = pd.json_normalize(weather_list, record_path="weather", meta=weather_meta)

In [20]:
# preview the normalized weather frame
weather.head()

Unnamed: 0,id,main,description,icon,dt_iso,main.temp,main.temp_min,main.temp_max,main.feels_like,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all
0,800,Clear,sky is clear,01n,2015-01-01 00:00:00 +0000 UTC,29.32,27.94,30.92,21.43,1025,37,3.36,0,1
1,800,Clear,sky is clear,01n,2015-01-01 01:00:00 +0000 UTC,28.81,24.98,32.0,20.14,1024,37,4.7,260,1
2,800,Clear,sky is clear,01n,2015-01-01 02:00:00 +0000 UTC,28.15,23.0,30.92,20.14,1024,35,3.36,240,1
3,800,Clear,sky is clear,01n,2015-01-01 03:00:00 +0000 UTC,28.09,24.08,30.2,19.42,1023,38,4.7,0,1
4,800,Clear,sky is clear,01n,2015-01-01 04:00:00 +0000 UTC,28.26,24.98,30.92,17.87,1023,42,7.96,224,1


In [21]:
# Limit the data to year 2019, starting one day early (2018-12-31).
# dt_iso holds text, so the bounds are compared as strings: the upper bound
# '2020-01-01' sorts before any full '2020-01-01 hh:mm:ss ...' value.
in_date_window = weather.dt_iso.between('2018-12-31', '2020-01-01')
weather = weather.loc[in_date_window]

In [22]:
# preview the first rows that remain after the date filter
weather.head()

Unnamed: 0,id,main,description,icon,dt_iso,main.temp,main.temp_min,main.temp_max,main.feels_like,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all
36019,803,Clouds,broken clouds,04n,2018-12-31 00:00:00 +0000 UTC,38.17,35.6,39.99,31.8,1025,75,4.7,190,63
36020,802,Clouds,scattered clouds,03n,2018-12-31 01:00:00 +0000 UTC,37.89,35.01,39.99,30.9,1025,76,5.82,180,39
36021,800,Clear,sky is clear,01n,2018-12-31 02:00:00 +0000 UTC,37.38,34.0,39.99,30.96,1025,76,4.7,210,1
36022,800,Clear,sky is clear,01n,2018-12-31 03:00:00 +0000 UTC,36.63,32.0,41.0,30.97,1025,79,3.36,280,1
36023,800,Clear,sky is clear,01n,2018-12-31 04:00:00 +0000 UTC,36.1,30.99,41.0,30.47,1025,81,3.36,0,1


In [29]:
# number of weather rows left after the date filter
print(f'number of records: {len(weather)}')

number of records: 8872


In [23]:
# count the missing values in every weather column
weather.isna().sum()

id                 0
main               0
description        0
icon               0
dt_iso             0
main.temp          0
main.temp_min      0
main.temp_max      0
main.feels_like    0
main.pressure      0
main.humidity      0
wind.speed         0
wind.deg           0
clouds.all         0
dtype: int64

There is no missing data in the weather dataset.

In [25]:
# Cast the numerical weather features to float in a single assignment
num_features_list = [
    'main.temp',
    'main.temp_min',
    'main.temp_max',
    'main.feels_like',
    'main.pressure',
    'main.humidity',
    'wind.speed',
    'wind.deg',
    'clouds.all',
]

weather[num_features_list] = weather[num_features_list].astype('float')

In [26]:
# summary statistics for the numeric weather columns
weather.describe()

Unnamed: 0,id,main.temp,main.temp_min,main.temp_max,main.feels_like,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all
count,8872.0,8872.0,8872.0,8872.0,8872.0,8872.0,8872.0,8872.0,8872.0,8872.0
mean,726.389766,55.23805,51.933394,58.143291,50.284022,1016.872858,70.244026,9.167759,199.805681,50.607642
std,128.88203,18.028087,17.531875,18.22296,23.353538,8.143625,14.995459,5.72202,103.999515,43.458812
min,500.0,2.61,0.0,5.77,-10.95,987.0,25.0,0.18,0.0,0.0
25%,601.0,40.08,37.0,43.0,30.39,1012.0,59.0,4.985,113.0,0.0
50%,800.0,55.67,52.88,58.82,48.92,1017.0,71.0,7.63,220.0,53.0
75%,803.0,70.755,67.1025,73.4,70.84,1023.0,82.0,12.08,289.0,99.0
max,804.0,97.59,93.02,100.99,107.6,1041.0,100.0,42.3,360.0,100.0


The numbers are within a reasonable range.

## 3. To-do-list

1. Update vendor.json by inserting {"id": 4, "provider": "UNKNOWN"}
2. Update rate.json by inserting {"id": 99, "rate": "UNKNOWN"}
3. (In EMR) Remove trip records where VendorID is null.
4. (In EMR) Remove records with a pickup datetime outside the expected datetime range.