# Exploratory Data Analysis

## Reading files

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the NYC taxi trip records from a CSV file (path is relative to the working directory)
data = pd.read_csv('nyc_taxi_trip_duration.csv')

In [3]:
# Preview the first 5 rows with "head()" to get a feel for the columns and their values
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [4]:
# Dimensions of the data via the "shape" attribute: output is (rows, columns)
data.shape

(729322, 11)

In [5]:
# Listing the names of all the columns present in data
data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

- trip_duration is the target variable

In [21]:
# Number of distinct values in each column.
# NOTE(review): this cell carries execution count 21 and its saved output lists the
# derived hod_/doy_/woy_/moy_/dow_ columns, so it was evidently run after the datetime
# feature extraction further down. Run at this position on a fresh kernel, the result
# would cover the original columns instead.
data.nunique()

id                    729322
vendor_id                  2
passenger_count            9
pickup_longitude       19729
pickup_latitude        39776
dropoff_longitude      27892
dropoff_latitude       53579
store_and_fwd_flag         2
trip_duration           6296
hod_pickup                24
hod_dropoff               24
doy_pickup               182
doy_dropoff              183
woy_pickup                27
woy_dropoff               27
moy_pickup                 6
moy_dropoff                7
dow_pickup                 7
dow_dropoff                7
dtype: int64

## Variable Identification and Typecasting

In [6]:
# A closer look at the data type pandas assigned to each column on load
data.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

There are a lot of variables visible at once, so let's narrow this down by looking **at one datatype at a time**. We will start with int.

### Integer Data Type

In [7]:
# Identifying variables with integer datatype: a boolean mask on dtypes keeps only the int64 columns
data.dtypes[data.dtypes == 'int64']

vendor_id          int64
passenger_count    int64
trip_duration      int64
dtype: object

Summary:

- **vendor_id**  is a code indicating the provider associated with the trip record. It should be  **category.**
- **passenger_count** is the number of passengers in the vehicle (driver entered value). It is **Okay as Integer.**
- **trip_duration** is (target) duration of the trip in seconds. It is **Okay as Integer.**

In [8]:
# typecasting "vendor_id" to category type, since it is a provider code rather than a quantity
data['vendor_id'] = data['vendor_id'].astype('category')

### Float Data Type

In [9]:
# Identifying variables with float datatype: a boolean mask on dtypes keeps only the float64 columns
data.dtypes[data.dtypes == 'float64']

pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
dtype: object

Summary:

- **pickup_longitude**  the longitude where the meter was engaged. It is **Okay as Float.**
- **pickup_latitude** the latitude where the meter was engaged. It is **Okay as Float.**
- **dropoff_longitude** the longitude where the meter was disengaged. It is **Okay as Float.**
- **dropoff_latitude** the latitude where the meter was disengaged. It is **Okay as Float.**

### Object Data Type

In [10]:
# Re-checking the dtypes after the vendor_id cast to see which columns are still of type object
data.dtypes

id                      object
vendor_id             category
pickup_datetime         object
dropoff_datetime        object
passenger_count          int64
pickup_longitude       float64
pickup_latitude        float64
dropoff_longitude      float64
dropoff_latitude       float64
store_and_fwd_flag      object
trip_duration            int64
dtype: object

*    **variables like 'pickup_datetime', 'dropoff_datetime' and 'store_and_fwd_flag' are of type object**. This means that **Pandas was not able to recognise the datatype** of these three variables.

In [11]:
# Manually checking object types by looking at the first 7 rows of the object columns
data[['id','pickup_datetime','dropoff_datetime','store_and_fwd_flag']].head(7)

Unnamed: 0,id,pickup_datetime,dropoff_datetime,store_and_fwd_flag
0,id1080784,2016-02-29 16:40:21,2016-02-29 16:47:01,N
1,id0889885,2016-03-11 23:35:37,2016-03-11 23:53:57,N
2,id0857912,2016-02-21 17:59:33,2016-02-21 18:26:48,N
3,id3744273,2016-01-05 09:44:31,2016-01-05 10:03:32,N
4,id0232939,2016-02-17 06:42:23,2016-02-17 06:56:31,N
5,id1918069,2016-02-14 18:31:42,2016-02-14 18:55:57,N
6,id2429028,2016-04-20 20:30:14,2016-04-20 20:36:51,N


*    **id** is a unique identifier for each trip. **It should be a String**.
*    **store_and_fwd_flag** variable **belongs to the categorical data type**.
*    **pickup_datetime** and **dropoff_datetime** belong to **datetime data type**.

In [12]:
# typecasting "id" from generic object to the pandas string type
data['id'] = data['id'].astype('string')

In [13]:
# typecasting "store_and_fwd_flag" from object to category type
data['store_and_fwd_flag'] = data['store_and_fwd_flag'].astype('category')

In [14]:
# checking that the cast took effect (double brackets select a DataFrame, so .dtypes lists the column by name)
data[['store_and_fwd_flag']].dtypes

store_and_fwd_flag    category
dtype: object

In [15]:
# Full dtype overview after the id / store_and_fwd_flag casts; only the two datetime columns are still object
data.dtypes

id                      string
vendor_id             category
pickup_datetime         object
dropoff_datetime        object
passenger_count          int64
pickup_longitude       float64
pickup_latitude        float64
dropoff_longitude      float64
dropoff_latitude       float64
store_and_fwd_flag    category
trip_duration            int64
dtype: object

### datetime Data Type

In [16]:
# creating instances (pickup_date and dropoff_date) of the DatetimeIndex class from the
# "pickup_datetime" and "dropoff_datetime" columns, so that date/time components
# (hour, month, ...) can be read off as attributes in the following cell
pickup_date = pd.DatetimeIndex(data['pickup_datetime'])
dropoff_date = pd.DatetimeIndex(data['dropoff_datetime'])

In [17]:
# extracting new columns from "pickup_datetime" and "dropoff_datetime"
# (pickup_date / dropoff_date are the DatetimeIndex objects built in the previous cell)

# hour of day when trip started and ended
data['hod_pickup'] = pickup_date.hour
data['hod_dropoff'] = dropoff_date.hour

# day of year when trip started and ended
data['doy_pickup'] = pickup_date.dayofyear
data['doy_dropoff'] = dropoff_date.dayofyear

# week of year (ISO calendar week) when trip started and ended
# isocalendar() returns a frame indexed by the timestamps themselves, so the week column
# is converted to a plain int64 array before assignment; assigning it directly would make
# pandas align on index labels against data's row index instead of assigning by position.
# to_numpy() replaces pd.Int64Index, which was deprecated in pandas 1.4 and removed in 2.0.
data['woy_pickup'] = pickup_date.isocalendar().week.to_numpy(dtype='int64')
data['woy_dropoff'] = dropoff_date.isocalendar().week.to_numpy(dtype='int64')

# month of year when trip started and ended
data['moy_pickup'] = pickup_date.month
data['moy_dropoff'] = dropoff_date.month

# day of week when trip started and ended (Monday = 0 ... Sunday = 6)
data['dow_pickup'] = pickup_date.dayofweek
data['dow_dropoff'] = dropoff_date.dayofweek

In [18]:
# previewing the raw timestamps side by side with the features derived from them:
# pickup-derived columns first, then the dropoff-derived ones
data.head()[['pickup_datetime', 'dropoff_datetime',
             'hod_pickup', 'doy_pickup', 'woy_pickup', 'moy_pickup', 'dow_pickup',
             'hod_dropoff', 'doy_dropoff', 'woy_dropoff', 'moy_dropoff', 'dow_dropoff']]

Unnamed: 0,pickup_datetime,dropoff_datetime,hod_pickup,doy_pickup,woy_pickup,moy_pickup,dow_pickup,hod_dropoff,doy_dropoff,woy_dropoff,moy_dropoff,dow_dropoff
0,2016-02-29 16:40:21,2016-02-29 16:47:01,16,60,9,2,0,16,60,9,2,0
1,2016-03-11 23:35:37,2016-03-11 23:53:57,23,71,10,3,4,23,71,10,3,4
2,2016-02-21 17:59:33,2016-02-21 18:26:48,17,52,7,2,6,18,52,7,2,6
3,2016-01-05 09:44:31,2016-01-05 10:03:32,9,5,1,1,1,10,5,1,1,1
4,2016-02-17 06:42:23,2016-02-17 06:56:31,6,48,7,2,2,6,48,7,2,2


The first 2 columns are the complete date time when the trip started/ended.

The next columns represent the hour of the day, day of year, week of year, month of year and day of week when the trip started. The remaining columns represent the hour of the day, day of year, week of year, month of year and day of week when the trip ended.

**Breaking down the date variables** into this granular information will **help us understand when the trip was done from different perspectives**. Now that we have extracted the essentials from the pickup_datetime and dropoff_datetime variables, we will drop them from the dataset.

In [19]:
# the raw timestamp columns are removed now that their components live in separate columns;
# the dtypes listing confirms what is left
data = data.drop(['pickup_datetime', 'dropoff_datetime'], axis='columns')
data.dtypes

id                      string
vendor_id             category
passenger_count          int64
pickup_longitude       float64
pickup_latitude        float64
dropoff_longitude      float64
dropoff_latitude       float64
store_and_fwd_flag    category
trip_duration            int64
hod_pickup               int64
hod_dropoff              int64
doy_pickup               int64
doy_dropoff              int64
woy_pickup               int64
woy_dropoff              int64
moy_pickup               int64
moy_dropoff              int64
dow_pickup               int64
dow_dropoff              int64
dtype: object

## Univariate Analysis: Numerical Variables

In [20]:
# Numerical datatypes: keep only the integer and float columns and list their dtypes
data.select_dtypes(include=['int64','float64','Int64']).dtypes

passenger_count        int64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
trip_duration          int64
hod_pickup             int64
hod_dropoff            int64
doy_pickup             int64
doy_dropoff            int64
woy_pickup             int64
woy_dropoff            int64
moy_pickup             int64
moy_dropoff            int64
dow_pickup             int64
dow_dropoff            int64
dtype: object

In [None]:
# segregating numerical variables into groups so each group can be analysed together
# trip_detail: passenger count, target duration and two pickup-time features
trip_detail = ['passenger_count','trip_duration', 'hod_pickup', 'doy_pickup']
# trip_points: pickup and dropoff coordinates
trip_points = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

In [None]:
# custom function for easy and efficient analysis of numerical univariate

def UVA_numeric(data, var_group):
  '''
  Univariate_Analysis_numeric
  takes a group of variables (INTEGER and FLOAT) and plot/print all the descriptives and properties along with KDE.

  Runs a loop: calculate all the descriptives of i(th) variable and plot/print it
  '''

  size = len(var_group)
  plt.figure(figsize = (7*size,3), dpi = 100)
  
  #looping for each variable
  for j,i in enumerate(var_group):
    
    # calculating descriptives of variable
    mini = data[i].min()
    maxi = data[i].max()
    ran = data[i].max()-data[i].min()
    mean = data[i].mean()
    median = data[i].median()
    st_dev = data[i].std()
    skew = data[i].skew()
    kurt = data[i].kurtosis()

    # calculating points of standard deviation
    points = mean-st_dev, mean+st_dev

    #Plotting the variable with every information
    plt.subplot(1,size,j+1)
    sns.kdeplot(data[i], shade=True)
    sns.lineplot(points, [0,0], color = 'black', label = "std_dev")
    sns.scatterplot([mini,maxi], [0,0], color = 'orange', label = "min/max")
    sns.scatterplot([mean], [0], color = 'red', label = "mean")
    sns.scatterplot([median], [0], color = 'blue', label = "median")
    plt.xlabel('{}'.format(i), fontsize = 20)
    plt.ylabel('density')
    plt.title('std_dev = {}; kurtosis = {};\nskew = {}; range = {}\nmean = {}; median = {}'.format((round(points[0],2),round(points[1],2)),
                                                                                                   round(kurt,2),
       