### Green Taxi Trip Records

In [1]:
#Importar librerías.
import numpy as np
import pandas as pd

In [2]:
# Load the dictionary of table links from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
ruta_diccionario = 'datos_tlc/tlc_verdes'
diccionario = pd.read_pickle(ruta_diccionario)

In [3]:
# Download the first table (12/2018).
# NOTE(review): exec() runs whatever code is stored under this key inside the
# notebook's namespace; only use a dictionary file from a trusted source.
# The cells below rely on this call defining the variable `green_2018_61`.
exec(diccionario['green_2018_61'])

In [4]:
# Inspect the first table (12/2018): column names, non-null counts and dtypes.
green_2018_61.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719654 entries, 0 to 719653
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               719654 non-null  int64         
 1   lpep_pickup_datetime   719654 non-null  datetime64[ns]
 2   lpep_dropoff_datetime  719654 non-null  datetime64[ns]
 3   store_and_fwd_flag     685031 non-null  object        
 4   RatecodeID             685031 non-null  float64       
 5   PULocationID           719654 non-null  int64         
 6   DOLocationID           719654 non-null  int64         
 7   passenger_count        685031 non-null  float64       
 8   trip_distance          719654 non-null  float64       
 9   fare_amount            719654 non-null  float64       
 10  extra                  719654 non-null  float64       
 11  mta_tax                719654 non-null  float64       
 12  tip_amount             719654 non-null  floa

Eliminar:
* VendorID (Código que indica el proveedor de datos)
* store_and_fwd_flag (Indica si el registro del viaje se mantuvo en memoria del vehículo antes de enviarlo al proveedor)
* RatecodeID (Código de tarifa final en vigor al final del viaje)
* ehail_fee (?)

In [5]:
# Remove the columns that are not needed for the analysis.
columnas_indeseadas = ['VendorID', 'store_and_fwd_flag', 'RatecodeID', 'ehail_fee']
green_2018_61 = green_2018_61.drop(columnas_indeseadas, axis='columns')

Separar:
* lpep_pickup_datetime
* lpep_dropoff_datetime

En cuatro columnas: día (YYYY-MM-DD) y hora de pickup, y día (YYYY-MM-DD) y hora de dropoff.

In [6]:
# Add 'pickup_day' as the first column: the date part of the pickup timestamp.
green_2018_61.insert(
    loc=0,
    column='pickup_day',
    value=green_2018_61['lpep_pickup_datetime'].dt.date,
)

In [7]:
# Add 'pickup_time' as the second column: the time-of-day part of the pickup timestamp.
green_2018_61.insert(
    loc=1,
    column='pickup_time',
    value=green_2018_61['lpep_pickup_datetime'].dt.time,
)

In [8]:
# Add 'dropoff_day' as the third column: the date part of the drop-off timestamp.
green_2018_61.insert(
    loc=2,
    column='dropoff_day',
    value=green_2018_61['lpep_dropoff_datetime'].dt.date,
)

In [9]:
# Add 'dropoff_time' as the fourth column: the time-of-day part of the drop-off timestamp.
green_2018_61.insert(
    loc=3,
    column='dropoff_time',
    value=green_2018_61['lpep_dropoff_datetime'].dt.time,
)

In [10]:
# Remove the original timestamp columns, which the day/time columns replace.
columnas_fecha_hora = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']
green_2018_61 = green_2018_61.drop(columnas_fecha_hora, axis='columns')

Identificar valores faltantes.

In [11]:
# Count the missing values in each column.
green_2018_61.isna().sum()

pickup_day                    0
pickup_time                   0
dropoff_day                   0
dropoff_time                  0
PULocationID                  0
DOLocationID                  0
passenger_count           34623
trip_distance                 0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
payment_type              34623
trip_type                 35019
congestion_surcharge     719654
dtype: int64

In [12]:
# Replace missing 'congestion_surcharge' values with 0.
recargo_congestion = green_2018_61['congestion_surcharge']
green_2018_61['congestion_surcharge'] = recargo_congestion.fillna(0)

In [13]:
# Convert 'congestion_surcharge' to float.
# astype() returns a new Series and leaves the DataFrame untouched, so the
# result has to be assigned back for the conversion to take effect.
green_2018_61['congestion_surcharge'] = green_2018_61['congestion_surcharge'].astype(float)

# Display the converted column to confirm its dtype.
green_2018_61['congestion_surcharge']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
719649    0.0
719650    0.0
719651    0.0
719652    0.0
719653    0.0
Name: congestion_surcharge, Length: 719654, dtype: float64

Al parecer, las columnas 'passenger_count' y 'payment_type' tienen la misma cantidad de valores faltantes, y varios de ellos coinciden con los de 'trip_type'.

In [14]:
# Look at the three columns with missing values, restricted to the rows where
# 'passenger_count' is missing, and add each column up.
# sum() skips NaN, so a total of 0.0 suggests the column holds no non-null
# values in those rows (assuming its values are positive codes; confirm).
columnas_con_nulos = ['passenger_count', 'payment_type', 'trip_type']
sin_pasajeros = green_2018_61['passenger_count'].isnull()
green_2018_61.loc[sin_pasajeros, columnas_con_nulos].sum()

passenger_count    0.0
payment_type       0.0
trip_type          0.0
dtype: float64

Imputar los valores nulos de 'passenger_count' con el promedio redondeado al entero más cercano.

In [15]:
# Mean of 'passenger_count', rounded to a whole number (zero decimals).
promedio_pasajeros = green_2018_61['passenger_count'].mean()
promedio = np.round(promedio_pasajeros)

In [16]:
# Replace missing 'passenger_count' values with the rounded mean.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions warn
# about and which leaves the DataFrame unchanged under Copy-on-Write.
green_2018_61['passenger_count'] = green_2018_61['passenger_count'].fillna(promedio)

Imputar valores nulos de 'payment_type' y 'trip_type' con su respectiva moda.

In [17]:
# Mode of 'payment_type' and 'trip_type', each as a single integer.
# mode() always returns a Series (there can be ties), so the first modal value
# is extracted with .iloc[0]. Passing the whole Series to fillna() would align
# it on the index and fill at most the row labelled 0.
# NOTE(review): .iloc[0] raises IndexError if a column has no non-null values.
moda_payment = int(green_2018_61['payment_type'].mode().iloc[0])
moda_trip = int(green_2018_61['trip_type'].mode().iloc[0])

In [18]:
# Replace missing 'payment_type' and 'trip_type' values with their mode.
# The results are assigned back to the columns: calling fillna(inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which leaves the DataFrame unchanged under Copy-on-Write.
green_2018_61['payment_type'] = green_2018_61['payment_type'].fillna(value=moda_payment)
green_2018_61['trip_type'] = green_2018_61['trip_type'].fillna(value=moda_trip)

Se rellenan con medias y modas los posibles nulos de las demás columnas:

In [19]:
# Column means, rounded to zero decimals, to be used as fill values.
# NOTE(review): these are float columns; rounding the mean to a whole number
# may be coarser than intended for distances and amounts. Confirm.
media_distance = np.round(green_2018_61['trip_distance'].mean())
media_fare = np.round(green_2018_61['fare_amount'].mean())
media_tip = np.round(green_2018_61['tip_amount'].mean())
media_tolls = np.round(green_2018_61['tolls_amount'].mean())
media_i_s = np.round(green_2018_61['improvement_surcharge'].mean())


In [20]:
# Replace any missing values in these columns with the column mean.
# The results are assigned back to the columns: calling fillna(inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which leaves the DataFrame unchanged under Copy-on-Write.
green_2018_61['trip_distance'] = green_2018_61['trip_distance'].fillna(value=media_distance)
green_2018_61['fare_amount'] = green_2018_61['fare_amount'].fillna(value=media_fare)
green_2018_61['tip_amount'] = green_2018_61['tip_amount'].fillna(value=media_tip)
green_2018_61['tolls_amount'] = green_2018_61['tolls_amount'].fillna(value=media_tolls)
green_2018_61['improvement_surcharge'] = green_2018_61['improvement_surcharge'].fillna(value=media_i_s)


In [21]:
# Mode of 'extra' and 'mta_tax', each as a single value.
# mode() always returns a Series (there can be ties), so the first modal value
# is extracted with .iloc[0]. Passing the whole Series to fillna() would align
# it on the index and fill at most the row labelled 0.
# The value is not cast to int: both columns are float64, and an int cast
# would truncate a fractional mode (e.g. 0.5 would become 0).
# NOTE(review): .iloc[0] raises IndexError if a column has no non-null values.
moda_extra = green_2018_61['extra'].mode().iloc[0]
moda_tax = green_2018_61['mta_tax'].mode().iloc[0]


In [22]:
# Replace any missing 'extra' and 'mta_tax' values with their mode.
# The results are assigned back to the columns: calling fillna(inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# warn about and which leaves the DataFrame unchanged under Copy-on-Write.
green_2018_61['extra'] = green_2018_61['extra'].fillna(value=moda_extra)
green_2018_61['mta_tax'] = green_2018_61['mta_tax'].fillna(value=moda_tax)