# Aplicando EDA al dataset Taxi Fare

Instalamos las dependencias necesarias para realizar el análisis

In [2]:
!pip install pandas
!pip install numpy
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


Importamos los módulos a utilizar posteriormente

```
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time

from multiprocessing import cpu_count, Pool
from math import radians, cos, sin, asin, sqrt
```

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import numpy as np
from multiprocessing import cpu_count, Pool
from math import radians, cos, sin, asin, sqrt

# Esto es solo para la visualización en Pycharm
sns.set_style("ticks")

sns

<module 'seaborn' from '/home/bpm21/.local/lib/python3.9/site-packages/seaborn/__init__.py'>

Inicializamos algunas constantes a utilizar

* **FILE_PATH:** Contiene la ruta de nuestro dataset.
* **EARTH_RADIUS:** Es el valor promedio del radio de la Tierra en kilómetros.
* **CHUNK_SIZE:** Indica el tamaño del conjunto de datos que se procesará en cada iteración.
* **AVAILABLE_CPU:** Tiene el número de threads que estarán disponibles en el uso de este cuaderno.

In [2]:
FILE_PATH = "train.csv"
EARTH_RADIUS = 6378.0 # Lo utilizamos en el cálculo de la fórmula de Haversine.
CHUNK_SIZE = 100000
AVAILABLE_CPU =  cpu_count() - 1 # Disminuimos uno del total para evitar que la pc se queda inutilizable.

FILE_PATH, EARTH_RADIUS, CHUNK_SIZE, AVAILABLE_CPU

('train.csv', 6378.0, 100000, 11)

# Fórmula Haversine

Esta fórmula nos servirá para poder calcular la distancia entre 2 puntos geográficos.

* TODO Insertar ecuación Haversine en latex

Está fórmula será implementada en la función `calculate_haversine_distance`, la cual recibe una columna de tuplas con
los puntos de latitud y longitud tanto de la posición en **pickup** como en **drop off**.

In [3]:
def calculate_haversine_distance(pickup_position, drop_off_position):
    pickup_lat, pickup_lng = pickup_position
    drop_off_lat, drop_off_lng = drop_off_position

    pickup_lat, pickup_lng, drop_off_lat, drop_off_lng = map(
        radians,
        (pickup_lat, pickup_lng, drop_off_lat, drop_off_lng)
    )

    lat_diff = drop_off_lat - pickup_lat
    lng_diff = drop_off_lng - pickup_lng

    distance = sin(lat_diff * 0.5) ** 2 + cos(pickup_lat) * cos(drop_off_lat) * sin(lng_diff * 0.5) ** 2

    return 2 * EARTH_RADIUS * asin(sqrt(distance))

calculate_haversine_distance

<function __main__.calculate_haversine_distance(pickup_position, drop_off_position)>

In [4]:
chunk_list = []


def parallelize_chunk_treatment(df, func):
    df_split = np.array_split(df, AVAILABLE_CPU)
    pool = Pool(AVAILABLE_CPU)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    chunk_list.append(df)


def treat_chunk(chunk_):
    chunk_.dropna()
    chunk_['distance'] = chunk_.apply(
        lambda row: calculate_haversine_distance(
            (row.pickup_latitude, row.pickup_longitude),
            (row.dropoff_latitude, row.dropoff_longitude)),
        axis=1
    )
    return chunk_.drop(columns[2:5], axis=1)

In [5]:
columns = ['fare_amount',
           'pickup_datetime',
           'pickup_longitude',
           'pickup_latitude',
           'dropoff_longitude',
           'dropoff_latitude',
           'passenger_count'
           ]

df_chunk = pd.read_csv(FILE_PATH, chunksize = CHUNK_SIZE, usecols = columns)

start_time = time.time()
for chunk in df_chunk:
    parallelize_chunk_treatment(chunk, treat_chunk)
    del chunk

df_concat = pd.concat(chunk_list)
del chunk_list
print("--- %s seconds ---" % (time.time() - start_time))

--- 718.2036235332489 seconds ---


In [6]:
df_concat.drop(df_concat[df_concat['distance'] <= 0].index, inplace=True)
df_concat.drop(df_concat[df_concat['fare_amount'] <= 0].index, inplace=True)
df_concat.drop(df_concat[df_concat['passenger_count'] <= 0].index, inplace=True)

In [11]:
df_concat['pickup_datetime'].describe()

count                    53647190
unique                   25492351
top       2009-11-01 01:17:00 UTC
freq                           62
Name: pickup_datetime, dtype: object

In [12]:
df_concat['pickup_datetime']

0           2009-06-15 17:26:21 UTC
1           2010-01-05 16:52:16 UTC
2           2011-08-18 00:35:00 UTC
3           2012-04-21 04:30:42 UTC
4           2010-03-09 07:51:00 UTC
                     ...           
55423851    2014-03-15 03:28:00 UTC
55423852    2009-03-24 20:46:20 UTC
55423853    2011-04-02 22:04:24 UTC
55423854    2011-10-26 05:57:51 UTC
55423855    2014-12-12 11:33:00 UTC
Name: pickup_datetime, Length: 53647190, dtype: object

In [13]:
df_concat.head()

Unnamed: 0,fare_amount,pickup_datetime,dropoff_latitude,passenger_count,distance
0,4.5,2009-06-15 17:26:21 UTC,40.712278,1,1.031896
1,16.9,2010-01-05 16:52:16 UTC,40.782004,1,8.459418
2,5.7,2011-08-18 00:35:00 UTC,40.750562,2,1.391052
3,7.7,2012-04-21 04:30:42 UTC,40.758092,1,2.802346
4,5.3,2010-03-09 07:51:00 UTC,40.783762,1,2.001353


In [17]:
df_concat['pickup_datetime'][0]

'2009-06-15 17:26:21 UTC'

In [None]:
df_concat.loc['2009-01-01 00:00:00 UTC':'2010-01-01 00:00:00 UTC'].size