In [93]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import math

# Importación del CSV

In [94]:
datapath = 'data'
df = pd.read_csv(os.path.join(datapath, 'uber_fares.csv'))
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...
199994,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199995,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199996,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199997,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [95]:
#df.dtypes

# Preprocesamiento de los datos

In [96]:
# Sustituimos las fechas por timestamps
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC').timestamp())

### Calculo de la distancia mediante el semiverseno
El semiverseno es la fómula que calcula la distancia en la sueperfície de una esfera dadas las coordenadas de los puntos en la misma.

Fórmula del semiverseno utilizada: 

$$ semiversin\left({d\over R}\right) = semiversin(φ_1 - φ_2) + cos(φ_1) cos(φ_2) semiversin(\small \Delta λ) $$

Donde:
* d es la distancia entre dos los dos puntos
* R es el radio de la esfera
* $φ_1$ es la la latitud del punto 1
* $φ_2$ es la la latitud del punto 2
* $\small \Delta λ$ es la la diferencia de longitudes de los puntos

In [97]:
# Función que implementa la distancia entre dos puntos usando el semiverseno, devuelve la distancia en metros
def coordsToDistance(long1: float, lat1: float, long2: float, lat2: float):
    # Radio de la tierra aprox
    R = 6367000
    # Grados a radianes
    degToRad = math.pi / 180
    distLong = (long2 - long1) * degToRad
    distLat = (lat1 - lat2) * degToRad
    a = (math.sin(distLat / 2) ** 2) + (math.cos(lat1) * math.cos(lat2) * (math.sin(distLong / 2) ** 2))
    if a < 0:
        return -1
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d


In [98]:
# Reemplazamos las columnas de las coordenaas con la distancia
coords:np.ndarray = df.iloc[:, 2:6].to_numpy()
distances = np.apply_along_axis(lambda x: coordsToDistance(x[0], x[1], x[2], x[3]), 1, coords)
df.insert(2, 'distance', pd.Series(distances))

In [99]:
df

Unnamed: 0,fare_amount,pickup_datetime,distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,1.431021e+09,1682.407283,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,1.247854e+09,2456.178545,-73.994355,40.728225,-73.994710,40.750325,1
2,12.9,1.251143e+09,5887.916484,-74.005043,40.740770,-73.962565,40.772647,1
3,5.3,1.245997e+09,1835.958149,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,1.409241e+09,5651.207722,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199994,3.0,1.351418e+09,118.100350,-73.987042,40.739367,-73.986525,40.740297,1
199995,7.5,1.394756e+09,2446.027010,-73.984722,40.736837,-74.006672,40.739620,1
199996,30.9,1.246229e+09,15713.095137,-73.986017,40.756487,-73.858957,40.692588,2
199997,14.5,1.432127e+09,3672.768628,-73.997124,40.725452,-73.983215,40.695415,1


In [100]:
datetime.strptime('2015-05-07 19:52:06 UTC', '%Y-%m-%d %H:%M:%S UTC').timestamp()

1431021126.0