In [14]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Configurar el clúster de Dask
cluster = LocalCluster(n_workers=2, threads_per_worker=1, memory_limit='12GB')
client = Client(cluster)
def procesamiento_avanzado_particion(df):
    # Filtrar y eliminar columnas no deseadas
    columnas_a_eliminar = ['originating_base_num', 'access_a_ride_flag', 'wav_request_flag', 
                           'wav_match_flag', 'shared_request_flag', 'shared_match_flag',
                           'access_a_ride_flag']
    df = df.drop(columnas_a_eliminar, axis=1)
    
    # Reemplazar valores nulos por 0.00 en columnas relevantes
    relevant_columns = ['base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
                        'airport_fee', 'tips', 'driver_pay']
    df = df.fillna({col: 0.00 for col in relevant_columns})

    # Calcular la columna 'total_amount' sumando las columnas relevantes
    df['total_amount'] = df[relevant_columns].sum(axis=1)

    # Eliminar columnas no deseadas
    columnas_a_eliminar = ['tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 
                           'airport_fee', 'tips', 'driver_pay']
    df = df.drop(columnas_a_eliminar, axis=1)
    
    # Convertir columnas de fechas a tipo DateTime en Dask
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col] = dd.to_datetime(df[col])

    # Agregar columnas para fechas, horas, minutos y segundos
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col + '_fecha'] = df[col].dt.strftime('%Y-%m-%d')
        df[col + '_hora'] = df[col].dt.strftime('%H')  # Formatear como HH:MM
        
    # Calcular la duración del viaje y de espera en segundos
    df['TiempoRecogida'] = (df['on_scene_datetime'] - df['request_datetime']).dt.total_seconds()

    # Eliminar filas con duraciones negativas o nulas
    df = df[df['TiempoRecogida'] >= 0]
    
    # Eliminar columnas originales
    df = df.drop(columns=['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime'])

    # Renombrar las columnas del DataFrame de Dask
    new_column_names = {
        "hvfhs_license_num": "IdProveedor",
        "dispatching_base_num": "IdProveedor",
        "PULocationID": "IdZonaOrigen",
        "DOLocationID": "IdZonaDestino",
        "trip_miles": "DistanciaViaje",
        "base_passenger_fare": "TarifaPasajero",
        "trip_time": "DuracionViaje",
        "pickup_datetime_fecha": "FechaRecogida",
        "pickup_datetime_hora": "HoraRecogida",
        "dropoff_datetime_fecha": "FechaLlegada",
        "dropoff_datetime_hora": "HoraLlegada",
        "request_datetime_fecha": "FechaSolicitada",
        "request_datetime_hora": "HoraSolicitada",
        "on_scene_datetime_fecha": "FechaAtendida",
        "on_scene_datetime_hora": "HoraAtendida",
        "total_amount": "CostoTotal"
    }
    df = df.rename(columns=new_column_names)
     
    # Eliminar las columnas que tienen la información unificada de Fecha y Hora
    df.drop(columns=['FechaLlegada'], inplace=True)
    df.drop(columns=['HoraLlegada'], inplace=True)
    df.drop(columns=['FechaSolicitada'], inplace=True)
    df.drop(columns=['HoraSolicitada'], inplace=True)
    df.drop(columns=['FechaAtendida'], inplace=True)
    df.drop(columns=['HoraAtendida'], inplace=True)
    
    return df

# Cargar los datos en un DataFrame distribuido de Dask y particionarlo
directorio = "../datasets/raw/fhvhv_tripdata_2022-01.parquet"
df_dask = dd.read_parquet(directorio, engine='pyarrow')
df_dask_particionado = df_dask.repartition(npartitions=4)

# Aplicar el procesamiento avanzado a cada partición y escribir en CSV
for i, particion in enumerate(df_dask_particionado.to_delayed()):
    df_particion_procesado = procesamiento_avanzado_particion(particion.compute())
    
    # Crear un archivo CSV a partir de cada partición
    ruta_salida = f'../datasets/processed/ffvh_analytics/ffvh_analytics_part_{i+1}.csv'
    df_particion_procesado.to_csv(ruta_salida, index=False)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 57657 instead
  next(self.gen)
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may caus

In [3]:
import pandas as pd

In [15]:
df_fhvhv = pd.read_csv('../datasets/processed/ffvh_analytics/ffvh_analytics_part_1.csv')

In [16]:
df_fhvhv.head()

Unnamed: 0,IdProveedor,IdProveedor.1,IdZonaOrigen,IdZonaDestino,DistanciaViaje,DuracionViaje,TarifaPasajero,CostoTotal,FechaRecogida,HoraRecogida,TiempoRecogida
0,HV0003,B03404,170,161,1.18,664,24.9,53.64,2022-01-01,0,9.0
1,HV0003,B03404,237,161,0.82,460,11.97,28.46,2022-01-01,0,161.0
2,HV0003,B03404,237,161,1.18,595,29.82,59.41,2022-01-01,0,824.0
3,HV0003,B03404,262,229,1.65,303,7.91,17.9,2022-01-01,0,92.0
4,HV0003,B03404,229,141,1.65,461,9.44,20.75,2022-01-01,0,16.0


In [6]:
df_yellow = pd.read_csv('../datasets/processed/yellow_analytics/yellow_analytics_part_1.csv')

In [7]:
df_yellow.head()

Unnamed: 0,IdProveedor,TotalPasajeros,DistanciaViaje,IdTipoTarifa,IdZonaOrigen,IdZonaDestino,IdTipoPago,CostoTotal,FechaRecogida,HoraRecogida,DuracionViaje
0,1,2,3.8,1.0,142,236,1,21.95,2022-01-01,0,1069
1,1,1,2.1,1.0,236,42,1,13.3,2022-01-01,0,504
2,2,1,1.09,1.0,114,68,2,11.8,2022-01-01,0,602
3,2,1,4.3,1.0,68,163,1,30.3,2022-01-01,0,2252
4,2,1,5.07,1.0,233,87,1,26.0,2022-01-01,0,848
