In [6]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Configurar el clúster de Dask
cluster = LocalCluster(n_workers=2, threads_per_worker=1, memory_limit='12GB')
client = Client(cluster)
def procesamiento_avanzado_particion(df):
    # Filtrar y eliminar columnas no deseadas
    columnas_a_eliminar = ['originating_base_num', 'access_a_ride_flag', 'wav_request_flag', 
                           'wav_match_flag', 'shared_request_flag', 'shared_match_flag',
                           'access_a_ride_flag']
    df = df.drop(columnas_a_eliminar, axis=1)
    
    # Reemplazar valores nulos por 0.00 en columnas relevantes
    relevant_columns = ['base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
                        'airport_fee', 'tips', 'driver_pay']
    df = df.fillna({col: 0.00 for col in relevant_columns})

    # Calcular la columna 'total_amount' sumando las columnas relevantes
    df['total_amount'] = df[relevant_columns].sum(axis=1)

    # Eliminar columnas no deseadas
    columnas_a_eliminar = ['tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 
                           'airport_fee', 'tips', 'driver_pay']
    df = df.drop(columnas_a_eliminar, axis=1)
    
    # Convertir columnas de fechas a tipo DateTime en Dask
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col] = dd.to_datetime(df[col])

    # Agregar columnas para fechas, horas, minutos y segundos
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col + '_fecha'] = df[col].dt.strftime('%Y-%m-%d')
        df[col + '_hora_minuto'] = df[col].dt.strftime('%H:%M')  # Formatear como HH:MM
        
    # Calcular la duración del viaje y de espera en segundos
    df['DuracionAtencion'] = (df['on_scene_datetime'] - df['request_datetime']).dt.total_seconds()

    # Eliminar filas con duraciones negativas o nulas
    df = df[df['DuracionAtencion'] >= 0]
    
    # Eliminar columnas originales
    df = df.drop(columns=['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime'])

    # Renombrar las columnas del DataFrame de Dask
    new_column_names = {
        "hvfhs_license_num": "IdProveedor",
        "dispatching_base_num": "IdProveedor",
        "PULocationID": "IdZonaOrigen",
        "DOLocationID": "IdZonaDestino",
        "trip_miles": "DistanciaViaje",
        "base_passenger_fare": "TarifaPasajero",
        "pickup_datetime_fecha": "FechaRecogida",
        "pickup_datetime_hora_minuto": "HoraRecogida",
        "dropoff_datetime_fecha": "FechaLlegada",
        "dropoff_datetime_hora_minuto": "HoraLlegada",
        "request_datetime_fecha": "FechaSolicitada",
        "request_datetime_hora_minuto": "HoraSolicitada",
        "on_scene_datetime_fecha": "FechaAtendida",
        "on_scene_datetime_hora_minuto": "HoraAtendida",
        "total_amount": "CostoTotal"
    }
    df = df.rename(columns=new_column_names)

    return df

# Cargar los datos en un DataFrame distribuido de Dask y particionarlo
directorio = "../datasets/raw/fhvhv_tripdata_2023-01.parquet"
df_dask = dd.read_parquet(directorio, engine='pyarrow')
df_dask_particionado = df_dask.repartition(npartitions=4)

# Aplicar el procesamiento avanzado a cada partición y escribir en CSV
for i, particion in enumerate(df_dask_particionado.to_delayed()):
    df_particion_procesado = procesamiento_avanzado_particion(particion.compute())
    
    # Crear un archivo CSV a partir de cada partición
    ruta_salida = f'../datasets/processed/ffvh_analytics/ffvh_analytics_part_{i+1}.csv'
    df_particion_procesado.to_csv(ruta_salida, index=False)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 53468 instead
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.




In [None]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Configurar el clúster de Dask
cluster = LocalCluster(n_workers=2, threads_per_worker=1, memory_limit='12GB')
client = Client(cluster)

# Definir una función de procesamiento avanzado por partición
def procesamiento_avanzado_particion(df):
    # Filtrar y eliminar columnas no deseadas
    columnas_a_eliminar = ['originating_base_num', 'access_a_ride_flag', 'wav_request_flag', 
                           'wav_match_flag', 'shared_request_flag', 'shared_match_flag',
                           'access_a_ride_flag']
    df = df.drop(columnas_a_eliminar, axis=1)

    # Convertir columnas de fechas a tipo DateTime en Dask
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col] = dd.to_datetime(df[col])

    # Agregar columnas para fechas, horas, minutos y segundos
    for col in ['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime']:
        df[col + '_fecha'] = df[col].dt.date
        df[col + '_hora_minuto'] = df[col].dt.strftime('%H:%M')  # Formatear como HH:MM
    # Calcular la duración del viaje y de espera en segundos
    df['duracion_viaje'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds()
    df['duracion_atencion'] = (df['on_scene_datetime'] - df['request_datetime']).dt.total_seconds()

    # Eliminar filas con duraciones negativas o nulas
    df = df[df['duracion_viaje'] > 0]
    df = df[df['duracion_atencion'] >= 0]
    # Eliminar columnas originales
    df = df.drop(columns=['pickup_datetime', 'dropoff_datetime', 'request_datetime', 'on_scene_datetime'])
    # Reemplazar valores nulos por 0.00 en columnas relevantes
    relevant_columns = ['base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
                        'airport_fee', 'tips', 'driver_pay']
    df = df.fillna({col: 0.00 for col in relevant_columns})

    # Calcular la columna 'total_amount' sumando las columnas relevantes
    df['total_amount'] = df[relevant_columns].sum(axis=1)

    # Eliminar columnas no deseadas
    columnas_a_eliminar = ['tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 
                           'airport_fee', 'tips', 'driver_pay']
    df = df.drop(columnas_a_eliminar, axis=1)

    return df

# Cargar los datos en un DataFrame distribuido de Dask y particionarlo
directorio = "../datasets/raw/fhvhv_tripdata_2022-01.parquet"
df_dask = dd.read_parquet(directorio, engine='pyarrow')
df_dask_particionado = df_dask.repartition(npartitions=4)

# Aplicar el procesamiento avanzado a cada partición y escribir en parquet
for i, particion in enumerate(df_dask_particionado.to_delayed()):
    df_particion_procesado = procesamiento_avanzado_particion(particion.compute())
    ruta_salida = f'../datasets/processed/ffvh_analytics/ffvh_analytics_part_{i}.parquet'
    df_particion_procesado.to_parquet(ruta_salida, engine='pyarrow')

In [12]:
import pandas as pd

In [13]:
df_fhvhv = pd.read_csv('../datasets/processed/ffvh_analytics/ffvh_analytics_part_1.csv')

In [14]:
df_fhvhv.head()

Unnamed: 0,IdProveedor,IdProveedor.1,IdZonaOrigen,IdZonaDestino,DistanciaViaje,trip_time,TarifaPasajero,CostoTotal,FechaRecogida,HoraRecogida,FechaLlegada,HoraLlegada,FechaSolicitada,HoraSolicitada,FechaAtendida,HoraAtendida,DuracionAtencion
0,HV0003,B03404,170,161,1.18,664,24.9,53.64,2022-01-01,00:07,2022-01-01,00:18,2022-01-01,00:05,2022-01-01,00:05,9.0
1,HV0003,B03404,237,161,0.82,460,11.97,28.46,2022-01-01,00:22,2022-01-01,00:30,2022-01-01,00:19,2022-01-01,00:22,161.0
2,HV0003,B03404,237,161,1.18,595,29.82,59.41,2022-01-01,00:57,2022-01-01,01:07,2022-01-01,00:43,2022-01-01,00:57,824.0
3,HV0003,B03404,262,229,1.65,303,7.91,17.9,2022-01-01,00:18,2022-01-01,00:23,2022-01-01,00:15,2022-01-01,00:17,92.0
4,HV0003,B03404,229,141,1.65,461,9.44,20.75,2022-01-01,00:28,2022-01-01,00:35,2022-01-01,00:25,2022-01-01,00:26,16.0


In [3]:
df_yellow = pd.read_csv('../datasets/processed/yellow_analytics/yellow_analytics_part_8.csv')

In [4]:
df_yellow.head()

Unnamed: 0,IdProveedor,TotalPasajeros,DistanciaViaje,IdTipoTarifa,IdZonaOrigen,IdZonaDestino,IdTipoPago,CostoTotal,FechaRecogida,HoraRecogida,DuracionViaje
0,2,1,3.21,1.0,249,50,1,20.16,2022-02-23,8,876
1,2,1,1.48,1.0,50,161,1,20.16,2022-02-23,8,1234
2,2,1,1.04,1.0,162,161,1,13.3,2022-02-23,8,671
3,2,1,2.72,1.0,170,237,1,21.96,2022-02-23,8,1358
4,1,1,1.0,1.0,236,262,1,12.95,2022-02-23,8,585


In [2]:
import pandas as pd