# Preparación de datos
## Importar bibliotecas

In [2]:
import os
import polars as pl
import polars.selectors as cs
import pandas as pd
from datetime import date, time
import numpy as np
import pyarrow.parquet as pq

In [None]:
# from os import listdir
# from os.path import isfile, join
#fhvhv_tripdata_2022-01.parquet

In [3]:


def cargar_archivos_parquet(directorio, archivos_a_considerar):
    """Read Parquet files into Polars DataFrames keyed by their month string.

    Args:
        directorio: Directory that contains the Parquet files.
        archivos_a_considerar: Iterable of file names. The month is taken from
            the name by splitting on '_', '-' and '.', so a name such as
            ``fhvhv_tripdata_2023-01.parquet`` yields the key ``'01'``.

    Returns:
        dict mapping each month string to the DataFrame read from that file,
        with every row that contains a null removed. Two files that produce
        the same month string overwrite each other (the last one wins).
    """
    por_mes = {}
    for nombre_archivo in archivos_a_considerar:
        # Read the file and discard incomplete rows in one chain.
        tabla = pl.read_parquet(os.path.join(directorio, nombre_archivo)).drop_nulls()

        # The third '_'-separated field carries the period (e.g. "2023-01.parquet");
        # keep only what sits between the '-' and the first '.'.
        periodo = nombre_archivo.split('_')[2]
        mes = periodo.split('-')[1].split('.')[0]

        por_mes[mes] = tabla
    return por_mes

# Directory that holds the raw Parquet files. Built with os.path.join so the
# separator is right on every operating system (the previous literal used
# Windows-only backslashes, while the rest of the notebook uses '/' paths).
directorio = os.path.join("..", "datasets", "raw")

# Number of monthly files to load, counting from January.
num_archivos = 2

# File names for months 1..num_archivos of 2023, zero-padded to two digits.
archivos_por_mes = [f"fhvhv_tripdata_2023-{i:02d}.parquet" for i in range(1, num_archivos + 1)]

# Load only the requested files. The resulting dict holds one DataFrame per
# month, each with the rows containing nulls already removed.
diccionario_dataframes = cargar_archivos_parquet(directorio, archivos_por_mes)



In [4]:
# Print row count, column count and schema for every monthly DataFrame.
for nombre, df in diccionario_dataframes.items():
    print(f"Información del DataFrame '{nombre}':")
    print("Cantidad de registros :", df.height, "Cantidad de las Columnas :", df.width)
    # One call, newline-separated: the label, the schema and a trailing "\n".
    print("Schema", df.schema, "\n", sep="\n")

Información del DataFrame '01':
Cantidad de registros : 13587039 Cantidad de las Columnas : 24
Schema
OrderedDict([('hvfhs_license_num', String), ('dispatching_base_num', String), ('originating_base_num', String), ('request_datetime', Datetime(time_unit='ns', time_zone=None)), ('on_scene_datetime', Datetime(time_unit='ns', time_zone=None)), ('pickup_datetime', Datetime(time_unit='ns', time_zone=None)), ('dropoff_datetime', Datetime(time_unit='ns', time_zone=None)), ('PULocationID', Int64), ('DOLocationID', Int64), ('trip_miles', Float64), ('trip_time', Int64), ('base_passenger_fare', Float64), ('tolls', Float64), ('bcf', Float64), ('sales_tax', Float64), ('congestion_surcharge', Float64), ('airport_fee', Float64), ('tips', Float64), ('driver_pay', Float64), ('shared_request_flag', String), ('shared_match_flag', String), ('access_a_ride_flag', String), ('wav_request_flag', String), ('wav_match_flag', String)])


Información del DataFrame '02':
Cantidad de registros : 13287430 Cantidad d

In [11]:
import pandas as pd

# Column names shared by both monthly schemas, in file order.
columnas_esquema = [
    'hvfhs_license_num', 'dispatching_base_num', 'originating_base_num',
    'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime',
    'PULocationID', 'DOLocationID', 'trip_miles', 'trip_time',
    'base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
    'airport_fee', 'tips', 'driver_pay',
    'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag',
    'wav_request_flag', 'wav_match_flag',
]

# Types transcribed from the printed schema of DataFrame '01'.
tipos_01 = (
    ['String'] * 3
    + ['Datetime'] * 4
    + ['Int64', 'Int64', 'Float64', 'Int64']
    + ['Float64'] * 8
    + ['String'] * 5
)

# DataFrame '02' is transcribed with Int32 for the two location-ID columns
# and the same types everywhere else.
columnas_int32 = ('PULocationID', 'DOLocationID')
tipos_02 = [
    'Int32' if columna in columnas_int32 else tipo
    for columna, tipo in zip(columnas_esquema, tipos_01)
]

esquema_01 = pd.DataFrame({'columna': columnas_esquema, 'tipo': tipos_01})
esquema_02 = pd.DataFrame({'columna': list(columnas_esquema), 'tipo': tipos_02})

# Outer-merge on (columna, tipo): rows present in only one schema are the
# columns whose type differs between the two months.
combinado = esquema_01.merge(esquema_02, indicator=True, how='outer')
diferencias = combinado[combinado['_merge'] != 'both']

print("Las columnas con diferencias son:")
print(diferencias)



Las columnas con diferencias son:
        columna   tipo      _merge
0  DOLocationID  Int32  right_only
1  DOLocationID  Int64   left_only
2  PULocationID  Int32  right_only
3  PULocationID  Int64   left_only


In [5]:
# Target schema (column order and dtypes) that every monthly DataFrame must
# share before the frames can be concatenated vertically.
nuevo_esquema = {
    "hvfhs_license_num": pl.String,
    "dispatching_base_num": pl.String,
    "originating_base_num": pl.String,
    "request_datetime": pl.Datetime(time_unit='ns', time_zone=None),
    "on_scene_datetime": pl.Datetime(time_unit='ns', time_zone=None),
    "pickup_datetime": pl.Datetime(time_unit='ns', time_zone=None),
    "dropoff_datetime": pl.Datetime(time_unit='ns', time_zone=None),
    "PULocationID": pl.Int32,
    "DOLocationID": pl.Int32,
    "trip_miles": pl.Float64,
    "trip_time": pl.Int64,
    "base_passenger_fare": pl.Float64,
    "tolls": pl.Float64,
    "bcf": pl.Float64,
    "sales_tax": pl.Float64,
    "congestion_surcharge": pl.Float64,
    "airport_fee": pl.Float64,
    "tips": pl.Float64,
    "driver_pay": pl.Float64,
    "shared_request_flag": pl.String,
    "shared_match_flag": pl.String,
    "access_a_ride_flag": pl.String,
    "wav_request_flag": pl.String,
    "wav_match_flag": pl.String,
}

# Cast EVERY monthly DataFrame to the target schema, not only the first one:
# any month whose dtypes differ would otherwise make the later concatenation
# fail. Iterating over a snapshot of the items keeps the in-place replacement
# of the dict values safe.
for clave, df_mes in list(diccionario_dataframes.items()):
    diccionario_dataframes[clave] = df_mes.select(
        [pl.col(col).cast(dtype) for col, dtype in nuevo_esquema.items()]
    )

# Key of the first DataFrame; `df` keeps pointing at its cast version, as before.
nombre_clave = next(iter(diccionario_dataframes))
df = diccionario_dataframes[nombre_clave]

# Guard: all months must now expose exactly the same schema.
esquema_referencia = df.schema
assert all(
    df_mes.schema == esquema_referencia for df_mes in diccionario_dataframes.values()
), "Los DataFrames mensuales no comparten el mismo esquema"

# Show the resulting schema.
print(diccionario_dataframes[nombre_clave].schema)

OrderedDict([('hvfhs_license_num', String), ('dispatching_base_num', String), ('originating_base_num', String), ('request_datetime', Datetime(time_unit='ns', time_zone=None)), ('on_scene_datetime', Datetime(time_unit='ns', time_zone=None)), ('pickup_datetime', Datetime(time_unit='ns', time_zone=None)), ('dropoff_datetime', Datetime(time_unit='ns', time_zone=None)), ('PULocationID', Int32), ('DOLocationID', Int32), ('trip_miles', Float64), ('trip_time', Int64), ('base_passenger_fare', Float64), ('tolls', Float64), ('bcf', Float64), ('sales_tax', Float64), ('congestion_surcharge', Float64), ('airport_fee', Float64), ('tips', Float64), ('driver_pay', Float64), ('shared_request_flag', String), ('shared_match_flag', String), ('access_a_ride_flag', String), ('wav_request_flag', String), ('wav_match_flag', String)])


In [5]:
# Iterar sobre el diccionario de dataframes
# for nombre, df in diccionario_dataframes.items():
#     print(f"Información del DataFrame '{nombre}':")
#     print("Cantidad de registros :", len(df), "Cantidad de las Columnas :", len(df.columns))
#     print("Schema")
#     print(df.schema)
#     print("\n")

In [6]:
# Collect the monthly DataFrames in dict order.
dataframes = [*diccionario_dataframes.values()]

# Stack them row-wise into a single DataFrame ("vertical" is pl.concat's default).
df_concatenado = pl.concat(dataframes, how="vertical")


In [7]:
df_concatenado.shape

(26874469, 24)

In [8]:
df_concatenado.schema

OrderedDict([('hvfhs_license_num', String),
             ('dispatching_base_num', String),
             ('originating_base_num', String),
             ('request_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('on_scene_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('pickup_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('dropoff_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('PULocationID', Int32),
             ('DOLocationID', Int32),
             ('trip_miles', Float64),
             ('trip_time', Int64),
             ('base_passenger_fare', Float64),
             ('tolls', Float64),
             ('bcf', Float64),
             ('sales_tax', Float64),
             ('congestion_surcharge', Float64),
             ('airport_fee', Float64),
             ('tips', Float64),
             ('driver_pay', Float64),
             ('shared_request_flag', String),
             ('shared_match_flag', String),
          

In [9]:
df_concatenado.head()

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:18:06,2023-01-01 00:19:24,2023-01-01 00:19:38,2023-01-01 00:48:07,48,68,0.94,1709,25.95,0.0,0.78,2.3,2.75,0.0,5.22,27.83,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:48:42,2023-01-01 00:56:20,2023-01-01 00:58:39,2023-01-01 01:33:08,246,163,2.78,2069,60.14,0.0,1.8,5.34,2.75,0.0,0.0,50.15,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:15:35,2023-01-01 00:20:14,2023-01-01 00:20:27,2023-01-01 00:37:54,9,129,8.81,1047,24.37,0.0,0.73,2.16,0.0,0.0,0.0,20.22,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:35:24,2023-01-01 00:39:30,2023-01-01 00:41:05,2023-01-01 00:48:16,129,129,0.67,431,13.8,0.0,0.41,1.22,0.0,0.0,0.0,7.9,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:43:15,2023-01-01 00:51:10,2023-01-01 00:52:47,2023-01-01 01:04:51,129,92,4.38,724,20.49,0.0,0.61,1.82,0.0,0.0,0.0,16.48,"""N""","""N""",""" ""","""N""","""N"""


# tpep_pickup_datetime | tpep_dropoff_datetime: fechas a segundos y fechas únicamente
# trip_distance: tratar las filas con distancia cero
# RatecodeID: 6 valores admitidos; (99) valores faltantes
# payment_type: 1 y 2

In [10]:
df_concatenado.shape

(26874469, 24)

In [13]:
# Bind df_nyc to a clone of the concatenated frame, so later cells can rebind
# df_nyc while df_concatenado keeps referring to the original data.
df_nyc = df_concatenado.clone()

In [17]:
df_nyc.schema

OrderedDict([('hvfhs_license_num', String),
             ('dispatching_base_num', String),
             ('originating_base_num', String),
             ('request_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('on_scene_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('pickup_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('dropoff_datetime', Datetime(time_unit='ns', time_zone=None)),
             ('PULocationID', Int32),
             ('DOLocationID', Int32),
             ('trip_miles', Float64),
             ('trip_time', Int64),
             ('base_passenger_fare', Float64),
             ('tolls', Float64),
             ('bcf', Float64),
             ('sales_tax', Float64),
             ('congestion_surcharge', Float64),
             ('airport_fee', Float64),
             ('tips', Float64),
             ('driver_pay', Float64),
             ('shared_request_flag', String),
             ('shared_match_flag', String),
          

## General Preprocessing
### Preprocesamiento Variables `Temporales` 

In [14]:
df_nyc.head()

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:18:06,2023-01-01 00:19:24,2023-01-01 00:19:38,2023-01-01 00:48:07,48,68,0.94,1709,25.95,0.0,0.78,2.3,2.75,0.0,5.22,27.83,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:48:42,2023-01-01 00:56:20,2023-01-01 00:58:39,2023-01-01 01:33:08,246,163,2.78,2069,60.14,0.0,1.8,5.34,2.75,0.0,0.0,50.15,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:15:35,2023-01-01 00:20:14,2023-01-01 00:20:27,2023-01-01 00:37:54,9,129,8.81,1047,24.37,0.0,0.73,2.16,0.0,0.0,0.0,20.22,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:35:24,2023-01-01 00:39:30,2023-01-01 00:41:05,2023-01-01 00:48:16,129,129,0.67,431,13.8,0.0,0.41,1.22,0.0,0.0,0.0,7.9,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B03404""","""B03404""",2023-01-01 00:43:15,2023-01-01 00:51:10,2023-01-01 00:52:47,2023-01-01 01:04:51,129,92,4.38,724,20.49,0.0,0.61,1.82,0.0,0.0,0.0,16.48,"""N""","""N""",""" ""","""N""","""N"""


In [15]:
from datetime import datetime

In [25]:
# Temporal features derived from the pickup/dropoff timestamps:
#   - tpep_pickup_date / tpep_dropoff_date: the date part of each timestamp
#   - viaje_segundos: dropoff minus pickup, expressed in seconds
# All three expressions read only the original columns, so they share one
# with_columns call; the raw timestamp columns are dropped at the end.
df_nyc = (
    df_concatenado
    .with_columns(
        pl.col('pickup_datetime').dt.date().alias('tpep_pickup_date'),
        pl.col('dropoff_datetime').dt.date().alias('tpep_dropoff_date'),
        (pl.col('dropoff_datetime') - pl.col('pickup_datetime'))
        .dt.total_seconds()
        .alias('viaje_segundos'),
    )
    .drop(['pickup_datetime', 'dropoff_datetime'])
)


In [26]:
# Temporal features derived from the request/on-scene timestamps:
#   - trequest_datetime / tscene_datetime: the date part of each timestamp
#     (despite the "_datetime" suffix these columns hold dates)
#   - espera_segundos: on-scene minus request, expressed in seconds
# NOTE: this cell reads and rebinds df_nyc and drops its own input columns,
# so running it a second time without rebuilding df_nyc will fail.
df_nyc = (
    df_nyc
    .with_columns(
        pl.col('request_datetime').dt.date().alias('trequest_datetime'),
        pl.col('on_scene_datetime').dt.date().alias('tscene_datetime'),
        (pl.col('on_scene_datetime') - pl.col('request_datetime'))
        .dt.total_seconds()
        .alias('espera_segundos'),
    )
    .drop(['request_datetime', 'on_scene_datetime'])
)

In [27]:
df_nyc.head()

hvfhs_license_num,dispatching_base_num,originating_base_num,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,tpep_pickup_date,tpep_dropoff_date,viaje_segundos,trequest_datetime,tscene_datetime,espera_segundos
str,str,str,i32,i32,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,date,date,i64,date,date,i64
"""HV0003""","""B03404""","""B03404""",48,68,0.94,1709,25.95,0.0,0.78,2.3,2.75,0.0,5.22,27.83,"""N""","""N""",""" ""","""N""","""N""",2023-01-01,2023-01-01,1709,2023-01-01,2023-01-01,78
"""HV0003""","""B03404""","""B03404""",246,163,2.78,2069,60.14,0.0,1.8,5.34,2.75,0.0,0.0,50.15,"""N""","""N""",""" ""","""N""","""N""",2023-01-01,2023-01-01,2069,2023-01-01,2023-01-01,458
"""HV0003""","""B03404""","""B03404""",9,129,8.81,1047,24.37,0.0,0.73,2.16,0.0,0.0,0.0,20.22,"""N""","""N""",""" ""","""N""","""N""",2023-01-01,2023-01-01,1047,2023-01-01,2023-01-01,279
"""HV0003""","""B03404""","""B03404""",129,129,0.67,431,13.8,0.0,0.41,1.22,0.0,0.0,0.0,7.9,"""N""","""N""",""" ""","""N""","""N""",2023-01-01,2023-01-01,431,2023-01-01,2023-01-01,246
"""HV0003""","""B03404""","""B03404""",129,92,4.38,724,20.49,0.0,0.61,1.82,0.0,0.0,0.0,16.48,"""N""","""N""",""" ""","""N""","""N""",2023-01-01,2023-01-01,724,2023-01-01,2023-01-01,475


In [28]:
df_nyc.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'originating_base_num',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'shared_request_flag',
 'shared_match_flag',
 'access_a_ride_flag',
 'wav_request_flag',
 'wav_match_flag',
 'tpep_pickup_date',
 'tpep_dropoff_date',
 'viaje_segundos',
 'trequest_datetime',
 'tscene_datetime',
 'espera_segundos']

In [29]:
df_nyc.shape

(26874469, 26)

In [30]:
# Drop the two base-number columns and the access_a_ride / WAV flag columns
# (none of these are date/time columns).
df_nyc = df_nyc.drop([
    'dispatching_base_num',
    'originating_base_num',
    'access_a_ride_flag',
    'wav_request_flag',
    'wav_match_flag',
])

In [31]:
df_nyc.shape

(26874469, 21)

In [32]:
# Convert the Polars DataFrame to a PyArrow table.
arrow_table = df_nyc.to_arrow()

# Persist the table with the Parquet writer (zstd-compressed). Note that the
# file carries an ".arrow" extension even though pq.write_table produces Parquet.
pq.write_table(
    arrow_table,
    where='../datasets/processed/df_ffvh.arrow',
    compression='zstd',
)

In [33]:
#df_nyc.describe()
df_nyc.schema

OrderedDict([('hvfhs_license_num', String),
             ('PULocationID', Int32),
             ('DOLocationID', Int32),
             ('trip_miles', Float64),
             ('trip_time', Int64),
             ('base_passenger_fare', Float64),
             ('tolls', Float64),
             ('bcf', Float64),
             ('sales_tax', Float64),
             ('congestion_surcharge', Float64),
             ('airport_fee', Float64),
             ('tips', Float64),
             ('driver_pay', Float64),
             ('shared_request_flag', String),
             ('shared_match_flag', String),
             ('tpep_pickup_date', Date),
             ('tpep_dropoff_date', Date),
             ('viaje_segundos', Int64),
             ('trequest_datetime', Date),
             ('tscene_datetime', Date),
             ('espera_segundos', Int64)])

In [16]:
# Summary statistics of the trip-duration column 'viaje_segundos'.
duracion = df_nyc['viaje_segundos']

min_seconds = duracion.min()
max_seconds = duracion.max()
mean_seconds = duracion.mean()
median_seconds = duracion.median()
std_seconds = duracion.std()

# Report them in a fixed order: extremes first, then centre and spread.
for etiqueta, valor in (
    ("Minimo:", min_seconds),
    ("Maximo:", max_seconds),
    ("Media:", mean_seconds),
    ("Mediana:", median_seconds),
    ("Desviación estándar:", std_seconds),
):
    print(etiqueta, valor)


Minimo: -1694897908
Maximo: 601751
Media: 997.5984043618433
Mediana: 752.0
Desviación estándar: 278647.64816650894


In [17]:
# Rows whose computed trip duration is negative (dropoff before pickup).
negativos = df_nyc.filter(pl.col('viaje_segundos').lt(0.0))

# How many such rows there are.
cantidad_negativos = len(negativos)
print("Cantidad de registros con valores negativos en 'viaje_segundos':", cantidad_negativos)



Cantidad de registros con valores negativos en 'viaje_segundos': 819


In [18]:
# Keep only the rows whose trip duration is zero or positive; df_nyc itself is
# left unchanged.
# NOTE(review): the name df_yellow looks inherited from another notebook —
# the data loaded here comes from fhvhv_tripdata files; confirm before renaming.
df_yellow = df_nyc.filter(pl.col('viaje_segundos').ge(0))

: 

In [5]:
# Load the processed temperature file through the Parquet reader
# (the file is named ".arrow" but is read with pq.read_table).
arrow_table = pq.read_table(source='../datasets/processed/temperature.arrow')

# Hand the PyArrow table over to Polars.
df_polars = pl.from_arrow(arrow_table)

In [6]:
df_polars 

event_date,temperature_2m
"datetime[ns, UTC]",f32
2022-01-01 00:00:00 UTC,8.346
2022-01-01 01:00:00 UTC,9.146
2022-01-01 02:00:00 UTC,7.996
2022-01-01 03:00:00 UTC,8.046
2022-01-01 04:00:00 UTC,7.646
…,…
2023-12-31 19:00:00 UTC,6.396
2023-12-31 20:00:00 UTC,6.296
2023-12-31 21:00:00 UTC,5.796
2023-12-31 22:00:00 UTC,4.396


In [8]:
import pandas as pd
import polars as pl
import pyarrow.parquet as pq
import pyarrow.dataset as ds

# Small example frame for the write/read round trip.
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

# pandas -> Polars -> PyArrow table.
df_polars = pl.from_pandas(df)
arrow_table = df_polars.to_arrow()

# pq.write_table produces a Parquet file, whatever the file extension says.
pq.write_table(arrow_table, 'data.arrow')

# Open the file with the format it was actually written in: declaring it as
# "ipc" would make PyArrow try to parse Parquet bytes as an Arrow IPC file.
dset = ds.dataset("data.arrow", format="parquet")

# scan_pyarrow_dataset returns a LazyFrame, which must be collected into a
# DataFrame before it can be converted to pandas for display.
df_loaded = (
    pl.scan_pyarrow_dataset(dset)
    .collect()
    .to_pandas()
)

# Show the loaded frame.
print(df_loaded)



   A  B
0  1  4
1  2  5
2  3  6


In [19]:
import polars as pl
import pyarrow.parquet as pq

# Same load as the earlier temperature cell: read the file with the Parquet
# reader, then convert the PyArrow table into a Polars DataFrame.
arrow_table = pq.read_table(source='../datasets/processed/temperature.arrow')
df_polars = pl.from_arrow(arrow_table)




In [28]:
df_polars

event_date,temperature_2m
"datetime[ns, UTC]",f32
2022-01-01 00:00:00 UTC,8.346
2022-01-01 01:00:00 UTC,9.146
2022-01-01 02:00:00 UTC,7.996
2022-01-01 03:00:00 UTC,8.046
2022-01-01 04:00:00 UTC,7.646
…,…
2023-12-31 19:00:00 UTC,6.396
2023-12-31 20:00:00 UTC,6.296
2023-12-31 21:00:00 UTC,5.796
2023-12-31 22:00:00 UTC,4.396


In [38]:
# Register the temperature frame under the table name "df_polars".
sql = pl.SQLContext(frames={"df_polars": df_polars})

# Rows whose temperature is below 10.
consulta = "SELECT * FROM df_polars WHERE temperature_2m < 10"
result = sql.execute(consulta)

# Materialise the query result and convert it to pandas for display.
result_df = result.collect().to_pandas()

print(result_df)


                    event_date  temperature_2m
0    2022-01-01 00:00:00+00:00           8.346
1    2022-01-01 01:00:00+00:00           9.146
2    2022-01-01 02:00:00+00:00           7.996
3    2022-01-01 03:00:00+00:00           8.046
4    2022-01-01 04:00:00+00:00           7.646
...                        ...             ...
7169 2023-12-31 19:00:00+00:00           6.396
7170 2023-12-31 20:00:00+00:00           6.296
7171 2023-12-31 21:00:00+00:00           5.796
7172 2023-12-31 22:00:00+00:00           4.396
7173 2023-12-31 23:00:00+00:00           3.446

[7174 rows x 2 columns]


In [25]:
type(result_df)

pandas.core.frame.DataFrame