# Importación de Librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Importación del archivo

In [2]:
df = pd.read_parquet('../pressure.parquet')
df

Unnamed: 0,CodSta,Date,Pressure,Sta,Dept,City,Lat,Long
0,36015020,2017-10-03 06:00:00,992.500000,275,CASANARE,PAZ DE ARIPORO,5.816,-71.420
1,21195190,2014-02-14 05:00:00,785.200012,74,CUNDINAMARCA,PASCA,4.310,-74.312
2,21015050,2013-09-27 18:00:00,805.299988,48,HUILA,SAN AGUSTÍN,1.926,-76.428
3,21115010,2005-11-28 10:00:00,958.500000,62,HUILA,VILLAVIEJA,3.234,-75.168
4,28035060,2008-04-08 04:00:00,988.700012,230,CESAR,VALLEDUPAR,10.464,-73.248
...,...,...,...,...,...,...,...,...
22611003,48015050,2024-10-16 23:04:00,1000.500000,290,AMAZONAS,LETICIA,-4.194,-69.941
22611004,48015050,2024-10-16 09:08:00,1004.099976,290,AMAZONAS,LETICIA,-4.194,-69.941
22611005,16015501,2024-10-16 19:02:00,972.900024,37,NORTE DE SANTANDER,CÚCUTA,7.931,-72.510
22611006,48015040,2024-10-16 22:40:00,997.400024,289,AMAZONAS,PUERTO NARIÑO,-3.780,-70.363


In [3]:
df['Date'] = pd.to_datetime(df['Date'],format='%m/%d/%Y %I:%M:%S %p')
df

Unnamed: 0,CodSta,Date,Pressure,Sta,Dept,City,Lat,Long
0,36015020,2017-10-03 06:00:00,992.500000,275,CASANARE,PAZ DE ARIPORO,5.816,-71.420
1,21195190,2014-02-14 05:00:00,785.200012,74,CUNDINAMARCA,PASCA,4.310,-74.312
2,21015050,2013-09-27 18:00:00,805.299988,48,HUILA,SAN AGUSTÍN,1.926,-76.428
3,21115010,2005-11-28 10:00:00,958.500000,62,HUILA,VILLAVIEJA,3.234,-75.168
4,28035060,2008-04-08 04:00:00,988.700012,230,CESAR,VALLEDUPAR,10.464,-73.248
...,...,...,...,...,...,...,...,...
22611003,48015050,2024-10-16 23:04:00,1000.500000,290,AMAZONAS,LETICIA,-4.194,-69.941
22611004,48015050,2024-10-16 09:08:00,1004.099976,290,AMAZONAS,LETICIA,-4.194,-69.941
22611005,16015501,2024-10-16 19:02:00,972.900024,37,NORTE DE SANTANDER,CÚCUTA,7.931,-72.510
22611006,48015040,2024-10-16 22:40:00,997.400024,289,AMAZONAS,PUERTO NARIÑO,-3.780,-70.363


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22611008 entries, 0 to 22611007
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   CodSta    uint64        
 1   Date      datetime64[ns]
 2   Pressure  float32       
 3   Sta       int64         
 4   Dept      category      
 5   City      category      
 6   Lat       float64       
 7   Long      float64       
dtypes: category(2), datetime64[ns](1), float32(1), float64(2), int64(1), uint64(1)
memory usage: 1013.5 MB


In [5]:
df.describe()

Unnamed: 0,CodSta,Date,Pressure,Sta,Lat,Long
count,22611010.0,22611008,22611010.0,22611010.0,22611010.0,22611010.0
mean,212513800.0,2018-11-15 07:34:17.647383552,853.0234,175.6673,5.500518,-74.56782
min,11030010.0,2001-03-15 23:29:00,400.1,0.0,-4.194,-81.701
25%,21205520.0,2016-06-28 04:00:00,741.0,90.0,4.367,-75.74
50%,24055070.0,2019-12-12 08:10:00,850.4,179.0,5.075,-74.274
75%,35075080.0,2022-06-21 22:00:00,995.0,254.0,7.121,-73.472
max,5311500000.0,2024-10-16 23:59:00,1199.7,402.0,15.797,0.0
std,712153000.0,,157.8434,102.1961,3.322486,2.176698


In [6]:
df.isna().sum()

CodSta      0
Date        0
Pressure    0
Sta         0
Dept        0
City        0
Lat         0
Long        0
dtype: int64

In [7]:
df.nunique()

CodSta          403
Date        1035829
Pressure     380266
Sta             403
Dept             33
City            309
Lat             381
Long            386
dtype: int64

In [8]:
df.duplicated().sum()

600026

In [9]:
duplicados = df[df.duplicated()].sort_values(by='Date', ascending=True).reset_index(drop=True)
duplicados

Unnamed: 0,CodSta,Date,Pressure,Sta,Dept,City,Lat,Long
0,15065501,2019-08-28 00:00:00,998.900024,31,LA GUAJIRA,ALBANIA,11.138,-72.616
1,2621500070,2019-08-28 00:00:00,812.500000,376,ANTIOQUIA,ITUANGO,7.175,-75.766
2,23195090,2019-08-28 00:00:00,818.700012,147,SANTANDER,SURATÁ,7.366,-72.988
3,29004520,2019-08-28 00:00:00,1007.599976,232,ATLÁNTICO,BARRANQUILLA,11.006,-74.785
4,2121500048,2019-08-28 00:00:00,880.400024,342,TOLIMA,IBAGUÉ,4.419,-75.206
...,...,...,...,...,...,...,...,...
600021,52055230,2024-10-16 23:58:00,718.099976,306,NARIÑO,ALDANA,0.857,-77.678
600022,26075150,2024-10-16 23:58:00,904.700012,200,VALLE DEL CAUCA,PALMIRA,3.533,-76.382
600023,48015050,2024-10-16 23:58:00,1000.200012,290,AMAZONAS,LETICIA,-4.194,-69.941
600024,16015501,2024-10-16 23:58:00,972.799988,37,NORTE DE SANTANDER,CÚCUTA,7.931,-72.510


No se borraron archivos duplicados, pues como tomaremos datos para el promedio esto ayudaría a no tener outlayers


In [10]:
df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)
df

Unnamed: 0,CodSta,Date,Pressure,Sta,Dept,City,Lat,Long
0,57015010,2001-03-15 23:29:00,1131.900024,323,VALLE DEL CAUCA,BUENAVENTURA,4.096,-81.609
1,57015010,2001-03-24 18:00:00,1131.199951,323,VALLE DEL CAUCA,BUENAVENTURA,4.096,-81.609
2,57015010,2001-03-26 06:16:00,1131.699951,323,VALLE DEL CAUCA,BUENAVENTURA,4.096,-81.609
3,57015010,2001-04-05 13:03:00,1132.099976,323,VALLE DEL CAUCA,BUENAVENTURA,4.096,-81.609
4,57015010,2001-04-10 07:48:00,1131.500000,323,VALLE DEL CAUCA,BUENAVENTURA,4.096,-81.609
...,...,...,...,...,...,...,...,...
22611003,52055230,2024-10-16 23:58:00,718.099976,306,NARIÑO,ALDANA,0.857,-77.678
22611004,48015040,2024-10-16 23:58:00,996.400024,289,AMAZONAS,PUERTO NARIÑO,-3.780,-70.363
22611005,52055230,2024-10-16 23:58:00,718.099976,306,NARIÑO,ALDANA,0.857,-77.678
22611006,48015040,2024-10-16 23:59:00,996.400024,289,AMAZONAS,PUERTO NARIÑO,-3.780,-70.363


In [11]:
df[['Dept', 'City']] = df[['Dept', 'City']].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22611008 entries, 0 to 22611007
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   CodSta    uint64        
 1   Date      datetime64[ns]
 2   Pressure  float32       
 3   Sta       int64         
 4   Dept      object        
 5   City      object        
 6   Lat       float64       
 7   Long      float64       
dtypes: datetime64[ns](1), float32(1), float64(2), int64(1), object(2), uint64(1)
memory usage: 1.3+ GB


In [12]:
result = (df.groupby([df['Date'].dt.year, 'Dept', 'City','CodSta'],observed=True)
          .size()
          .reset_index(name='Observations')
          .query('Observations > 1')
          .sort_values('Date'))
result

Unnamed: 0,Date,Dept,City,CodSta,Observations
0,2001,VALLE DEL CAUCA,BUENAVENTURA,57015010,21
1,2002,VALLE DEL CAUCA,BUENAVENTURA,57015010,37
3,2003,VALLE DEL CAUCA,BUENAVENTURA,57015010,67
4,2004,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",21206940,1011
5,2004,NARIÑO,PUERRES,52055150,4
...,...,...,...,...,...
2873,2024,CHOCÓ,ISTMINA,54010010,3106
2874,2024,CHOCÓ,ISTMINA,54050010,2759
2875,2024,CHOCÓ,LLORÓ,1117500060,180
2877,2024,CHOCÓ,QUIBDÓ,11045010,2244


In [13]:
result_2004 = result[result['Date'] == 2004]
result_2004

Unnamed: 0,Date,Dept,City,CodSta,Observations
4,2004,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",21206940,1011
5,2004,NARIÑO,PUERRES,52055150,4
6,2004,VALLE DEL CAUCA,BUENAVENTURA,57015010,57


Resumen de cantidad de observaciones por año:

In [14]:
print("\nResumen por año:")
result.groupby('Date')['Observations'].agg(['count', 'sum'])


Resumen por año:


Unnamed: 0_level_0,count,sum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1,21
2002,1,37
2003,1,67
2004,3,1072
2005,58,181871
2006,67,374309
2007,75,433153
2008,79,480291
2009,79,469227
2010,78,460523


# Modificando Dataframe

In [15]:
Dept = 'BOGOTÁ, D.C.'
Y = 2024
df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()
df_filtrado

Unnamed: 0,Dept,City,Date,Pressure,CodSta
19276972,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,753.105774,21205523
19276990,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,687.663330,35025502
19277007,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,702.124329,21205512
19277009,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,707.658020,21205509
19277064,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:10:00,687.663696,35025502
...,...,...,...,...,...
22610507,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:30:00,753.070312,21205523
22610668,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:40:00,753.006775,21205523
22610685,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:40:00,753.006775,21205523
22610832,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:50:00,753.007629,21205523


Vamos a identificar a que hora pertenece cada observacion tomada:

In [16]:
df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if 0 <= hour < 6:
        return '06'
    elif 6 <= hour < 12:
        return '12'
    elif 12 <= hour < 18:
        return '18'
    elif 18 <= hour < 24:
        return '00'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_filtrado

Unnamed: 0,Dept,City,Date,Pressure,CodSta,Date_only,Hour,Interval
19276972,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,753.105774,21205523,2024-01-01,0,06
19276990,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,687.663330,35025502,2024-01-01,0,06
19277007,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,702.124329,21205512,2024-01-01,0,06
19277009,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:00:00,707.658020,21205509,2024-01-01,0,06
19277064,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-01-01 00:10:00,687.663696,35025502,2024-01-01,0,06
...,...,...,...,...,...,...,...,...
22610507,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:30:00,753.070312,21205523,2024-10-16,23,00
22610668,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:40:00,753.006775,21205523,2024-10-16,23,00
22610685,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:40:00,753.006775,21205523,2024-10-16,23,00
22610832,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16 23:50:00,753.007629,21205523,2024-10-16,23,00


Ahora, agrupando, sacaremos el promedio de cada intervalo de hora escogido:

In [17]:
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval']).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()
df_avg

Unnamed: 0,CodSta,Dept,City,Date_only,Interval,AvgPressure
0,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-04,06,754.450012
1,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-04,12,753.699951
2,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-05,00,755.000000
3,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-05,06,754.600037
4,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-05,12,753.399963
...,...,...,...,...,...,...
3988,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-15,18,752.383301
3989,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16,00,753.500000
3990,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16,06,753.616699
3991,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-16,12,754.299988


Ahora, para las horas que comparten fechas, lo convertiremos en Columnas, y las observaciones serán los promedios tomados anteriormente

In [18]:
df_pivot = df_avg.pivot(index=['CodSta','Dept','City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df_pivot.columns.name = None
df_pivot.rename(columns={'Date_only': 'Date'}, inplace=True)

df_pivot

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-04,,754.450012,753.699951,
1,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-05,755.000000,754.600037,753.399963,753.833313
2,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-06,753.433350,754.183350,752.683350,752.483337
3,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-07,753.966614,753.466614,752.166687,752.766663
4,21205012,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-09-08,754.766663,753.833313,753.333313,753.799988
...,...,...,...,...,...,...,...,...
1051,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-12,754.250000,753.100037,754.049988,751.950012
1052,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-13,755.916687,754.066650,755.283386,753.799988
1053,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-14,755.149963,755.216614,756.133301,753.799988
1054,2120500204,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2024-10-15,754.316650,754.133301,755.549988,752.383301


Harenmos el mismo procedimiento en una sola celda:

In [19]:
Dept = 'VALLE DEL CAUCA'
Y = 2010

df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()

df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '06'
    elif (6 <= hour) and (hour < 12):
        return '12'
    elif (12 <= hour) and (hour < 18):
        return '18'
    elif (18 <= hour) and (hour < 24):
        return '00'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval'],observed=True).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()

df_pivot = df_avg.pivot(index=['CodSta','Dept', 'City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df_pivot.columns.name = None
df_pivot.rename(columns={'Date_only': 'Date'}, inplace=True)

df_pivot

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,26055100,VALLE DEL CAUCA,CALI,2010-01-01,779.049988,782.983337,787.466614,777.166687
1,26055100,VALLE DEL CAUCA,CALI,2010-01-02,777.133301,784.750000,780.833313,775.216614
2,26055100,VALLE DEL CAUCA,CALI,2010-01-03,777.566650,783.250000,781.033386,774.966614
3,26055100,VALLE DEL CAUCA,CALI,2010-01-04,778.266663,783.366699,784.833313,776.133301
4,26055100,VALLE DEL CAUCA,CALI,2010-01-05,778.783264,783.149963,784.283386,777.633301
...,...,...,...,...,...,...,...,...
2584,57015010,VALLE DEL CAUCA,BUENAVENTURA,2010-12-22,,1131.900024,,744.799988
2585,57015010,VALLE DEL CAUCA,BUENAVENTURA,2010-12-23,,,752.000000,
2586,57015010,VALLE DEL CAUCA,BUENAVENTURA,2010-12-25,504.000000,,,
2587,57015010,VALLE DEL CAUCA,BUENAVENTURA,2010-12-26,,,,744.799988


In [20]:
df_pivot[['CodSta','City']].drop_duplicates()

Unnamed: 0,CodSta,City
0,26055100,CALI
319,26055110,JAMUNDÍ
681,26055120,CALI
1043,26085170,CALI
1398,26095320,BUGA
1717,26105250,CARTAGO
2066,54077210,BUENAVENTURA
2318,57015010,BUENAVENTURA


In [21]:
fechas_por_ciudad = df_avg.groupby('City').agg({'CodSta':'count'}).reset_index()

fechas_por_ciudad

Unnamed: 0,City,CodSta
0,BUENAVENTURA,1720
1,BUGA,1257
2,CALI,4074
3,CARTAGO,1377
4,JAMUNDÍ,1428


In [22]:
fechas_por_ciudad = df_avg.groupby('CodSta')['Date_only'].nunique().reset_index()
fechas_por_ciudad = fechas_por_ciudad.rename(columns={'Date_only': 'Cantidad_Fechas_Unicas'})
fechas_por_ciudad = fechas_por_ciudad.sort_values('Cantidad_Fechas_Unicas', ascending=False)

fechas_por_ciudad

Unnamed: 0,CodSta,Cantidad_Fechas_Unicas
1,26055110,362
2,26055120,362
3,26085170,355
5,26105250,349
0,26055100,319
4,26095320,319
7,57015010,271
6,54077210,252


In [23]:
suma_total = fechas_por_ciudad['Cantidad_Fechas_Unicas'].sum()
suma_total

2589

In [24]:
df_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589 entries, 0 to 2588
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CodSta  2589 non-null   uint64 
 1   Dept    2589 non-null   object 
 2   City    2589 non-null   object 
 3   Date    2589 non-null   object 
 4   00      2448 non-null   float32
 5   06      2440 non-null   float32
 6   12      2498 non-null   float32
 7   18      2470 non-null   float32
dtypes: float32(4), object(3), uint64(1)
memory usage: 121.5+ KB


In [25]:
df_pivot.describe()

Unnamed: 0,CodSta,00,06,12,18
count,2589.0,2448.0,2440.0,2498.0,2470.0
mean,32039160.0,896.97406,897.848389,899.801453,898.566589
std,11874710.0,70.840576,68.728134,69.226936,75.189964
min,26055100.0,473.399994,640.400024,512.0,489.600006
25%,26055110.0,838.833313,840.753296,838.954163,833.304138
50%,26085170.0,902.716614,903.4375,904.833313,900.740051
75%,26105250.0,905.950012,906.087463,907.765015,904.425049
max,57015010.0,1157.900024,1132.099976,1157.900024,1157.900024


# Uniendo dataframes

Haremos lo mismo, pero ahora para crear 4 dataframes y unirlos:

In [26]:
Dept = 'NARIÑO'
Y = 2004

df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()

df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '00'
    elif (6 <= hour) and (hour < 12):
        return '06'
    elif (12 <= hour) and (hour < 18):
        return '12'
    elif (18 <= hour) and (hour < 24):
        return '18'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval'],observed=True).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()

df4 = df_avg.pivot(index=['CodSta','Dept', 'City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df4.columns.name = None
df4.rename(columns={'Date_only': 'Date'}, inplace=True)

df4

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,52055150,NARIÑO,PUERRES,2004-08-24,,1131.699951,,
1,52055150,NARIÑO,PUERRES,2004-09-02,,,1131.699951,
2,52055150,NARIÑO,PUERRES,2004-09-08,,,,1131.599976
3,52055150,NARIÑO,PUERRES,2004-12-31,954.0,,,


In [27]:
Dept = 'VALLE DEL CAUCA'
Y = 2004

df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()

df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '00'
    elif (6 <= hour) and (hour < 12):
        return '06'
    elif (12 <= hour) and (hour < 18):
        return '12'
    elif (18 <= hour) and (hour < 24):
        return '18'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval'],observed=True).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()

df1 = df_avg.pivot(index=['CodSta','Dept', 'City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df1.columns.name = None
df1.rename(columns={'Date_only': 'Date'}, inplace=True)

df1

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-01,1132.099976,,,
1,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-11,,1131.300049,1132.0,
2,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-18,,,1131.5,
3,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-21,,,1131.599976,
4,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-25,,,1131.900024,
5,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-01-31,,,,1131.699951
6,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-02-01,,,1131.699951,
7,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-02-08,,,1131.699951,
8,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-02-15,,,1131.699951,
9,57015010,VALLE DEL CAUCA,BUENAVENTURA,2004-02-18,,1132.0,,


In [28]:
Dept = 'VALLE DEL CAUCA'
Y = 2003

df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()

df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '00'
    elif (6 <= hour) and (hour < 12):
        return '06'
    elif (12 <= hour) and (hour < 18):
        return '12'
    elif (18 <= hour) and (hour < 24):
        return '18'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval'],observed=True).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()

df2 = df_avg.pivot(index=['CodSta','Dept', 'City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df2.columns.name = None
df2.rename(columns={'Date_only': 'Date'}, inplace=True)

df2

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-15,,,,1132.0
1,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-22,,,,1132.0
2,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-26,,,,1132.0
3,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-02-12,,,1132.0,
4,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-02-18,,1132.099976,,
5,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-02-25,,1131.800049,,
6,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-03-01,1131.300049,,,
7,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-03-10,,,1131.900024,
8,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-03-17,,,1131.300049,
9,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-03-20,,,,1131.599976


In [29]:
Dept = 'BOGOTÁ, D.C.'
Y = 2004

df['Year'] = df['Date'].dt.year
df_filtrado = df.loc[(df['Dept'] == Dept) & (df['Year'] == Y), ['Dept','City', 'Date', 'Pressure', 'CodSta']].copy()

df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour

def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '00'
    elif (6 <= hour) and (hour < 12):
        return '06'
    elif (12 <= hour) and (hour < 18):
        return '12'
    elif (18 <= hour) and (hour < 24):
        return '18'

df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)
df_avg = df_filtrado.groupby(['CodSta','Dept', 'City', 'Date_only', 'Interval'],observed=True).agg(
    AvgPressure=('Pressure', 'mean')).reset_index()

df3 = df_avg.pivot(index=['CodSta','Dept', 'City', 'Date_only'], columns='Interval', values='AvgPressure').reset_index()
df3.columns.name = None
df3.rename(columns={'Date_only': 'Date'}, inplace=True)

df3

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
0,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-01-15,,1183.000000,,
1,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-04-14,,1182.000000,,
2,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-05-15,,,1183.000000,
3,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-06-17,1183.000000,,,
4,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-08-18,,,1183.000000,
...,...,...,...,...,...,...,...,...
70,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-27,742.750000,743.500000,741.400024,742.50
71,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-28,741.500000,742.666687,741.000000,741.50
72,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-29,741.500000,743.000000,740.000000,742.00
73,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-30,741.333313,742.750000,740.250000,741.25


## Union preliminar

Aqui hemos unido los dataframes anteriormente creados:

In [30]:
lista_dfs = [df1, df2, df3, df4]

df_final = pd.concat(lista_dfs, axis=0, ignore_index=True)
columnas = ['CodSta', 'Dept', 'City', 'Date', '00','06','12', '18']
df_final = df_final[columnas]
    
df_final = df_final.sort_values('Date',ascending=True)
df_final

Unnamed: 0,CodSta,Dept,City,Date,00,06,12,18
54,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-15,,,,1132.00
55,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-22,,,,1132.00
56,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-01-26,,,,1132.00
57,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-02-12,,,1132.000000,
58,57015010,VALLE DEL CAUCA,BUENAVENTURA,2003-02-18,,1132.099976,,
...,...,...,...,...,...,...,...,...
185,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-28,741.500000,742.666687,741.000000,741.50
186,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-29,741.500000,743.000000,740.000000,742.00
187,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-30,741.333313,742.750000,740.250000,741.25
188,21206940,"BOGOTÁ, D.C.","BOGOTÁ, D.C.",2004-12-31,742.000000,742.500000,740.333313,742.00


In [31]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, 54 to 192
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CodSta  193 non-null    uint64 
 1   Dept    193 non-null    object 
 2   City    193 non-null    object 
 3   Date    193 non-null    object 
 4   00      75 non-null     float32
 5   06      100 non-null    float32
 6   12      104 non-null    float32
 7   18      86 non-null     float32
dtypes: float32(4), object(3), uint64(1)
memory usage: 10.6+ KB


In [32]:
df_final.describe()

Unnamed: 0,CodSta,00,06,12,18
count,193.0,75.0,100.0,104.0,86.0
mean,42997160.0,840.973572,871.489868,892.608032,854.513611
std,17431450.0,189.226028,196.167618,204.109787,188.278717
min,21206940.0,409.899994,409.899994,409.600006,409.899994
25%,21206940.0,741.366699,742.5,740.166687,741.762512
50%,57015010.0,742.166687,743.325012,741.0,742.5
75%,57015010.0,1131.25,1131.400024,1131.599976,1131.400024
max,57015010.0,1183.0,1183.0,1183.0,1132.099976


# Parte final

Ahora lo haremos con todo el dataframe:

In [3]:
def get_interval(hour):
    if (0 <= hour) and (hour < 6):
        return '06'
    elif (6 <= hour) and (hour < 12):
        return '12'
    elif (12 <= hour) and (hour < 18):
        return '18'
    elif (18 <= hour) and (hour < 24):
        return '00'

df_filtrado = df[['Dept', 'City', 'Lat','Long','Date', 'Pressure', 'CodSta']].copy()
df_filtrado['Date_only'] = df_filtrado['Date'].dt.date
df_filtrado['Hour'] = df_filtrado['Date'].dt.hour
df_filtrado['Interval'] = df_filtrado['Hour'].apply(get_interval)

df_avg = df_filtrado.groupby(['CodSta', 'Dept', 'City','Lat','Long', 'Date_only', 'Interval'], observed=True).agg(
        AvgPressure=('Pressure', 'mean')).reset_index()
    
df_final = df_avg.pivot(index=['CodSta', 'Dept', 'City', 'Lat','Long','Date_only'], 
                            columns='Interval', values='AvgPressure').reset_index()
df_final.columns.name = None
df_final.rename(columns={'Date_only': 'Date'}, inplace=True)
    

df_final = df_final.sort_values(['Dept','City','CodSta','Date'], ascending=True)

df_final

Unnamed: 0,CodSta,Dept,City,Lat,Long,Date,00,06,12,18
547895,44197020,AMAZONAS,LA PEDRERA,-1.310,-69.619,2017-06-13,1004.683350,,,1003.224976
547896,44197020,AMAZONAS,LA PEDRERA,-1.310,-69.619,2017-06-14,1005.916687,1005.266663,1006.766663,1004.000000
547897,44197020,AMAZONAS,LA PEDRERA,-1.310,-69.619,2017-06-15,1005.649963,1005.799988,1007.316650,1003.883301
547898,44197020,AMAZONAS,LA PEDRERA,-1.310,-69.619,2017-06-16,1004.383301,1005.716614,1007.133301,1003.016663
547899,44197020,AMAZONAS,LA PEDRERA,-1.310,-69.619,2017-06-17,1003.616699,1004.216614,1005.600037,1002.549988
...,...,...,...,...,...,...,...,...,...,...
683277,3526500201,VICHADA,LA PRIMAVERA,5.481,-70.421,2024-10-12,999.549988,997.016663,999.299988,995.716736
683278,3526500201,VICHADA,LA PRIMAVERA,5.481,-70.421,2024-10-13,1001.450012,999.100037,1001.633301,999.083313
683279,3526500201,VICHADA,LA PRIMAVERA,5.481,-70.421,2024-10-14,1000.766663,1000.649963,1003.033386,999.100037
683280,3526500201,VICHADA,LA PRIMAVERA,5.481,-70.421,2024-10-15,999.600037,1000.466614,1002.100037,998.266663


Y guardamos el archivo para el uso del modelo:

In [4]:
df_final.to_parquet('../finalpresion.parquet')