In [1]:
import os
import glob
import pandas as pd
from datetime import datetime
import time

In [2]:
variable_path_base = r'../data/variables'
variable_files_pattern = f'var-dia-*.csv'
variable_datetime_format = '%Y-%m-%d %H:%M' #'%Y-%m-%d'
variable_column_Valor = 'ValorFilled' 
variable_file_output = 'var-mes-{etiqueta}.csv'

In [3]:
variable_path_base = variable_path_base.split('/')
variable_files = glob.glob(os.path.join(*variable_path_base, variable_files_pattern))
print(variable_files)

['..\\data\\variables\\var-dia-PTPM_CON-fill-median-10.csv', '..\\data\\variables\\var-dia-PT_10_TT_D-fill-median-10.csv', '..\\data\\variables\\var-dia-Q_MEDIA_D-fill-median-10.csv', '..\\data\\variables\\var-dia-Q_MN_D-fill-median-10.csv', '..\\data\\variables\\var-dia-Q_MX_D-fill-median-10.csv', '..\\data\\variables\\var-dia-TMN_CON-fill-median-10.csv', '..\\data\\variables\\var-dia-TMX_CON-fill-median-10.csv']


In [4]:
dtypes = {
    'CodigoEstacion':'string','Etiqueta':'category','Fecha': 'string',
    #'IdParametro':'category','DescripcionSerie':'category','Frecuencia':'category','Grado':'category','Calificador':'category','NivelAprobacion':'category',
}

usecols = {
    'CodigoEstacion','Etiqueta', 'Fecha', variable_column_Valor
    #'IdParametro','DescripcionSerie','Frecuencia','Grado','Calificador','NivelAprobacion'
}

dateparse = lambda x: datetime.strptime(x, variable_datetime_format)


In [11]:
def groupVariableByMonth(f):
    start_time = time.time()
    
    print(f'Archivo: {f}')
    print(f'Leyendo h:{time.asctime()}...')
    df = pd.read_csv(f, dtype = dtypes, parse_dates = ['Fecha'], usecols = usecols, date_parser = dateparse)#, index_col = ['Fecha', 'CodigoEstacion'])
    print(f'**** r:{df.shape} h:{time.asctime()} t:{(time.time() - start_time) / 60} ****')
    
    df.rename(columns = {variable_column_Valor: 'Valor'}, inplace = True)
    
    etiqueta = df.loc[0,'Etiqueta']
    
    print(f'Etiqueta leída: {etiqueta}')
    
    print(f'Agrupando por mes...')
    dfm = df.groupby([df.CodigoEstacion,df.Etiqueta,df.Fecha.dt.to_period('M')]).Valor.agg(['min','mean','median','max']).dropna().reset_index()
    print(f'**** r:{dfm.shape} h:{time.asctime()} t:{(time.time() - start_time) / 60} ****')
    print(dfm.head())
    
    p = os.path.join(*variable_path_base, variable_file_output.format(etiqueta = etiqueta))
    
    print(f'Guardando en {p}...')
    dfm.to_csv(p, index = False)
    print(f'**** h:{time.asctime()} t:{(time.time() - start_time) / 60} ****')

In [12]:
%%time
for f in variable_files:
    groupVariableByMonth(f)


Archivo: ..\data\variables\var-dia-PTPM_CON-fill-median-10.csv
Leyendo h:Mon Jun 15 18:14:47 2020...
**** r:(18694560, 4) h:Mon Jun 15 18:23:11 2020 t:8.393712492783864 ****
Etiqueta leída: PTPM_CON
Agrupando por mes...
**** r:(523024, 7) h:Mon Jun 15 18:23:25 2020 t:8.62424388329188 ****
  CodigoEstacion  Etiqueta    Fecha  min       mean  median    max
0       11010010  PTPM_CON  1991-01  0.0  11.958065    6.10   56.0
1       11010010  PTPM_CON  1991-02  0.0  20.017857    2.25  125.4
2       11010010  PTPM_CON  1991-03  0.0  21.890323    4.40   95.2
3       11010010  PTPM_CON  1991-04  0.0  10.826667    2.05   88.4
4       11010010  PTPM_CON  1991-05  0.0  23.612903   15.00   89.7
Guardando en ..\data\variables\var-men-PTPM_CON-fill-median-10.csv...
**** h:Mon Jun 15 18:23:33 2020 t:8.768150047461193 ****
Archivo: ..\data\variables\var-dia-PT_10_TT_D-fill-median-10.csv
Leyendo h:Mon Jun 15 18:23:34 2020...
**** r:(304020, 4) h:Mon Jun 15 18:23:44 2020 t:0.17018896738688152 ****
Etiqu

In [7]:
#estacion = '13067020'
#year = 2005
#print(dfm.loc[(dfm.CodigoEstacion == estacion) & (dfm.Fecha.dt.year == year)].head(2))
#print(df[(df.CodigoEstacion == estacion) & (df.Fecha.dt.year == year)].head(2))