# TRMM Preparación de datos

In [None]:
% matplotlib inline
import glob
import numpy as np
import pandas as pd
import xarray as xr
import cartopy
import cartopy.crs as ccrs
import matplotlib.pyplot as plt

## Leer varios archivos y combinarlos en un solo conjunto de datos

In [None]:
# Collect every NetCDF file directly under ./data and show how many were found.
files = glob.glob("./data/*.nc")
len(files)

### Usar / escribir algunas funciones auxiliares

In [None]:
# %load src/extract_timestep.py
def extract_timestep(ds):
    """Return the date stored in the dataset's ``FileHeader`` attribute.

    The header is split on ``;`` and the first ``YYYY-MM-DD`` pattern found in
    the sixth field (index 5) is parsed.

    Parameters
    ----------
    ds : object
        Anything exposing ``attrs["FileHeader"]`` as a string, e.g. an
        ``xarray.Dataset``.

    Returns
    -------
    datetime.date
        The date found in the sixth header field.

    Raises
    ------
    ValueError
        If the header has fewer than six ``;``-separated fields or the sixth
        field contains no ``YYYY-MM-DD`` date.
    """
    import re
    import datetime

    file_header = ds.attrs["FileHeader"]
    fields = file_header.split(";")
    if len(fields) < 6:
        raise ValueError(
            "FileHeader has %d ';'-separated field(s); at least 6 are required"
            % len(fields)
        )

    text = fields[5]
    match = re.search(r'\d{4}-\d{2}-\d{2}', text)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError("no YYYY-MM-DD date found in FileHeader field %r" % text)

    return datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()

In [None]:
def read_netcdfs(paths):
    """Open several NetCDF files and concatenate them along ``time``.

    Each file is opened with ``xr.open_dataset``, given a ``time`` value taken
    from ``extract_timestep`` (the date in its ``FileHeader`` attribute),
    loaded into memory and closed again, so no file handle stays open after
    the call returns.

    Parameters
    ----------
    paths : iterable of str
        Paths of the NetCDF files, in the order they should be concatenated.

    Returns
    -------
    xarray.Dataset
        The per-file datasets concatenated along the ``time`` dimension.
    """
    def load_one(path):
        # The context manager closes the file as soon as its data is in memory.
        with xr.open_dataset(path) as ds:
            stamped = ds.assign(time=np.datetime64(extract_timestep(ds)))
            return stamped.load()

    datasets = [load_one(p) for p in paths]
    return xr.concat(datasets, dim="time")

### Cargar los archivos

In [None]:
# glob returns paths in arbitrary order; sort them so the concatenation order
# along "time" is deterministic (presumably chronological if the file names
# embed the date -- confirm against the files in ./data).
file_paths = sorted(glob.glob("./data/*.nc"))
ds = read_netcdfs(file_paths)

In [None]:
# Display the combined dataset (the notebook echoes the cell's last expression).
ds

<img src="_img/xr-dataset-diagram.png" alt="Diagrama de la estructura de un Dataset de xarray" width="90%" />

Fuente: [documentación de xarray](http://xarray.pydata.org/en/stable/data-structures.html)

In [None]:
# Plot the "precipitation" field of the second time step (index 1).
ds.sel(time=ds.time[1])["precipitation"].plot()

## Calcular mm por mes

In [None]:
def compute_mm_from_mm_per_hour(ds, days):
    """Scale an hourly rate to a total over ``days`` days.

    Parameters
    ----------
    ds : number or array-like
        Value(s) per hour; anything that supports ``*`` with a number.
    days : number
        Number of days to accumulate over.

    Returns
    -------
    Same kind as ``ds``
        ``ds`` multiplied by 24 hours and then by ``days``.
    """
    hours_per_day = 24
    # Multiply left to right (rate -> per day -> total).
    per_day = ds * hours_per_day
    return per_day * days

def process_data(ds):
    """Convert the ``precipitation`` variable from an hourly rate to monthly totals.

    Works on a deep copy of ``ds``; the input dataset is not modified. For
    every entry of the ``time`` coordinate, the matching ``precipitation``
    slice is multiplied by 24 hours and by the number of days in that
    timestamp's calendar month.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with a ``time`` coordinate and a ``precipitation`` variable
        whose first dimension is ``time`` (slices are written back by
        position). Values are presumably in mm/hour -- confirm against the
        source files.

    Returns
    -------
    xarray.Dataset
        A copy of ``ds`` whose ``precipitation`` values are monthly totals.
    """
    ds_temp = ds.copy(deep=True)
    for e, timestamp in enumerate(ds.coords["time"]):
        # Use the length of the calendar month, not the day-of-month: ``.day``
        # only equals the month length when the timestamp is the last day of
        # its month, while ``days_in_month`` is correct for any day.
        days_in_month = pd.to_datetime(timestamp.values).days_in_month
        # Select the rate for this time step.
        hourly = ds_temp.sel(time=timestamp.values)["precipitation"]
        # Accumulate the rate over the whole month.
        monthly = compute_mm_from_mm_per_hour(hourly, days_in_month)
        # Masking of totals below 1 is left disabled:
        # monthly = monthly.where(monthly >= 1)
        # Write the totals back into the copy at the same time position.
        ds_temp["precipitation"][e] = monthly

    return ds_temp

In [None]:
# Run the conversion on the combined dataset, then display the result.
ds_processed = process_data(ds)  
ds_processed

In [None]:
# Plot the processed "precipitation" field of the second time step (index 1).
ds_processed.sel(time=ds_processed.time[1])["precipitation"].plot()

## Guardar el objeto en el disco como archivo `.nc` ([netcdf](https://en.wikipedia.org/wiki/NetCDF))

In [None]:
# Write the processed dataset to a NetCDF file.
# NOTE(review): presumably ./data/processed must already exist -- confirm.
ds_processed.to_netcdf("./data/processed/peru.nc")