# Process Drought data
Estamos procesando la información de las sequías para usarla en un modelo supervisado. En la explicación del [MSM](https://smn.conagua.gob.mx/es/climatologia/monitor-de-sequia/monitor-de-sequia-en-mexico) se indica que, desde 2016, la definición de lo que se considera sequía cambió; por esta razón tomaremos los datos a partir de esa fecha.

## Imports

In [1]:
import pandas as pd
import warnings

from dateutil.relativedelta import relativedelta


from src.data.utils import (
    get_general_path, join_paths, zeroes_to_cve, save_as_pickle
)

## Constants

In [2]:
# --- Directories (joined onto the general path when building file paths) ---
RAW_DATA_PATH = 'data/raw'
INTERIM_DATA_PATH = 'data/interim'

# --- File names ---
DROUGTH_DATA_FILE = 'drought_data.parquet'  # raw input
PROCESSED_DROUGHT_DATA_FILE = 'processed_drought_data.parquet'  # output
TARGET_DATA_FILE = 'target_datasets.pkl'

# --- Column handling ---
# Substring that marks a raw column label as a date column.
TIME_IDENTIFIER = '00:00'
# Column receiving the original date after melting to long format.
DATE_NAME = 'date'
# Column receiving the standardized date.
NEW_DATE = 'standard_date'
# Column receiving the numeric drought level.
TARGET_NAME = 'num_drought_index'

# --- Date parameters ---
THRESHOLD_DATE = '2016-01-01'
# Day-of-month cutoff used when standardizing dates.
# Was 15, but changed due to fluctuations.
COMPARISION_DAY = 18

## Config

In [8]:
# Notebook-wide settings: let pandas display up to 1000 rows and
# suppress every warning for the rest of the session.
pd.options.display.max_rows = 1000
warnings.filterwarnings(action='ignore')

## Functions

In [4]:
def transform_numerical_drought_index(idx):
    """Convert a drought category label into its numeric level.

    Examples:
        None / NaN / 'none' / '' -> 0.0
        'D0' -> 0.0
        'D1' -> 1.0
        'D2' -> 2.0
        ...
        'DN' -> float(N)

    Args:
        idx: A category label such as ``'D2'``, or any non-string value
            (for example ``None`` or ``NaN``) for a missing category.

    Returns:
        float: The numeric level. ``0.0`` is returned for every non-string
        input and for blank or ``'none'`` strings.

    Raises:
        ValueError: If ``idx`` is a non-blank string other than ``'none'``
            that is not a number once its ``'D'`` characters are removed.
    """
    if not isinstance(idx, str):
        return 0.0
    label = idx.strip()
    # Textual "no drought" markers must map to 0 instead of crashing float().
    if not label or label.lower() == 'none':
        return 0.0
    return float(label.replace('D', ''))

def targets_displaced_data(data, date_col, original_target, *months):
    """Build look-ahead target columns from an existing drought target.

    For every value in ``months`` a column named
    ``drougth_index_next{month}_months`` is added. To fill it, each row's
    date is shifted ``month`` months back, the shifted frame is handed to
    ``create_index_for_row_data`` and the resulting ``original_target``
    values are assigned to the working frame by index alignment.

    NOTE(review): ``create_index_for_row_data`` is defined outside this
    block. It is assumed to return the frame re-indexed by the displaced
    date column, so that alignment picks each row's future value. Confirm.

    Args:
        data: Source frame. It is copied and never modified.
        date_col: Name of the column holding datetime values.
        original_target: Name of the column whose values are looked up.
        *months: Look-ahead distances, in months.

    Returns:
        DataFrame: The columns whose name contains ``'DROUGHT_INDEX'`` plus
        every look-ahead column created by this call, in frame order.
    """
    general_data = data.copy()
    created_targets = []
    for month in months:
        column = f'displaced_date{month}_months'
        target = f'drougth_index_next{month}_months'
        displaced_data = general_data.copy()
        # ``offset=month`` binds the current value instead of the loop name.
        displaced_data[column] = displaced_data[date_col].apply(
            lambda date, offset=month: date + relativedelta(months=-offset)
        )
        target_df = create_index_for_row_data(displaced_data, column)
        # Reading the original column directly avoids duplicate column
        # names when the same month value is passed more than once.
        general_data[target] = target_df[original_target]
        if target not in created_targets:
            created_targets.append(target)
    # The generated names do not contain 'DROUGHT_INDEX', so they are
    # selected explicitly alongside the name-based matches.
    drought_targets = [
        col for col in general_data.columns
        if col in created_targets or 'DROUGHT_INDEX' in str(col)
    ]
    return general_data[drought_targets]

## Read data

In [5]:
# Build the path to the raw drought file from the base path returned by
# get_general_path() (presumably the project root; confirm) and load it.
general_path = get_general_path()
drougth_data_path = join_paths(general_path, RAW_DATA_PATH, DROUGTH_DATA_FILE)
drought_data = pd.read_parquet(drougth_data_path)

## Process Data

In [6]:
# Municipality key: "<state code>_<municipality code>", each code passed
# through zeroes_to_cve with widths 2 and 3 respectively.
state_code = drought_data.CVE_ENT.apply(zeroes_to_cve, zeroes=2)
municipality_code = drought_data.CVE_MUN.apply(zeroes_to_cve, zeroes=3)
drought_data['mun_id'] = state_code + "_" + municipality_code

# Split the columns into date columns (label contains TIME_IDENTIFIER) and
# everything else, then reshape to one row per (identifiers, date).
is_date_column = [TIME_IDENTIFIER in str(col) for col in drought_data.columns]
date_columns = [
    col for col, is_date in zip(drought_data.columns, is_date_column) if is_date
]
other_columns = [
    col for col, is_date in zip(drought_data.columns, is_date_column) if not is_date
]
data = drought_data.melt(
    id_vars=other_columns,
    var_name=DATE_NAME,
    value_name=TARGET_NAME,
)
data[DATE_NAME] = pd.to_datetime(data[DATE_NAME])

# NOTE(review): THRESHOLD_DATE is not applied in this cell; confirm whether
# the "from 2016 onwards" cut is meant to happen here or downstream.

# Standardize every date to a fixed day of its own month: the 28th when the
# day is after COMPARISION_DAY, the 15th otherwise. Subtracting the day of
# month lands on the last day of the previous month, which is the base the
# fixed day is then added to.
day_of_month = data[DATE_NAME].dt.day
previous_month_end = data[DATE_NAME] - pd.to_timedelta(day_of_month, 'd')
standard_day = day_of_month.gt(COMPARISION_DAY).map({True: 28, False: 15})
data[NEW_DATE] = previous_month_end + pd.to_timedelta(standard_day, 'd')

# Numeric target, then index rows by "<mun_id>__<YYYYMMDD>".
data[TARGET_NAME] = data[TARGET_NAME].apply(transform_numerical_drought_index)
date_stamp = data[NEW_DATE].dt.strftime('%Y%m%d')
data['mun_id__date'] = data['mun_id'] + '__' + date_stamp
data = data.set_index('mun_id__date')

## Results

## Conclusion

In [7]:
# Write the processed frame to parquet; the destination path is built from
# the general path, the interim data directory and the processed file name.
procesed_drougth_data_path = join_paths(general_path, INTERIM_DATA_PATH, PROCESSED_DROUGHT_DATA_FILE)
data.to_parquet(procesed_drougth_data_path)