# Create Target of Drought Data with Different Future Scopes
Estamos procesando la información de las sequías para poder usarla en un modelo supervisado. En la explicación del [MSM](https://smn.conagua.gob.mx/es/climatologia/monitor-de-sequia/monitor-de-sequia-en-mexico) se indica que, a partir de 2016, cambió la definición de lo que se considera sequía; por esta razón tomaremos únicamente los datos a partir de esa fecha.



## Imports

In [1]:
import pandas as pd
import warnings

from pandas.tseries.offsets import DateOffset

from src.data.utils import (
    get_general_path, join_paths, zeroes_to_cve, save_as_pickle
)

## Constants

In [4]:
# Sub-path of the interim data folder; joined onto the path returned by get_general_path().
INTERIM_DATA_PATH = 'data/interim'
# File name under which the dictionary of target datasets is pickled at the end of the notebook.
TARGET_DATA_FILE = 'target_datasets.pkl'

# File name of the processed drought data that this notebook reads as input.
PROCESSED_DROUGTH_DATA_FILE = 'processed_drought_data.parquet'

# Column names present in the processed drought data.
DATE_NAME = 'date'
NEW_DATE = 'standard_date'
# Only observations strictly after this date are kept when building the targets.
THRESHOLD_DATE = '2016-01-01'
# Column holding the numeric drought index that becomes the prediction target.
TARGET_NAME = 'num_drought_index'
# NOTE(review): not referenced anywhere in the visible cells — confirm whether it is still needed.
COMPARISION_DAY = 18 # Was 15, but changed due to fluctuations

## Config

In [17]:
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket ignore can hide genuine problems (deprecations,
# chained-assignment warnings); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 1000)

## Functions

## Read data

In [6]:
# Base path provided by the project helper (presumably the repository root — confirm in src.data.utils).
general_path = get_general_path()
# Build the location of the processed drought parquet file inside the interim data folder.
processed_drought_data_path = join_paths(general_path, INTERIM_DATA_PATH, PROCESSED_DROUGTH_DATA_FILE)
# Load the processed drought observations into a DataFrame.
processed_drought_data = pd.read_parquet(processed_drought_data_path)

In [24]:
# Display one row per distinct `standard_date` (pandas keeps the first occurrence by default).
# The result is not assigned, so `processed_drought_data` itself is left unchanged.
processed_drought_data.drop_duplicates('standard_date')

Unnamed: 0_level_0,CVE_CONCATENADA,CVE_ENT,CVE_MUN,NOMBRE_MUN,ENTIDAD,ORG_CUENCA*,CLV_OC,CON_CUENCA,CVE_CONC,mun_id,date,num_drought_index,standard_date
mun_id__date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
01_001__20030128,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-01-31,0.0,2003-01-28
01_001__20030228,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-02-28,0.0,2003-02-28
01_001__20030328,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-03-31,0.0,2003-03-28
01_001__20030428,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-04-30,0.0,2003-04-28
01_001__20030528,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-05-31,0.0,2003-05-28
01_001__20030628,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-06-30,0.0,2003-06-28
01_001__20030728,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-07-31,0.0,2003-07-28
01_001__20030828,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-08-31,0.0,2003-08-28
01_001__20030928,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-09-30,0.0,2003-09-28
01_001__20031028,1001,1,1,Aguascalientes,Aguascalientes,Lerma-Santiago-Pacífico,VIII,Rio Santiago,16,01_001,2003-10-31,0.0,2003-10-28


## Process Data

In [9]:
# TODO: from this point on, the work should be moved to a separate notebook.

# Keep only the columns needed to build the targets; work on a copy so the
# frame loaded above is never modified.
df_to_manipulate = processed_drought_data[['mun_id', TARGET_NAME, NEW_DATE]].copy()

# Forecast horizons: months ahead -> suffix used in column names and as dict key.
months_into_future = {1: '1M', 2: '2M', 3: '3M', 4: '4M', 6: '6M'}

# Not every observation is usable because of a problem with the source
# information, so only rows strictly after THRESHOLD_DATE are kept. The filter
# does not depend on the horizon, so it is applied once instead of per iteration.
threshold_date = pd.to_datetime(THRESHOLD_DATE)
valid_observations = df_to_manipulate[df_to_manipulate[NEW_DATE] > threshold_date]

individual_datasets = {}
for months, month in months_into_future.items():
    test_date = f'{NEW_DATE}{month}'
    index_date = f'mun_id__date{month}'

    # Shift each observation date `months` back: the drought index observed at
    # `standard_date` is the "future" value as seen from the shifted date.
    shifted_date = valid_observations[NEW_DATE] - DateOffset(months=months)
    new_columns = {
        test_date: shifted_date,
        # Key in the "<mun_id>__<YYYYMMDD>" format, built from the shifted date.
        index_date: valid_observations['mun_id'] + '__' + shifted_date.dt.strftime('%Y%m%d'),
    }

    # Build the result with a non-mutating chain: the previous in-place
    # rename/set_index acted on a slice of another frame, which is the
    # chained-assignment pattern pandas warns about.
    individual_dataset_for_prediction = (
        valid_observations
        .assign(**new_columns)
        .loc[:, ['mun_id', test_date, index_date, TARGET_NAME]]
        .rename(columns={index_date: 'mun_id__date', TARGET_NAME: 'num_drought_index_future'})
        .set_index('mun_id__date')
    )
    individual_datasets[month] = individual_dataset_for_prediction

## Results

In [10]:
# Inspect the target dataset stored under the '1M' (one month ahead) key.
individual_datasets['1M']

Unnamed: 0_level_0,mun_id,standard_date1M,num_drought_index_future
mun_id__date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01_001__20151215,01_001,2015-12-15,0.0
01_002__20151215,01_002,2015-12-15,0.0
01_003__20151215,01_003,2015-12-15,0.0
01_004__20151215,01_004,2015-12-15,0.0
01_005__20151215,01_005,2015-12-15,0.0
...,...,...,...
32_054__20240515,32_054,2024-05-15,1.0
32_055__20240515,32_055,2024-05-15,3.0
32_056__20240515,32_056,2024-05-15,2.0
32_057__20240515,32_057,2024-05-15,1.0


In [11]:
# Inspect the target dataset stored under the '6M' (six months ahead) key.
individual_datasets['6M']

Unnamed: 0_level_0,mun_id,standard_date6M,num_drought_index_future
mun_id__date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01_001__20150715,01_001,2015-07-15,0.0
01_002__20150715,01_002,2015-07-15,0.0
01_003__20150715,01_003,2015-07-15,0.0
01_004__20150715,01_004,2015-07-15,0.0
01_005__20150715,01_005,2015-07-15,0.0
...,...,...,...
32_054__20231215,32_054,2023-12-15,1.0
32_055__20231215,32_055,2023-12-15,3.0
32_056__20231215,32_056,2023-12-15,2.0
32_057__20231215,32_057,2023-12-15,1.0


## Conclusion

In [12]:
# Destination of the pickled targets inside the interim data folder.
target_files = join_paths(general_path, INTERIM_DATA_PATH, TARGET_DATA_FILE)
# Persist the whole {horizon suffix: DataFrame} dictionary through the project helper.
# NOTE(review): pickle files should only be loaded back from trusted sources.
save_as_pickle(what=individual_datasets, where=target_files)