# Dependencies

In [8]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import time

# Data input

In [2]:
# Relative location of the pickled geodata table. Later cells read its
# `geometry` and `CVEGEO` columns.
geodata_path = "../data/raw/muni.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
geo_data = pd.read_pickle(geodata_path)

In [3]:
# Relative location of the raw drought table. Later cells read its
# `CVE_CONCATENADA` column and unpivot its timestamp-named columns.
droughtdata_path = "../data/raw/drought_data.parquet"
drought_data = pd.read_parquet(droughtdata_path)

# Helper functions

In [4]:
def create_index_for_row_data(df, date_col):
    """Re-index ``df`` in place with a ``<date>__<CVE_CONCATENADA>`` string key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold a datetime column named ``date_col`` and a
        ``CVE_CONCATENADA`` column. The frame is modified in place.
    date_col : str
        Name of the datetime column whose calendar date forms the first
        half of the key.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, now indexed by the key (index name
        ``'index'``).
    """
    date_part = df[date_col].dt.date.astype('str')
    cve_part = df.CVE_CONCATENADA.astype('str')
    # Stage the key as a regular column first, then promote it to the index.
    df['index'] = date_part + '__' + cve_part
    df.set_index('index', inplace=True)
    return df

def create_column_for_merge(df, col1, col2):
    """Add a composite string key built from two columns, in place.

    The new column is named ``f'{col1}_{col2}'`` and holds the string forms
    of ``col1`` and ``col2`` joined by a double underscore.

    Returns the same ``df`` object that was passed in.
    """
    key_name = f'{col1}_{col2}'
    left_text = df[col1].astype('str')
    right_text = df[col2].astype('str')
    df[key_name] = left_text + '__' + right_text
    return df

def prepare_data_for_graph(df):
    """Attach intersection lists to ``df`` and derive a long-format table.

    For every geometry in ``df.geometry`` the ``CVEGEO`` values of all rows
    it intersects are collected (via ``get_intersections_as_list``) and
    stored in a new ``intersection_list`` column on ``df`` itself.

    Returns
    -------
    tuple
        ``(data_for_graph, df)``: ``data_for_graph`` is a copy of the
        ``CVEGEO`` / ``intersection_list`` columns exploded to one row per
        list entry; ``df`` is the input frame, now carrying the new column.
    """
    intersections = df.geometry.apply(
        lambda geom: get_intersections_as_list(geom, df)
    )
    df['intersection_list'] = intersections
    pairs = df[['CVEGEO', 'intersection_list']].copy()
    return pairs.explode('intersection_list'), df

def get_intersections_as_list(geom, df):
    """Return the ``CVEGEO`` values of the rows of ``df`` that ``geom`` intersects.

    ``geom.intersects(df.geometry)`` is used as a row mask on ``df``, so
    ``df`` needs both a ``geometry`` and a ``CVEGEO`` column.
    """
    hits = geom.intersects(df.geometry)
    matching_rows = df[hits]
    return matching_rows.CVEGEO.to_list()

# Data Processing

In [5]:
# `df` is an alias of the raw frame, so the key normalisation below is also
# written onto `drought_data`.
df = drought_data
# Left-pad the CVE_CONCATENADA key with zeros to a width of 5 characters.
df.CVE_CONCATENADA = df.CVE_CONCATENADA.astype(str).apply(lambda x: x.zfill(5))

# Columns whose name does not contain a midnight timestamp are identifiers;
# every other column is unpivoted into (DATE, DROUGHT_INDEX) rows.
id_columns = [col for col in df.columns if '00:00:00' not in col]
target = 'DROUGHT_INDEX'

data = pd.melt(df, id_vars=id_columns, var_name='DATE', value_name=target)
data.DATE = pd.to_datetime(data.DATE)
# Explicit copy: the filtered frame is modified below, and copying avoids
# assigning into a slice of the melted frame (SettingWithCopyWarning).
data = data[data.DATE > pd.to_datetime("2016-01-01")].copy()

# Dates in the second half of a month (day > 15) are moved to day 28 of the
# same month. Subtracting (day - 28) days is the vectorised equivalent of
# Timestamp.replace(day=28) and needs no temporary helper column.
day_of_month = data.DATE.dt.day
half_month_condition = day_of_month > 15
data.loc[half_month_condition, 'DATE'] = (
    data.loc[half_month_condition, 'DATE']
    - pd.to_timedelta(day_of_month[half_month_condition] - 28, unit='D')
)

print('Drought index')
# Strip the 'D' character from the raw values, convert to float and treat
# missing values as 0.
data.DROUGHT_INDEX = data.DROUGHT_INDEX.str.replace('D', '').astype('float').fillna(0)

# Data for features

# Tag every row with its CLV_OC/DATE merge key, then index it by date and
# CVE_CONCATENADA. Both helpers modify `data` in place and return the same
# object, so `data` carries the new column and index from here on.
data_for_features = create_index_for_row_data(
    create_column_for_merge(data, 'CLV_OC', 'DATE'),
    'DATE',
)

operators = ['mean', 'min', 'max', 'median']
clv_rename = {oper: f"di_clv_oc_group__{oper}" for oper in operators}

# Drought-index statistics for every (CLV_OC, DATE) pair, keyed by the same
# merge column as the row-level data.
clv_oc_date = data_for_features.groupby(['CLV_OC', 'DATE']).DROUGHT_INDEX.agg(operators).reset_index()
clv_oc_date = create_column_for_merge(clv_oc_date, 'CLV_OC', 'DATE')
clv_oc_date = clv_oc_date.drop(['CLV_OC', 'DATE'], axis=1).rename(columns=clv_rename)

# merge() hands back a fresh default index, so the row keys kept on `data`
# are put back afterwards.
data_for_features = data_for_features.merge(clv_oc_date, on='CLV_OC_DATE', how='left')
data_for_features = data_for_features.set_index(data.index)

# Difference between each group statistic and the row's own drought index.
for oper in operators:
    data_for_features[f'di_vs_group_di__{oper}'] = (
        data_for_features[f'di_clv_oc_group__{oper}'] - data_for_features['DROUGHT_INDEX']
    )

# Create target for 1, 3 and 6 months into the future. 
print('Target creation')
for months in (1, 3, 6):
    column = f'DISPLACED_DATE__{months}MONTHS'
    target_new = f'DROUGHT_INDEX__NEXT_{months}MONTHS'
    backwards = relativedelta(months=-months)

    # Work on a copy so that shifting and re-indexing leave `data` untouched.
    displaced_drought_data = data.copy()
    # Moving every observation `months` months back makes its row key equal
    # to the key of the row it is the future value for.
    displaced_drought_data[column] = displaced_drought_data['DATE'].apply(
        lambda date: date + backwards
    )
    temporary_df = create_index_for_row_data(displaced_drought_data, column)
    temporary_df.rename(columns={'DROUGHT_INDEX': target_new}, inplace=True)
    # Index-aligned assignment: rows without a matching key receive NaN.
    data_for_features[target_new] = temporary_df[target_new]
    
print('Index presence creation')
# One indicator column per drought level: 1 where the row's drought index
# equals that level, 0 everywhere else.
for level in (0, 1, 2, 3, 4):
    flag_column = f'is_{level}_index'
    data_for_features.loc[data[target] == level, flag_column] = 1
    data_for_features[flag_column] = data_for_features[flag_column].fillna(0)

############################################################################ NEEDS REFACTOR
############################################################################ neighbour features

print('Creation of graph')
data_for_graph, reduced_data_with_intersection_list = prepare_data_for_graph(geo_data)

# Slim view of the drought observations used by the neighbour aggregation.
helper_data = data[['CVE_CONCATENADA', 'DATE', 'DROUGHT_INDEX']]

# Keep only the key and its intersection list, renaming the key so that it
# matches the column name used in the drought data.
reduced_data_with_intersection_list = (
    reduced_data_with_intersection_list[['CVEGEO', 'intersection_list']]
    .rename(columns={"CVEGEO": "CVE_CONCATENADA"})
)
# This is something we will always have computed (so it needs to be saved):
dict_neigh = dict(zip(
    reduced_data_with_intersection_list.CVE_CONCATENADA,
    reduced_data_with_intersection_list.intersection_list,
))

def helper_function(cve_concat):
    """Per-date drought-index statistics over the neighbours of ``cve_concat``.

    Reads the module-level ``dict_neigh`` (key -> list of intersecting keys)
    and ``helper_data`` (``CVE_CONCATENADA`` / ``DATE`` / ``DROUGHT_INDEX``).

    Returns a frame indexed by ``<date>__<cve_concat>`` with the columns
    ``di_neighbour_group__{mean,min,max,median,std}``.
    """
    stats = ['mean', 'min', 'max', 'median', 'std']
    neighbour_keys = dict_neigh.get(cve_concat)
    in_neighbourhood = helper_data.CVE_CONCATENADA.isin(neighbour_keys)
    per_date = (
        helper_data[in_neighbourhood]
        .groupby('DATE')
        .DROUGHT_INDEX.agg(stats)
        .reset_index()
    )
    # Label the aggregate with the key it belongs to, so that the row index
    # built next has the same shape as the one used on the feature table.
    per_date['CVE_CONCATENADA'] = cve_concat
    per_date = create_index_for_row_data(per_date, 'DATE')
    renamed = {stat: f"di_neighbour_group__{stat}" for stat in stats}
    return per_date.drop(['DATE', 'CVE_CONCATENADA'], axis=1).rename(columns=renamed)

# Takes 42 seconds
print('neighbour_calc')
# One statistics frame per key, stacked into a single table.
neighbour_frames = [
    helper_function(cve_concat)
    for cve_concat in reduced_data_with_intersection_list.CVE_CONCATENADA
]
neighbour_di = pd.concat(neighbour_frames)

# Copy every neighbour statistic onto the feature table (aligned on the index).
for col in neighbour_di.columns:
    data_for_features[col] = neighbour_di[col]

Drought index
Target creation
Index presence creation
Creation of graph
neighbour_calc


In [6]:
# Save targets for data
# The three look-ahead columns, for horizons of 1, 3 and 6 months.
target_columns = [f'DROUGHT_INDEX__NEXT_{horizon}MONTHS' for horizon in (1, 3, 6)]
targets = data_for_features[target_columns]

# Feature computation

In [11]:
features = pd.DataFrame()

# 180-day time window over DATE, computed separately for each CVE_CONCATENADA.
# Windows holding fewer than 10 observations yield NaN.
cve_groups = data_for_features.groupby('CVE_CONCATENADA')
rolling_df_180D_by_cve = cve_groups.rolling('180D', on='DATE', min_periods=10)

# The rolling mean of each is_<level>_index column becomes a
# proportion_is_<level>_index_... feature.
for di_coef in range(5):
    flag_column = f'is_{di_coef}_index'
    share_column = f'proportion_is_{di_coef}_index_last180_days_same_cve'
    features[share_column] = rolling_df_180D_by_cve[flag_column].mean()

# operation drought last 180 days same cve_concat
def calculate_to_feature_df_last180_days_same_cve(df, feature, rolling=None):
    """Add rolling-window summary columns for ``feature`` to ``df`` in place.

    Ten columns named ``f'{feature}_last180_days_same_cve__<stat>'`` are
    written, in this order: mean, median, std, kurt, skew, max, min, range
    (max - min), central_diff (mean - median) and
    central_diff_range_proportion (central_diff / range).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new columns. Every derived column is
        computed from ``df`` itself, not from the module-level ``features``
        frame.
    feature : str
        Name of the column to select from the rolling object.
    rolling : optional
        Rolling object to select ``feature`` from. When omitted, the
        module-level ``rolling_df_180D_by_cve`` is used.

    Returns
    -------
    None
        The feature name and the elapsed seconds are printed.

    Notes
    -----
    A window whose values are all equal has range 0, so its proportion is a
    division by zero (NaN or +/-inf).
    """
    window = rolling_df_180D_by_cve if rolling is None else rolling
    prefix = f'{feature}_last180_days_same_cve__'

    print(f'Computing for {feature}')
    t = time.perf_counter()
    rolled = window[feature]
    df[f'{prefix}mean'] = rolled.mean()
    df[f'{prefix}median'] = rolled.median()
    df[f'{prefix}std'] = rolled.std()
    df[f'{prefix}kurt'] = rolled.kurt()
    df[f'{prefix}skew'] = rolled.skew()
    df[f'{prefix}max'] = rolled.max()
    df[f'{prefix}min'] = rolled.min()
    df[f'{prefix}range'] = df[f'{prefix}max'] - df[f'{prefix}min']
    df[f'{prefix}central_diff'] = df[f'{prefix}mean'] - df[f'{prefix}median']
    # Read the operands from `df` (the frame being filled), not a global frame.
    df[f'{prefix}central_diff_range_proportion'] = df[f'{prefix}central_diff'] / df[f'{prefix}range']
    #df[f'{feature}_last180_days_same_cve__tendency'] = rolling_df_180D_by_cve[feature].apply(lambda x: (x[-1]-x[0])/len(x))
    print(time.perf_counter() - t)

# Columns of `data_for_features` that get the full set of rolling statistics,
# processed in this order.
rolling_stat_inputs = (
    'DROUGHT_INDEX',
    'di_vs_group_di__mean',
    'di_vs_group_di__min',
    'di_vs_group_di__max',
    'di_vs_group_di__median',
    'di_neighbour_group__mean',
    'di_neighbour_group__min',
    'di_neighbour_group__max',
    'di_neighbour_group__median',
    'di_neighbour_group__std',
)
for feature_name in rolling_stat_inputs:
    calculate_to_feature_df_last180_days_same_cve(features, feature_name)


Computing for DROUGHT_INDEX
2.594438500003889
Computing for di_vs_group_di__mean
2.6906204998958856
Computing for di_vs_group_di__min
2.960608399938792
Computing for di_vs_group_di__max
3.060362200019881
Computing for di_vs_group_di__median
3.079937699949369
Computing for di_neighbour_group__mean
3.085919899865985
Computing for di_neighbour_group__min
3.059578299988061
Computing for di_neighbour_group__max
3.175798599841073
Computing for di_neighbour_group__median
3.1241186999250203
Computing for di_neighbour_group__std


  df[f'{feature}_last180_days_same_cve__max'] = rolling_df_180D_by_cve[feature].max()


3.140877300174907


  df[f'{feature}_last180_days_same_cve__min'] = rolling_df_180D_by_cve[feature].min()
  df[f'{feature}_last180_days_same_cve__range'] = df[f'{feature}_last180_days_same_cve__max'] - df[f'{feature}_last180_days_same_cve__min']
  df[f'{feature}_last180_days_same_cve__central_diff'] = df[f'{feature}_last180_days_same_cve__mean'] - df[f'{feature}_last180_days_same_cve__median']
  df[f'{feature}_last180_days_same_cve__central_diff_range_proportion'] = features[f'{feature}_last180_days_same_cve__central_diff']/features[f'{feature}_last180_days_same_cve__range']


In [18]:
# Turn the index levels back into columns, build the <date>__<CVE> row key
# from them, then drop the two source columns.
features_flat = features.reset_index()
features_to_save = create_index_for_row_data(features_flat, 'DATE').drop(['CVE_CONCATENADA', 'DATE'], axis=1)

# Save features

In [23]:
# Write the feature table (`features_to_save`) to the interim data folder as parquet.
features_to_save.to_parquet('../data/interim/drought_data_features2024.parquet')

In [26]:
# Write the look-ahead target table (`targets`) to the interim data folder as parquet.
targets.to_parquet('../data/interim/drought_data_targets2024.parquet')

In [43]:
# Write the CVE_CONCATENADA, DATE and DROUGHT_INDEX columns of `data` to the
# interim data folder as parquet.
data[['CVE_CONCATENADA', 'DATE', 'DROUGHT_INDEX']].to_parquet('../data/interim/drought_data_info2024.parquet')