Information was obtained from https://www.inegi.org.mx/app/biblioteca/ficha.html?upc=889463807469

In [1]:
import geopandas as gpd
import os
import networkx as nx
import pandas as pd

from src.data.utils import create_index_for_row_data

In [2]:
data = gpd.read_file('../data/raw/00mun.shp')

In [3]:
def get_intersections_as_list(geom, df):
    """Assumes that df has a geometry column that may or not intersect"""
    CVE_CONCATENADA_list = df[geom.intersects(df.geometry)].CVE_CONCATENADA.to_list()
    return CVE_CONCATENADA_list

def prepare_data_for_graph(df):
    df['CVE_CONCATENADA'] = df.CVEGEO.astype(int)
    df['intersection_list'] = df.geometry.apply(get_intersections_as_list, df=df)
    data_for_graph = df[['CVE_CONCATENADA', 'intersection_list']].copy()
    data_for_graph = data_for_graph.explode('intersection_list')
    return data_for_graph, df

def create_network(df, from_val='CVE_CONCATENADA', to_val='intersection_list'):
    G = nx.from_pandas_edgelist(df, from_val, to_val)
    return G

def get_neighbours_from_graph(data, graph, *neighbours_distance):
    neighbour_columns = ['CVE_CONCATENADA']
    for n in neighbours_distance:
        n_col = f'neighbours__{n}'
        calc_neighbours_func = lambda x: list(nx.descendants_at_distance(graph, x, n))
        data[n_col] = data.CVE_CONCATENADA.apply(calc_neighbours_func)
        neighbour_columns.append(n_col)
    graph_data = data[neighbour_columns].set_index('CVE_CONCATENADA')
    return graph_data

def add_graph_features(data, G):
    data['triangles'] = pd.Series(nx.triangles(G))
    data['clustering'] = pd.Series(nx.clustering(G))
    data['square_clustering'] = pd.Series(nx.square_clustering(G))
    data['degree_centrality'] = pd.Series(nx.degree_centrality(G))
    data['eigenvector_centrality'] = pd.Series(nx.eigenvector_centrality(G))
    data['katz_centrality'] = pd.Series(nx.katz_centrality(G))
    data['closeness_centrality'] = pd.Series(nx.closeness_centrality(G))
    data['information_centrality'] = pd.Series(nx.information_centrality(G))
    data['betweenness_centrality'] = pd.Series(nx.betweenness_centrality(G))
    data['node_clique_number'] = pd.Series(nx.node_clique_number(G))
    data['voterank'] = pd.Series(nx.voterank(G)).reset_index().set_index(0)['index']
    return data

In [4]:
    data_for_graph, data = prepare_data_for_graph(data)
    municipal_network = create_network(data_for_graph, from_val='CVE_CONCATENADA', to_val='intersection_list')
    graph_data = get_neighbours_from_graph(data, municipal_network, 1,2,3)
    graph_data = add_graph_features(graph_data, municipal_network)
    neighbour_data = graph_data.copy()

In [17]:
#features1 = pd.read_csv('../data/interim/drought_data_features.csv', index_col=0)

features2 = pd.read_csv('../data/interim/meteorological_data_features.csv', index_col=0)

In [18]:
features2

Unnamed: 0_level_0,apparent_temperature_max__last120_days_mean,apparent_temperature_mean__last120_days_mean,apparent_temperature_min__last120_days_mean,day_duration__last120_days_mean,precipitation_hours__last120_days_mean,precipitation_sum__last120_days_mean,rain_sum__last120_days_mean,shortwave_radiation_sum__last120_days_mean,temperature_2m_max__last120_days_mean,temperature_2m_mean__last120_days_mean,...,precipitation_sum__last120_days_range,rain_sum__last120_days_range,shortwave_radiation_sum__last120_days_range,temperature_2m_max__last120_days_range,temperature_2m_mean__last120_days_range,temperature_2m_min__last120_days_range,temperature_max_apparent_range__last120_days_range,temperature_mean_apparent_range__last120_days_range,temperature_min_apparent_range__last120_days_range,temperature_range__last120_days_range
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-01-28__1001,,,,,,,,,,,...,,,,,,,,,,
2003-02-28__1001,,,,,,,,,,,...,,,,,,,,,,
2003-03-28__1001,,,,,,,,,,,...,,,,,,,,,,
2003-04-28__1001,,,,,,,,,,,...,,,,,,,,,,
2003-05-28__1001,25.151667,17.163333,9.669167,44313.5,0.308333,0.218333,0.218333,25.581333,28.180833,20.192500,...,8.0,8.0,20.18,15.8,15.6,15.1,8.2,7.0,6.9,12.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-15__32058,20.593333,12.561667,5.319167,40331.0,0.508333,0.165833,0.165833,19.997917,22.914167,15.026667,...,7.0,7.0,15.13,12.1,8.5,8.0,5.2,3.4,3.1,11.6
2023-03-28__32058,20.913333,12.617500,5.126667,40759.5,0.083333,0.021667,0.021667,21.011917,23.364167,15.270000,...,2.1,2.1,15.51,12.1,8.5,7.7,4.0,2.8,2.6,11.0
2023-04-28__32058,22.441667,13.785000,5.947500,42356.5,0.116667,0.028333,0.028333,23.516333,24.883333,16.567500,...,2.1,2.1,16.45,13.6,10.3,9.1,5.3,2.6,2.8,10.2
2023-05-15__32058,23.633333,14.794167,6.805833,43444.5,0.116667,0.040000,0.040000,25.094250,26.040000,17.608333,...,2.1,2.1,12.28,11.5,10.1,10.0,4.9,2.6,2.8,8.1


In [6]:
#features = pd.concat([features1, features2], axis=1)
features = pd.concat([features1], axis=1)

In [7]:
drought_data = pd.read_csv('../data/interim/drought_data.csv', index_col=0)
#drought_data.NEW_DATE = pd.to_datetime(drought_data.NEW_DATE)

In [8]:
features['CVE_CONCATENADA'] = drought_data['CVE_CONCATENADA']
features['NEW_DATE'] = pd.to_datetime(drought_data['NEW_DATE'])

In [9]:
def function_for_neighbour_features(row, data):
    grouped_dfs = []
    statistic_value_list = ['mean', 'std', 'min', 'max', 'median']
    for neighbour_num in row.index:
        if not 'neighbour' in neighbour_num:
            continue
        df_grouped = data[
            data.CVE_CONCATENADA.isin(
                row[neighbour_num]
            )
        ].drop('CVE_CONCATENADA',axis=1).groupby('NEW_DATE')
        all_statistics_list = []
        for statistic in statistic_value_list:
            df_grouped_statistic = df_grouped.agg(statistic)
            df_grouped_statistic.columns = [f'{col}__{neighbour_num}_{statistic}' for col in df_grouped_statistic.columns]
            all_statistics_list.append(df_grouped_statistic)
        all_statistics_df = pd.concat(all_statistics_list, axis=1)
        all_statistics_df['CVE_CONCATENADA'] = row.name
        all_statistics_df.reset_index(inplace=True)
        grouped_dfs.append(all_statistics_df) 
    final_neighbour_features_result = pd.concat(grouped_dfs, axis=1)
    final_neighbour_features_result = final_neighbour_features_result.loc[:,~final_neighbour_features_result.columns.duplicated()].copy()
    return final_neighbour_features_result

neighbour_features_series = neighbour_data.apply(function_for_neighbour_features, data=features, axis=1)

In [10]:
neighbour_features = pd.concat(neighbour_features_series.to_list())
neighbour_features.NEW_DATE = pd.to_datetime(neighbour_features.NEW_DATE)
neighbour_features = create_index_for_row_data(neighbour_features, 'NEW_DATE')
neighbour_features.drop(['NEW_DATE', 'CVE_CONCATENADA'], axis=1, inplace=True)

In [11]:
def complement_neighbour_features(data, alpha=1, beta=2):
    df = data.copy()
    statistic_value_list = ['mean', 'std', 'min', 'max', 'median']
    possible_candidates = list(set([col.rsplit('__',1)[0] for col in df.columns.to_list()]))
    new_feature_list = []
    for val in possible_candidates:
        for stat in statistic_value_list:
            new_feature = f'{val}__{beta}vs{alpha}_{stat}'
            alpha_feature = f'{val}__{beta}_{stat}'
            beta_feature = f'{val}__{alpha}_{stat}'
            df[new_feature] = df[beta_feature]-df[alpha_feature]
            new_feature_list.append(new_feature)
    return df[new_feature_list]

neighbour_features_12 = complement_neighbour_features(neighbour_features, 1, 2)
neighbour_features_13 = complement_neighbour_features(neighbour_features, 1, 3)
neighbour_features_23 = complement_neighbour_features(neighbour_features, 2, 3)

  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_feature] = df[beta_feature]-df[alpha_feature]
  df[new_f

In [12]:
all_neighbour_features = pd.concat(
    [neighbour_features,
     neighbour_features_12,
     neighbour_features_13,
     neighbour_features_23
    ], axis=1)


In [16]:
all_neighbour_features.to_csv('../data/interim/drought_data_neighbour_features.csv')