In [1]:
import datetime
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ast import literal_eval

In [2]:
# Location of the drought feature table produced earlier in the pipeline.
PATH_FEATURES = '../data/interim/drought_data_features.csv'

# Raw meteorological table; later cells read its CVEGEO and
# meteorological_information columns. The first CSV column becomes the index.
data = pd.read_csv('../data/raw/meteorological_information.csv', index_col=0)

# Drought feature table; its index is reused below to decide which
# meteorological rows are kept.
drought_data_features = pd.read_csv(PATH_FEATURES, index_col=0)

In [3]:
def create_index_for_row_data(df, date_col):
    df['index'] = df[date_col].dt.date.astype('str') + '__' + df.CVE_CONCATENADA.astype('str')
    df.set_index('index', inplace=True)
    return df

In [5]:
# Row keys of the drought feature table. exploit_municipal_data() keeps only
# the rolling-feature rows whose index value appears in this collection.
check_index = drought_data_features.index

In [8]:
def exploit_municipal_data(municipal_data, days=120, valid_index=None):
    """Build trailing-window weather statistics for one municipality.

    The function expands the serialized ``'daily'`` weather record stored in
    ``municipal_data``, derives a few extra daily variables, and summarises
    every numeric variable over a trailing time window.

    Parameters
    ----------
    municipal_data : pandas.DataFrame
        Rows for a single municipality. Only the first row is read: its
        ``CVEGEO`` value and its ``meteorological_information`` value, which
        must be the text of a Python literal containing a ``'daily'`` entry.
    days : int, default 120
        Length of the trailing window in days; also used as ``min_periods``
        and embedded in the output column names.
    valid_index : index-like, optional
        Row keys (``'<date>__<CVE>'``) to keep in the result. When omitted,
        the notebook-level ``check_index`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``'<DATE>__<CVE_CONCATENADA>'`` key and one
        column per (variable, statistic) pair, named
        ``'<variable>__last<days>_days_<statistic>'``. Every per-statistic
        frame also contributes an un-renamed ``NEW_DATE`` column, so the
        result contains several columns with that name.
    """
    if valid_index is None:
        # Default to the keys of the drought feature table loaded by the notebook.
        valid_index = check_index

    CVE_CONCATENADA = municipal_data.CVEGEO.values[0]
    # literal_eval only parses Python literals, so the stored text is not executed.
    raw_daily = literal_eval(municipal_data.meteorological_information.values[0])['daily']
    daily_information = pd.DataFrame(raw_daily)
    daily_information['time'] = pd.to_datetime(daily_information['time'])
    # Strict comparison: 2003-01-01 itself is excluded.
    # .copy() gives an independent frame, so the column assignments below do
    # not write to a filtered slice (the SettingWithCopyWarning pattern).
    daily_information = daily_information[
        daily_information.time > pd.to_datetime('2003-01-01')].copy()

    daily_information['sunrise'] = pd.to_datetime(daily_information['sunrise'])
    daily_information['sunset'] = pd.to_datetime(daily_information['sunset'])
    # .dt.seconds is the seconds component of the timedelta (0..86399).
    daily_information['day_duration'] = (daily_information['sunset']-daily_information['sunrise']).dt.seconds
    daily_information['temperature_range'] = daily_information['temperature_2m_max'] - daily_information['temperature_2m_min']
    daily_information['temperature_max_apparent_range'] = daily_information['temperature_2m_max'] - daily_information['apparent_temperature_max']
    daily_information['temperature_min_apparent_range'] = daily_information['temperature_2m_min'] - daily_information['apparent_temperature_min']
    daily_information['temperature_mean_apparent_range'] = daily_information['temperature_2m_mean'] - daily_information['apparent_temperature_mean']
    daily_information['CVE_CONCATENADA'] = CVE_CONCATENADA
    daily_information.rename(columns={'time':'DATE'}, inplace=True)
    daily_information['NEW_DATE'] = daily_information['DATE'] + datetime.timedelta(days=10)
    daily_information.drop(['sunrise', 'sunset'], axis=1, inplace=True)
    # NOTE(review): the row key is built from DATE while the window below runs
    # on NEW_DATE (= DATE + 10 days). Because every row is shifted by the same
    # amount, the row keyed by DATE d summarises DATE in [d - days, d). If the
    # shift was meant to lag the features relative to the key, the key would
    # have to be built from NEW_DATE instead -- confirm the intent.
    daily_information = create_index_for_row_data(daily_information, 'DATE')

    ## Calculate rolling windows
    drop_cols = ['weathercode', 'CVE_CONCATENADA', 'DATE']
    # closed='left' leaves the current row out of its own window;
    # min_periods=days blanks rows that do not yet have a full window.
    daily_information_rolling = daily_information.drop(drop_cols, axis=1).rolling(
        f'{days}D', on='NEW_DATE', min_periods=days, closed='left')
    daily_information_rolling_info = {
        'mean': daily_information_rolling.mean(),
        'std': daily_information_rolling.std(),
        'max': daily_information_rolling.max(),
        'min': daily_information_rolling.min(),
        'median': daily_information_rolling.median(),
        'skew': daily_information_rolling.skew(),
        'kurt': daily_information_rolling.kurt(),
    }
    # Derived statistics are computed before renaming, while column labels still align.
    daily_information_rolling_info['mean_vs_median'] = daily_information_rolling_info['mean'] - daily_information_rolling_info['median']
    daily_information_rolling_info['range'] = daily_information_rolling_info['max'] - daily_information_rolling_info['min']

    daily_information_rolling_dfs = []
    # The loop variable is deliberately not called `data`, which is the name
    # of the notebook-level raw table.
    for operation, stat_frame in daily_information_rolling_info.items():
        rename_col_dict = {
            col:f'{col}__last{days}_days_{operation}'
            for col in stat_frame.columns if col != 'NEW_DATE'}
        # Non-mutating rename: the frames held in the dict stay untouched.
        daily_information_rolling_dfs.append(stat_frame.rename(columns=rename_col_dict))
    concatenated_daily_information_rolling_df = pd.concat(daily_information_rolling_dfs, axis=1)
    keep_rows = concatenated_daily_information_rolling_df.index.isin(valid_index)
    return concatenated_daily_information_rolling_df[keep_rows]

In [9]:
# Build the rolling weather features municipality by municipality.
# A single groupby pass hands over each municipality's rows directly, instead
# of re-scanning the whole table with a boolean mask for every CVEGEO value.
# sort=False keeps municipalities in order of first appearance; rows with a
# missing CVEGEO are skipped (groupby default).
municipal_data_list = []
for i, (mun, municipal_data) in enumerate(data.groupby('CVEGEO', sort=False)):
    exploited_municipal_data = exploit_municipal_data(municipal_data)
    municipal_data_list.append(exploited_municipal_data)
    # Lightweight progress indicator: one line every 100 municipalities.
    if i % 100 == 0:
        print(i)
# Stack all per-municipality frames once, after the loop.
municipal_data_meteorological_features = pd.concat(municipal_data_list)


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400


In [13]:
# Keep only the first occurrence of every column label (the concatenated
# statistic frames repeat the NEW_DATE column), and materialise an
# independent copy of the result.
duplicated_column_mask = municipal_data_meteorological_features.columns.duplicated()
municipal_data_meteorological_features = (
    municipal_data_meteorological_features
    .loc[:, ~duplicated_column_mask]
    .copy()
)

In [16]:
# Persist the features without the helper NEW_DATE column; the string row key
# is written as the CSV index.
meteorological_features_to_save = municipal_data_meteorological_features.drop(columns='NEW_DATE')
meteorological_features_to_save.to_csv('../data/interim/meteorological_data_features.csv')