In [2]:
import pandas as pd
import geopandas as gpd

## Load Processed Era5 Data

In [6]:
mun_gdf = gpd.read_file(r'../../../countries/portugal/datasets/municipality_data/municipalities-shapefile-2/concelhos.shp')
erA5_std_mun_df = pd.read_excel(r'../../../countries/portugal/datasets/era5/excel/quarterly/weighted_std_mun.xlsx', index_col=0)
erA5_hist_mun_df = pd.read_excel(r'../../../countries/portugal/datasets/era5/excel/quarterly/weighted_hist_dev_mun.xlsx', index_col=0)

### Obtain Missing Municipalities

In [7]:
mun_metadata  = pd.read_excel(r'../../../countries/portugal/datasets/municipality_data/concelhos-metadata.xlsx', dtype={'dicofre' : 'string'})
mun_metadata = mun_metadata[['dicofre','designacao']]
mun_gdf = mun_gdf.merge(mun_metadata, left_on='CCA_2', right_on='dicofre').drop(columns='NAME_2')
mun_gdf = mun_gdf.rename(columns={'designacao': 'NAME_2'})

### Obtaining the districts and Municipalities Data

In [8]:
mainland = mun_gdf[~mun_gdf['NAME_1'].isin(['Azores', 'Madeira'])]
# Grouping and dissolving the municipalities by district
districts_gdf = mainland.dissolve(by='NAME_1', as_index=False)
districts_gdf = districts_gdf[['NAME_1', 'NAME_2', 'HASC_2','CCA_2', 'geometry']]
municipalities_mainland = mainland[['NAME_2', 'HASC_2','CCA_2', 'geometry']]
districts_gdf.rename(columns={'HASC_2' : 'district_id'}, inplace=True)
municipalities_mainland = municipalities_mainland.rename(columns={'HASC_2' : 'district_id'})

### Merge Municipalities

In [9]:
erA5_hist_mun_df = erA5_hist_mun_df.merge(municipalities_mainland, on='district_id')
erA5_std_mun_df = erA5_std_mun_df.merge(municipalities_mainland, on='district_id')

### Format Time

In [10]:
erA5_hist_mun_df.time = pd.to_datetime(erA5_hist_mun_df.time).dt.to_period("Q")
erA5_std_mun_df.time = pd.to_datetime(erA5_std_mun_df.time).dt.to_period("Q")

### Obtain Lags

In [None]:
erA5_hist_mun_df.columns

Index(['district_id', 'time', 't2m_wtd', 'msl_wtd', 'stl2_wtd', 'sp_wtd',
       'tp_wtd', 'tp_1000_wtd', 'NAME_2', 'CCA_2', 'geometry'],
      dtype='object')

In [11]:
for var in ['t2m_wtd', 'msl_wtd', 'stl2_wtd', 'sp_wtd', 'tp_wtd', 'tp_1000_wtd']:
    for i in range(1, 5):
        shift_var_name = f'{var}_L{i}_hd'
        erA5_hist_mun_df[shift_var_name] = erA5_hist_mun_df.groupby('district_id')[var].shift(i)


In [12]:
for var in ['t2m_wtd', 'msl_wtd', 'stl2_wtd', 'sp_wtd', 'tp_wtd', 'tp_1000_wtd']:
    for i in range(1, 5):
        shift_var_name = f'{var}_L{i}_std'
        erA5_std_mun_df[shift_var_name] = erA5_std_mun_df.groupby('district_id')[var].shift(i)

## Factor Analysis

In [13]:
df_eu_era5_tp = pd.read_excel(r'../../../countries/medit_data/era5-x0.25_timeseries_pr_timeseries_monthly_1950-2022_mean_historical_era5_x0.25_mean.xlsx')
df_eu_era5_t2m = pd.read_excel(r'../../../countries/medit_data/era5-x0.25_timeseries_tas_timeseries_monthly_1950-2022_mean_historical_era5_x0.25_mean.xlsx')

df_eu_era5_tp = df_eu_era5_tp.melt(id_vars=['time'], var_name='country', value_name='tp')
df_eu_era5_t2m = df_eu_era5_t2m.melt(id_vars=['time'], var_name='country', value_name='t2m')

df_eu_era5 = df_eu_era5_tp.merge(df_eu_era5_t2m, on = ['time', 'country'])

#### HD

In [17]:
# Ensure 'time' is in datetime format
df_eu_era5['time'] = pd.to_datetime(df_eu_era5['time'], format='%Y-%m')

# Filter the data for the date range between 1950-01 and 1980-12
historical_eu_era5 = df_eu_era5[(df_eu_era5['time'] >= '1950-01-01') & (df_eu_era5['time'] <= '1980-12-31')]

# Group by 'country' and calculate the average 't2m' for each country
historical_average_eu_era5_t2m = historical_eu_era5.groupby('country')['t2m'].mean().to_dict()
historical_average_eu_era5_tp = historical_eu_era5.groupby('country')['tp'].mean().to_dict()


In [18]:
df_eu_era5_reduced = df_eu_era5[(df_eu_era5['time'] >= '2004-01-01') & (df_eu_era5['time'] <= '2023-12-31')]

In [19]:
# Ensure df_eu_era5_reduced is explicitly a copy if it is a slice of another DataFrame
df_eu_era5_reduced = df_eu_era5_reduced.copy()

# Subtract the historical average for the 'tp' column using .loc
df_eu_era5_reduced['tp_hd'] = df_eu_era5_reduced.apply(
    lambda row: row['tp'] - historical_average_eu_era5_tp[row['country']], axis=1
)

# Subtract the historical average for the 't2m' column using .loc
df_eu_era5_reduced['t2m_hd'] = df_eu_era5_reduced.apply(
    lambda row: row['t2m'] - historical_average_eu_era5_t2m[row['country']], axis=1
)


#### Resample Time

In [20]:
df_eu_era5_reduced.head()


Unnamed: 0,time,country,tp,t2m,tp_hd,t2m_hd
648,2004-01-01,ALB,198.23,1.59,84.300618,-10.255457
649,2004-02-01,ALB,173.52,3.89,59.590618,-7.955457
650,2004-03-01,ALB,159.36,6.76,45.430618,-5.085457
651,2004-04-01,ALB,130.73,11.62,16.800618,-0.225457
652,2004-05-01,ALB,147.94,13.2,34.010618,1.354543


In [21]:
df_eu_era5_reduced['time'] = pd.to_datetime(df_eu_era5_reduced['time']).copy()


# Group by 'country', then resample 'time' within each group to quarterly frequency
df_quarterly_era5_eu = df_eu_era5_reduced.groupby('country').resample('Q', on='time').mean(numeric_only=True)

# Reset the index to bring 'time' and 'country' back as columns
df_quarterly_era5_eu.reset_index(inplace=True)

# If needed, reset the index
df_quarterly_era5_eu.reset_index(inplace=True)

In [22]:
df_quarterly_era5_eu.time = pd.to_datetime(df_quarterly_era5_eu.time).dt.to_period("Q")
df_quarterly_era5_eu.head()

Unnamed: 0,index,country,time,tp,t2m,tp_hd,t2m_hd
0,0,ALB,2004Q1,177.036667,4.08,63.107285,-7.765457
1,1,ALB,2004Q2,122.886667,14.626667,8.957285,2.78121
2,2,ALB,2004Q3,67.996667,19.99,-45.932715,8.144543
3,3,ALB,2004Q4,135.96,9.416667,22.030618,-2.42879
4,4,ALB,2005Q1,158.256667,3.1,44.327285,-8.745457


### Drop Columns

In [23]:
columns_to_drop = erA5_std_mun_df.filter(regex=r'^(stl2_|msl_|sp_)').columns
erA5_std_mun_df = erA5_std_mun_df.drop(columns=columns_to_drop)

In [24]:
columns_to_drop = erA5_hist_mun_df.filter(regex=r'^(stl2_|msl_|sp_)').columns
erA5_hist_mun_df = erA5_hist_mun_df.drop(columns=columns_to_drop)

### Perform Factor Analysis

#### Perform factor analysis for the `hd` variables

In [25]:
import pandas as pd
from sklearn.decomposition import FactorAnalysis


# Step 1: Ensure 'time' is in datetime format if not already

# Step 2: Group by 'time', calculating the mean of 'tp_hd' and 't2m_hd' across all countries for each time period
agg_data = df_quarterly_era5_eu.groupby('time')[['tp_hd', 't2m_hd']].mean()

# Step 3: Apply Factor Analysis to the aggregated data (one factor per time period)
fa_hd = FactorAnalysis(n_components=2, random_state=0)

# Fit the model to the aggregated data
fa_hd.fit(agg_data)

# Transform the data into two factors
factors_hd = fa_hd.transform(agg_data)

# Step 4: Create a DataFrame for the factors and align with the time index
factors_df = pd.DataFrame(factors_hd, index=agg_data.index, columns=['medit_factor_tp', 'medit_factor_t2m'])

# # Step 5: Merge the factors back into the original dataframe by 'time'
df_quarterly_era5_eu = df_quarterly_era5_eu.merge(factors_df, on='time', how='left')


In [26]:
for var in ['medit_factor_tp', 'medit_factor_t2m']:
    for i in range(1, 5):
        shift_var_name = f'{var}_L{i}'
        df_quarterly_era5_eu[shift_var_name] = df_quarterly_era5_eu.groupby('country')[var].shift(i)
        df_quarterly_era5_eu[shift_var_name] = df_quarterly_era5_eu.groupby('country')[var].shift(i)


### Normalise District And Concelhos Names

In [27]:
import unicodedata

# Function to normalize the names
def normalize_municipality_name(name):
    # Normalize the string (remove diacritical marks)
    name_without_diacritics = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    # Convert to lowercase
    return name_without_diacritics.lower()

# Apply the normalization function to the municipality column
#erA5_hist_dist_df.loc[:, 'NAME_2'] = erA5_hist_dist_df['NAME_2'].apply(normalize_municipality_name)
#erA5_std_dist_df.loc[:, 'NAME_2'] = erA5_std_dist_df['NAME_2'].apply(normalize_municipality_name)
erA5_hist_mun_df.loc[:, 'NAME_2'] = erA5_hist_mun_df['NAME_2'].apply(normalize_municipality_name)
erA5_std_mun_df.loc[:, 'NAME_2'] = erA5_std_mun_df['NAME_2'].apply(normalize_municipality_name)
# Now the 'municipality' column will have the normalized names


## Create Final Climate DF

In [28]:
erA5_hist_mun_df = erA5_hist_mun_df.rename(columns={'t2m_wtd' : 't2m_wtd_hd', 'msl_wtd' :'msl_wtd_hd', 'stl2_wtd' : 'stl2_wtd_hd',
       'sp_wtd' : 'sp_wtd_hd', 'tp_wtd' : 'tp_wtd_hd'})
erA5_hist_mun_df = erA5_hist_mun_df.rename(columns={'tp_1000_wtd': 'tp_1000_wtd_hd'})
erA5_std_mun_df = erA5_std_mun_df.rename(columns={'t2m_wtd' : 't2m_wtd_std', 'msl_wtd' :'msl_wtd_std', 'stl2_wtd' : 'stl2_wtd_std',
       'sp_wtd' : 'sp_wtd_std', 'tp_wtd' : 'tp_wtd_std'})
erA5_std_mun_df = erA5_std_mun_df.rename(columns={'tp_1000_wtd': 'tp_1000_wtd_std'})


### Merge

In [29]:
erA5_std_mun_df.columns

Index(['district_id', 'time', 't2m_wtd_std', 'tp_wtd_std', 'tp_1000_wtd_std',
       'NAME_2', 'CCA_2', 'geometry', 't2m_wtd_L1_std', 't2m_wtd_L2_std',
       't2m_wtd_L3_std', 't2m_wtd_L4_std', 'tp_wtd_L1_std', 'tp_wtd_L2_std',
       'tp_wtd_L3_std', 'tp_wtd_L4_std', 'tp_1000_wtd_L1_std',
       'tp_1000_wtd_L2_std', 'tp_1000_wtd_L3_std', 'tp_1000_wtd_L4_std'],
      dtype='object')

In [30]:
final_era5_df = erA5_hist_mun_df.merge(erA5_std_mun_df[['time', 't2m_wtd_std', 'tp_1000_wtd_std',
       'tp_wtd_std', 'NAME_2', 't2m_wtd_L1_std', 't2m_wtd_L2_std', 't2m_wtd_L3_std',
       't2m_wtd_L4_std', 'tp_wtd_L1_std', 'tp_wtd_L2_std', 'tp_wtd_L3_std', 'tp_wtd_L4_std', 'tp_1000_wtd_L1_std',
       'tp_1000_wtd_L2_std', 'tp_1000_wtd_L3_std', 'tp_1000_wtd_L4_std']], on = ['time', 'NAME_2'])

In [33]:
df_quarterly_era5_eu = df_quarterly_era5_eu.drop(columns=['country']).drop_duplicates()


In [34]:
final_era5_df = pd.merge(final_era5_df, df_quarterly_era5_eu, how='left', on='time')

In [35]:
final_era5_df.head()

Unnamed: 0,district_id,time,t2m_wtd_hd,tp_wtd_hd,tp_1000_wtd_hd,NAME_2,CCA_2,geometry,t2m_wtd_L1_hd,t2m_wtd_L2_hd,...,medit_factor_tp,medit_factor_t2m,medit_factor_tp_L1,medit_factor_tp_L2,medit_factor_tp_L3,medit_factor_tp_L4,medit_factor_t2m_L1,medit_factor_t2m_L2,medit_factor_t2m_L3,medit_factor_t2m_L4
0,PT.AV.AG,1990Q1,-2.837953,-0.001022,-1.02155,agueda,101,"POLYGON ((-8.37635 40.69291, -8.37624 40.69271...",,,...,,,,,,,,,,
1,PT.AV.AG,1990Q2,2.114756,-0.002236,-2.23593,agueda,101,"POLYGON ((-8.37635 40.69291, -8.37624 40.69271...",-2.837953,,...,,,,,,,,,,
2,PT.AV.AG,1990Q3,7.842431,-0.00248,-2.479658,agueda,101,"POLYGON ((-8.37635 40.69291, -8.37624 40.69271...",2.114756,-2.837953,...,,,,,,,,,,
3,PT.AV.AG,1990Q4,-2.779849,0.001644,1.644437,agueda,101,"POLYGON ((-8.37635 40.69291, -8.37624 40.69271...",7.842431,2.114756,...,,,,,,,,,,
4,PT.AV.AG,1991Q1,-4.932674,0.002001,2.001435,agueda,101,"POLYGON ((-8.37635 40.69291, -8.37624 40.69271...",-2.779849,7.842431,...,,,,,,,,,,


## New Variables for Model

In [36]:
import numpy as np
final_era5_df['tp_2_wtd_hd'] = final_era5_df.groupby('district_id')['tp_1000_wtd_hd'].transform(lambda x: x**2)
final_era5_df['tp_3_wtd_hd'] = final_era5_df.groupby('district_id')['tp_1000_wtd_hd'].transform(lambda x: x**3)
final_era5_df['t2m_2_wtd_hd'] = final_era5_df.groupby('district_id')['t2m_wtd_hd'].transform(lambda x: x**2)
final_era5_df['t2m_3_wtd_hd'] = final_era5_df.groupby('district_id')['t2m_wtd_hd'].transform(lambda x: x**3)



In [37]:
for var in ['tp_2_wtd_hd', 'tp_3_wtd_hd', 't2m_2_wtd_hd', 't2m_3_wtd_hd']:
    for i in range(1, 5):
        shift_var_name = f'{var}_L{i}'
        final_era5_df[shift_var_name] = final_era5_df.groupby('district_id')[var].shift(i)

In [38]:
final_era5_df = final_era5_df.drop_duplicates(subset=['NAME_2', 'time'])

## Save Data

In [42]:
final_era5_df.to_excel(r'../../../countries/portugal/datasets/era5/excel/quarterly/complete_data_mun.xlsx')