# Reduce features via an autoencoder, apply Isolation Forest as an anomaly detection model, and calculate extra stats for the resulting data

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import precision_recall_curve, auc
import json

In [None]:
# Load the modelling dataset. Later cells read its 'resnet50_emb', 'Shape_Length',
# 'Shape_Area', 'type', 'geometry', 'is_shelter' and 'year' columns.
shelter_data = gpd.read_file('../data/lt_ee_data_for_model.geojson')

In [None]:
# Keep only the embedding column; each value is a comma-separated string.
emb_gdf = shelter_data[['resnet50_emb']]
# Turn every string into a list of its comma-separated tokens (still text here).
expanded_df = emb_gdf['resnet50_emb'].map(lambda emb: emb.split(','))

In [None]:
# Spread each row's token list over its own columns (one column per embedding
# dimension). Building the frame directly from the list of lists avoids creating
# one Series object per row, which is what `.apply(pd.Series)` does and which is
# very slow on large inputs. The original row index is kept for later alignment.
expanded_df = pd.DataFrame(expanded_df.tolist(), index=expanded_df.index)

In [None]:
# The embedding tokens are still text at this point; convert them to 64-bit floats.
expanded_df = expanded_df.astype(np.float64)

In [None]:
# Name the embedding columns resnet50_emb_0, resnet50_emb_1, ... in positional order.
expanded_df.columns = [
    f"resnet50_emb_{dim}" for dim in range(len(expanded_df.columns))
]

### Structured data can be expanded with other tabular parameters

In [None]:
# Tabular (non-embedding) attributes used as model features, plus the geometry
# that is re-attached to the combined feature table further down.
structured_df = shelter_data[
    ['Shape_Length', 'Shape_Area', 'type', 'geometry']
]

In [None]:
# Column groups, one per preprocessing path:
#   categorical -> one-hot encoded, numeric -> standard-scaled, embeddings -> used as-is.
categorical_columns = ['type']
numeric_columns = ['Shape_Length', 'Shape_Area']
# Labels of the expanded embedding columns (not referenced again in the visible cells).
embedding_columns = expanded_df.columns

In [None]:
# One-hot encode the categorical columns. The encoded frame reuses the index of
# structured_df so that the column-wise concat below lines rows up by label
# instead of relying on every frame happening to carry a default RangeIndex.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(structured_df[categorical_columns])
encoded_cat_df = gpd.GeoDataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=structured_df.index,
)

# Standard-scale the numeric columns, again keeping the source index.
scaler = StandardScaler()
scaled_params = scaler.fit_transform(structured_df[numeric_columns])
numeric_df = gpd.GeoDataFrame(
    scaled_params,
    columns=numeric_columns,
    index=structured_df.index,
)

# Feature table: scaled numerics + one-hot categories + embedding columns,
# with the original geometry attached (aligned on the shared index).
combined_df = gpd.GeoDataFrame(
    pd.concat([numeric_df, encoded_cat_df, expanded_df], axis=1),
    geometry=structured_df.geometry,
)

In [None]:
# Feature matrix: every column of combined_df except the geometry.
X = combined_df.drop(columns='geometry').to_numpy()
# Labels: rows with is_shelter == 1 are the targets the models are fitted on below.
y = shelter_data['is_shelter'].to_numpy()

In [None]:
# All rows as a float32 tensor; this is what gets encoded and scored later.
X_tensor = torch.tensor(X, dtype=torch.float32)

# Only the rows labelled 1, used to train the autoencoder.
X_targets = X[y == 1]
X_targets_tensor = torch.tensor(X_targets, dtype=torch.float32)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 128-dimensional bottleneck.

    The encoder maps ``input_dim -> 512 -> 256 -> 128`` with a ReLU after every
    linear layer and a ``Dropout(0.2)`` after the first one. The decoder mirrors
    it (``128 -> 256 -> 512 -> input_dim``) and has no activation on its output.

    Args:
        input_dim: Number of features per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, 256), nn.ReLU(),
            nn.Linear(256, 128), nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(128, 256), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

def train_autoencoder(model, data, epochs=60, batch_size=32, lr=0.0008):
    """Train ``model`` to reconstruct ``data`` and return the same model object.

    Uses mean squared error between the model output and its input, optimised
    with Adam. Batches are reshuffled every epoch, and the mean batch loss is
    printed once per epoch. The model is switched to training mode and is left
    in that mode on return.

    Args:
        model: Module whose output has the same shape as its input.
        data: Tensor of training rows (first dimension indexes samples).
        epochs: Number of passes over ``data``.
        batch_size: Rows per mini-batch.
        lr: Adam learning rate.

    Returns:
        The trained ``model`` (modified in place).
    """
    loss_fn = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=lr)

    # Inputs double as targets, so each dataset item is the pair (row, row).
    pairs = torch.utils.data.TensorDataset(data, data)
    loader = torch.utils.data.DataLoader(pairs, batch_size=batch_size, shuffle=True)
    n_batches = len(loader)

    model.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for inputs, _ in loader:
            adam.zero_grad()
            batch_loss = loss_fn(model(inputs), inputs)
            batch_loss.backward()
            adam.step()
            running_loss += batch_loss.item()
        print(f"Epoch {epoch_no}/{epochs}, Loss: {running_loss/n_batches:.6f}")
    return model

In [None]:
# Seed PyTorch before building and training the model so that weight
# initialisation, dropout masks and batch shuffling are repeatable from run to
# run. The IsolationForest fitted later already uses a fixed random_state=42;
# without this seed the final scores would still change on every execution.
torch.manual_seed(42)

# The autoencoder is trained only on the rows labelled as shelters.
input_dim = X_targets.shape[1]
autoencoder = Autoencoder(input_dim)
train_autoencoder(autoencoder, X_targets_tensor)

In [None]:
# Inference mode: dropout is disabled and no gradients are tracked.
autoencoder.eval()
with torch.no_grad():
    # Encode once and decode those same latent codes. Autoencoder.forward is
    # decoder(encoder(x)), so this gives the same reconstruction as calling the
    # model directly, without running the encoder over the dataset twice.
    X_encoded_tensor = autoencoder.encoder(X_tensor)
    X_reconstructed_tensor = autoencoder.decoder(X_encoded_tensor)

# Per-row mean squared reconstruction error (averaged over the feature axis).
reconstruction_errors = torch.mean((X_tensor - X_reconstructed_tensor) ** 2, dim=1).numpy()
# Bottleneck representation of every row, used as input to the Isolation Forest.
X_encoded = X_encoded_tensor.numpy()

In [None]:
# Fit the forest on the encoded representations of the labelled shelters only.
X_iforest_train = X_encoded[y == 1]
iso_forest = IsolationForest(
    n_estimators=500,
    contamination=0.01,
    random_state=42,
)
iso_forest.fit(X_iforest_train)

In [None]:
# Score every row (not only the shelters the forest was fitted on).
# Per the scikit-learn documentation, lower decision_function values are more anomalous.
iso_scores = iso_forest.decision_function(X_encoded)

In [None]:
# Combined score: forest score minus autoencoder reconstruction error, so a row
# scores higher when the forest rates it higher and it reconstructs with less error.
shelter_similarity_scores = np.subtract(iso_scores, reconstruction_errors)

In [None]:
# Work on an explicit copy: this frame gets new columns assigned below, and
# assigning into a selection taken straight from shelter_data raises pandas'
# SettingWithCopyWarning (ambiguous view-versus-copy semantics).
target_like_df = shelter_data[['geometry', 'is_shelter']].copy()
target_like_df['shelter_like_score'] = shelter_similarity_scores

In [None]:
# Sensitivity knob: the cut-off is a percentile (currently the 75th) of the scores
# of the known shelters (rows with y == 1). Raising the percentile raises the
# cut-off, so fewer rows are flagged; lowering it flags more rows.
# NOTE(review): the earlier comment said a higher percentile is "more sensitive" -
# confirm which direction was meant. The name q2_shelters is kept as-is even
# though the 75th percentile is the third quartile.
q2_shelters = np.percentile(shelter_similarity_scores[y == 1], 75)
threshold = q2_shelters
# 1 where the score reaches the cut-off, 0 otherwise.
target_like_df['could_be_shelter'] = (
    target_like_df['shelter_like_score'].ge(threshold).astype(int)
)

In [None]:
# Copy the score and the candidate flag computed on target_like_df back onto
# shelter_data ('is_shelter' is written back as well, with the values it was read with).
shelter_data[['shelter_like_score', 'is_shelter', 'could_be_shelter']] = target_like_df[['shelter_like_score', 'is_shelter', 'could_be_shelter']]

In [None]:
# Write one row per (geometry, year) pair to GeoJSON, scores and flags included.
(
    shelter_data
    .drop_duplicates(subset=['geometry', 'year'])
    .to_file('../data/possible_shelters.geojson', driver='GeoJSON')
)

In [None]:
# Load the NUTS region polygons (the file name suggests the 2024 edition at
# 1:20M scale in EPSG:4326 - TODO confirm).
nuts = gpd.read_file('../data/NUTS_RG_20M_2024_4326.geojson')

In [None]:
# Keep only the two regions of interest and the columns used downstream,
# then renumber the rows from zero.
nuts = (
    nuts[nuts['NUTS_NAME'].isin(['Vilniaus apskritis', 'Põhja-Eesti'])]
    [['geometry', 'CNTR_CODE', 'NUTS_ID', 'NUTS_NAME']]
    .reset_index(drop=True)
)

In [None]:
# Footprint area in square metres for one row per (geometry, year) pair.
df = shelter_data.drop_duplicates(subset=['geometry', 'year'])
# Measure areas in EPSG:3035 (ETRS89 / LAEA Europe), an equal-area projection.
# Web Mercator (EPSG:3857) is not area-preserving: it inflates areas more the
# further north the geometry lies, so it would both overstate every footprint
# and distort Lithuanian and Estonian areas by different factors.
df_area = df[['geometry']].to_crs(epsg=3035)
df_area['area_m2'] = df_area['geometry'].area
# Return the helper frame to the CRS of the source data.
df_area = df_area.to_crs(df.crs)
# Index-aligned assignment: rows dropped as duplicates above receive NaN.
shelter_data['area_m2'] = df_area['area_m2']

In [None]:
# Attach region attributes to every shelter row whose geometry lies within one of
# the selected NUTS polygons (left join: rows outside them keep missing region
# columns), then keep the first row per distinct geometry.
# NOTE(review): duplicates are dropped on 'geometry' alone here, whereas the
# earlier cells use ['geometry', 'year'] - confirm that collapsing the same
# geometry across years is intended before the per-year grouping below.
df = (
    gpd.sjoin(shelter_data, nuts, how='left', predicate='within')
    .drop_duplicates(subset=['geometry'])
)

In [None]:
# Calibration constants (presumably for Vilnius, going by the "vln" names - TODO
# confirm source): share of the population officially covered by shelters and
# the official 2024 population.
officially_protected_pop_vln_pct = 0.2
official_vln_pop_2024 = 851_346

# Shelter area per protected person: total area of known 2024 shelters in rows
# whose region join gave country code 'LT', divided by the protected head count.
area_per_person = (
    df.loc[
        (df['is_shelter'] == 1) & (df['CNTR_CODE'] == 'LT') & (df['year'] == 2024),
        'area_m2',
    ].sum()
    / (officially_protected_pop_vln_pct * official_vln_pop_2024)
)

In [None]:
# Per region and year: total area and number of rows that are flagged as
# candidates but are not known shelters. Passing lists to agg yields two-level
# column labels such as ('area_m2', 'sum').
df_new_shelters = (
    df[(df['is_shelter'] != 1) & (df['could_be_shelter'] == 1)]
    .groupby(['CNTR_CODE', 'NUTS_NAME', 'NUTS_ID', 'year'])
    .agg({'area_m2': ['sum'], 'could_be_shelter': ['sum']})
)

In [None]:
# Per region and year: total area and number of known shelters, with the same
# two-level column layout as the candidate aggregates.
df_existing_shelters = (
    df[df['is_shelter'] == 1]
    .groupby(['CNTR_CODE', 'NUTS_NAME', 'NUTS_ID', 'year'])
    .agg({'area_m2': ['sum'], 'is_shelter': ['sum']})
)

In [None]:
# Index-aligned left join on the shared (CNTR_CODE, NUTS_NAME, NUTS_ID, year) group
# index. The right frame's clashing 'area_m2' column is read back in the next cell
# as 'area_m2_existing'.
# NOTE(review): with how='left', groups that only have existing shelters are dropped
# and groups without existing shelters get NaN - confirm this is acceptable.
df_total = df_new_shelters.join(df_existing_shelters, how='left', rsuffix='_existing')

In [None]:
# Columns are still two-level here (name, 'sum'), so each arithmetic result is a
# one-column frame; the trailing ['sum'] picks that column out as a Series.
# Combined area of suggested plus existing shelters per region and year.
df_total['total_area'] = (df_total['area_m2'] + df_total['area_m2_existing'])['sum']
# Combined count of suggested plus existing shelters per region and year.
df_total['total_count'] = (df_total['could_be_shelter'] + df_total['is_shelter'])['sum']
# Convert areas to head counts using the calibrated area needed per person.
df_total['pop_protected'] = (df_total['area_m2_existing'] / area_per_person)
df_total['pop_could_protect'] = (df_total['total_area'] / area_per_person)
# Flatten the column labels by dropping the second ('sum') level.
df_total = df_total.droplevel(1, axis=1)
# Population table; the next cell reads its 'region_id' and 'total_pop' columns.
df_pop = pd.read_csv('../data/nuts3_pop.csv')

In [None]:
# Bring the group keys back as columns and attach the population table by region id.
df_total = df_total.reset_index().merge(
    df_pop, left_on='NUTS_ID', right_on='region_id', how='left'
)
# Protected head counts as a fraction of each region's total population.
df_total['pop_pct_protected'] = df_total['pop_protected'].div(df_total['total_pop'])
df_total['pop_pct_could_protect'] = df_total['pop_could_protect'].div(df_total['total_pop'])

In [None]:
# Build one JSON entry per region, each holding year-ordered, parallel lists.
region_data = {}

for region_id, group in df_total.groupby('NUTS_ID'):
    group = group.sort_values('year')
    years = group['year'].tolist()

    # Name and country are taken from the first (earliest-year) row of the region.
    entry = {'name': group['NUTS_NAME'].iloc[0]}
    entry['country'] = group['country'].iloc[0]
    entry['official_shelters'] = group['is_shelter'].tolist()
    entry['suggested_shelters'] = group['could_be_shelter'].tolist()
    entry['population_protected'] = group['pop_pct_protected'].tolist()
    entry['years'] = years
    region_data[region_id] = entry

with open('../data/results.json', 'w') as json_file:
    json.dump(region_data, json_file)