In [16]:
import pandas as pd
import numpy as np
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

def seed_everything(seed: int = 42):

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Usage:
seed_everything(42)

In [17]:
uhi = pd.read_csv('uhi_imputed.csv')

In [18]:
X = uhi.drop(columns=['Latitude', 'Longitude', 'UHI Index', 'datetime'], axis=1)
y = uhi['UHI Index']

In [19]:
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

X_tensor = torch.tensor(X_scaled
, dtype=torch.float32)
y_tensor = torch.tensor(y_scaled, dtype=torch.float32).view(-1, 1)

class UHIDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
# train = UHIDataset(X_tensor, y_tensor)
# train_loader = DataLoader(train, batch_size=32, shuffle=True)

In [20]:
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.LeakyReLU(),
            nn.Dropout(0.1),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim)
        )
    def forward(self, x):
        return x + self.block(x)

class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(negative_slope=0.01),
            ResidualBlock(512),
            nn.Linear(512, 256),
            nn.LeakyReLU(negative_slope=0.01),
            ResidualBlock(256),
            nn.Linear(256, 128),
            nn.LeakyReLU(negative_slope=0.01),
            ResidualBlock(128),
            nn.Linear(128, 64),
            nn.LeakyReLU(negative_slope=0.01),
            ResidualBlock(64),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)


In [21]:
input_dim = X.shape[1]
model = Model(input_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

fold_r2_scores = []

dataset = UHIDataset(X_tensor, y_tensor)

for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(len(dataset)))):
    print(f"\nFold {fold+1}/{k}")
    
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    
    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

    num_epochs = 300
    
    model = Model(input_dim=X_tensor.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
    criterion = nn.MSELoss()
    

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

        scheduler.step()
        
        avg_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}, Learning Rate: {scheduler.get_last_lr()[0]}")
        
    model.eval()
    val_preds_list = []
    val_targets_list = []
    with torch.no_grad():
        for inputs, targets in val_loader:
           
            outputs = model(inputs)
          
            val_preds_list.append(outputs.numpy())
            val_targets_list.append(targets.numpy())
    
    val_preds = np.concatenate(val_preds_list, axis=0)
    val_targets = np.concatenate(val_targets_list, axis=0)
    
    val_preds_orig = scaler_y.inverse_transform(val_preds)
    val_targets_orig = scaler_y.inverse_transform(val_targets)
    
    fold_r2 = r2_score(val_targets_orig, val_preds_orig)
    print(f"Fold {fold+1} R2: {fold_r2}")
    fold_r2_scores.append(fold_r2)

avg_r2 = np.mean(fold_r2_scores)
print(f"\nAverage R2 across {k} folds: {avg_r2}")

# num_epochs = 200

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0
    
#     for inputs, targets in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#     avg_loss = running_loss / len(train_loader)
#     print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}')

# model.eval()
# with torch.no_grad():
#     preds_scaled = model(X_tensor)  
#     preds_orig = scaler_y.inverse_transform(preds_scaled.numpy())
#     y_orig = y.values.reshape(-1, 1)
#     loss = criterion(preds_scaled, y_tensor)
#     r2 = r2_score(y_orig, preds_orig)
#     print(f'Loss: {loss.item()}, R2: {r2}')


Fold 1/10
Epoch 1/300, Loss: 0.8999317273308959, Learning Rate: 0.0009999726120980734
Epoch 2/300, Loss: 0.8417412118066715, Learning Rate: 0.0009998904513956854
Epoch 3/300, Loss: 0.8219349520115913, Learning Rate: 0.0009997535269026829
Epoch 4/300, Loss: 0.7988628767713716, Learning Rate: 0.0009995618536343797
Epoch 5/300, Loss: 0.7852990400942066, Learning Rate: 0.0009993154526099096
Epoch 6/300, Loss: 0.7811678399013567, Learning Rate: 0.0009990143508499217
Epoch 7/300, Loss: 0.7670513580117044, Learning Rate: 0.0009986585813736167
Epoch 8/300, Loss: 0.753342525113987, Learning Rate: 0.0009982481831951274
Epoch 9/300, Loss: 0.7478642810749102, Learning Rate: 0.0009977832013192385
Epoch 10/300, Loss: 0.7414290806915187, Learning Rate: 0.0009972636867364526
Epoch 11/300, Loss: 0.7331109537353998, Learning Rate: 0.0009966896964173982
Epoch 12/300, Loss: 0.7319290358808976, Learning Rate: 0.000996061293306582
Epoch 13/300, Loss: 0.7214471116850648, Learning Rate: 0.0009953785463154864

In [23]:
feature_columns = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 
                   'B07', 'B08', 'B8A', 'B11', 'B12', 'LST', 'is_building', 
                   'Air Temp at Surface [degC]', 'Relative Humidity [percent]', 
                   'Avg Wind Speed [m/s]', 'Wind Direction [degrees]', 'Solar Flux [W/m^2]']

submission = pd.read_csv('validation_imputed.csv')

submission_prepared = submission[feature_columns].copy()

submission_prepared_scaled = scaler_X.transform(submission_prepared)

submission_tensor = torch.tensor(submission_prepared_scaled, dtype=torch.float32)

model.eval()

with torch.no_grad():
    final_predictions_tensor = model(submission_tensor)
    final_predictions_np = scaler_y.inverse_transform(final_predictions_tensor.numpy())

final_prediction_series = pd.Series(final_predictions_np.flatten())

In [24]:
sub = pd.read_csv('Submission_template.csv')

submission_df = pd.DataFrame({'Longitude':sub['Longitude'].values, 'Latitude':sub['Latitude'].values, 'UHI Index':final_prediction_series.values})

submission_df.to_csv('submission.csv', index=False)