In [80]:
from google.colab import drive
import pandas as pd
import random
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
def setup_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    #torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True


setup_seed(42)

In [82]:
features_filepath = '/content/drive/MyDrive/LakeRegression/340_Veri_toplam_temiz.xlsx'
feats_df = pd.read_excel(features_filepath)

In [83]:
feats_df.head()

Unnamed: 0,Date,Station,CDOM (RFU),CDOM (µg/L),Klorofil-a (µg/L),Tuzluluk (‰),TDS (mg/l),Secchi Disk (m),Çözünmüş Oksijen (mg/l),Elektriksel İletkenlik (µs/cm),...,Sıcaklık (ÇO) (°C),Sıcaklık (Eİ) (°C),Toplam Azot (µg/l),Toplam Fosfor (mg/L) PO4-P,Toplam Fosfor (µg/L),Sıcaklık ort,Kuzey,Dogu,X,Y
0,2017-04-27,1,8936.96,79.45,86.14,0.7,707,0.33,10.84,1456,...,15.6,14.9,6000,0.075,75,15.25,41561,36078,235,537
1,2017-04-27,2,9912.94,87.98,61.24,0.73,684,0.38,10.63,1404,...,16.7,14.7,4900,0.088,88,15.7,41571,36083,280,427
2,2017-04-27,3,7921.96,67.57,48.4,0.75,703,0.38,10.32,1377,...,16.3,15.9,7600,0.087,87,16.1,41579,36088,325,340
3,2017-04-27,4,9071.61,79.86,39.7,0.8,811,0.32,10.54,1569,...,16.3,15.5,4760,0.064,64,15.9,41586,36090,345,263
4,2017-04-27,5,8817.51,77.11,72.52,0.83,836,0.3,10.59,1634,...,16.0,14.6,7100,0.07,70,15.3,41595,36096,398,165


In [84]:
#selected_features = ['Klorofil-a (µg/L)', '']
selected_features = list(feats_df.columns)[4:6]
base_cols = ['Date', 'Station', 'X', 'Y']
feats_df = feats_df[['Date', 'Station', 'X', 'Y'] + selected_features]
feats_df.head()

Unnamed: 0,Date,Station,X,Y,Klorofil-a (µg/L),Tuzluluk (‰)
0,2017-04-27,1,235,537,86.14,0.7
1,2017-04-27,2,280,427,61.24,0.73
2,2017-04-27,3,325,340,48.4,0.75
3,2017-04-27,4,345,263,39.7,0.8
4,2017-04-27,5,398,165,72.52,0.83


In [85]:
feats_df[selected_features].shape

(340, 2)

In [86]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
feats_df[selected_features] = scaler.fit_transform(feats_df[selected_features].values.reshape(-1, len(selected_features)))
feats_df.head()

Unnamed: 0,Date,Station,X,Y,Klorofil-a (µg/L),Tuzluluk (‰)
0,2017-04-27,1,235,537,2.643996,-2.536425
1,2017-04-27,2,280,427,1.361115,-2.301421
2,2017-04-27,3,325,340,0.699582,-2.144752
3,2017-04-27,4,345,263,0.251347,-1.75308
4,2017-04-27,5,398,165,1.942276,-1.518076


In [87]:
import os
import h5py
import numpy as np
from tqdm import tqdm

np.random.seed(0)

unique_dates = feats_df['Date'].unique()

X_train = []
y_train = []
X_val = []
y_val = []
X_test = []
y_test = []
patch_size = 9
for i in tqdm(range(1, 35)):
    if i != 22 and i != 23:
        folder_path = f'/content/drive/MyDrive/LakeRegression/data/{i}'
        for j in range(10):
            file_path = f'{folder_path}/station_{j}_{patch_size}.h5'
            if os.path.isfile(file_path):
                with h5py.File(file_path, 'r') as f:
                    a_group_key = list(f.keys())[0]
                    image = np.array(f[a_group_key])

                    unique_date = unique_dates[i - 1]
                    unique_station = j + 1
                    label = feats_df.loc[(feats_df['Date'] == unique_date) & (feats_df['Station'] == unique_station)][selected_features].values[0]

                    date = pd.to_datetime(unique_date)
                    if date < pd.to_datetime('2019-03-15'):
                        X_train.append(image)
                        y_train.append(label)
                    elif date >= pd.to_datetime('2019-03-15') and date < pd.to_datetime('2019-05-01'):
                        X_val.append(image)
                        y_val.append(label)
                    else:
                        X_test.append(image)
                        y_test.append(label)

print(f'Train % {len(X_train) / (len(X_train) + len(X_val) + len(X_test))} | Val % {len(X_val) / (len(X_train) + len(X_val) + len(X_test))} | Test % {len(X_test) / (len(X_train) + len(X_val) + len(X_test))}')

X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)

print(f'Train shape: {X_train.shape} | {y_train.shape} | Val shape: {X_val.shape} | {y_val.shape} | Test shape: {X_test.shape} | {y_test.shape}')

100%|██████████| 34/34 [00:01<00:00, 29.39it/s]

Train % 0.8125 | Val % 0.09375 | Test % 0.09375
Train shape: (260, 12, 9, 9) | (260, 2) | Val shape: (30, 12, 9, 9) | (30, 2) | Test shape: (30, 12, 9, 9) | (30, 2)





In [88]:
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from torchvision import transforms

torch.manual_seed(0)

class LakeDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        self.transform = transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        image = self.X[idx]
        label = self.y[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

means = []
stds = []
for i in range(12):
    means.append(np.mean(X_train[:, i, :, :]))
    stds.append(np.std(X_train[:, i, :, :]))

transform = transforms.Compose([
    transforms.Lambda(lambda x: torch.from_numpy(x).float()),
    transforms.Normalize(means, stds)
])

train_dataset = LakeDataset(X_train, y_train, transform=transform)
val_dataset = LakeDataset(X_val, y_val, transform=transform)
test_dataset = LakeDataset(X_test, y_test, transform=transform)

# use sequential sampler to preserve the date order
train_loader = DataLoader(train_dataset, batch_size=32, sampler=SequentialSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=32, sampler=SequentialSampler(val_dataset))
test_loader = DataLoader(test_dataset, batch_size=32, sampler=SequentialSampler(test_dataset))

print(f'Train loader: {len(train_loader)} | Val loader: {len(val_loader)} | Test loader: {len(test_loader)}')

Train loader: 9 | Val loader: 1 | Test loader: 1


In [114]:
i = 0

for x, y in train_loader:
    if i>0:
        break

    print(y.unsqueeze(1)[0])
    i+=1



tensor([[ 2.6440, -2.5364]])


In [90]:
import torch
from torch import nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=12, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(20736, 128)


        self.out1 = nn.Linear(128, 1)
        self.out2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        out1 = self.out1(x)
        out2 = self.out2(x)
        return torch.cat((out1,out2), 1)


if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [115]:
from torch import optim
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def train(model, train_loader, val_loader, criterion, optimizer, epochs, device):
    print(f'Using device: {device}')
    for epoch in range(epochs):
        train_loss = 0.0
        train_predictions = []
        train_actuals = []
        model.train()
        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            y_hat = model(X)
            #loss = 0
            #i = 0
            loss = criterion(y_hat, y)




            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_predictions.extend(y_hat.cpu().detach().numpy())
            train_actuals.extend(y.cpu().numpy())

        train_loss /= len(train_loader)

        train_r2 = r2_score(train_actuals, train_predictions)
        train_mse = mean_squared_error(train_actuals, train_predictions)
        train_rmse = mean_squared_error(train_actuals, train_predictions, squared=False)

        val_loss = 0.0
        val_predictions = []
        val_actuals = []
        model.eval()
        with torch.no_grad():
            for X, y in val_loader:
                X = X.to(device)
                y = y.to(device)
                y_hat = model(X)
                loss = criterion(y_hat, y)
                #i = 0
                #for o in y_hat:
                #    loss += criterion(o, y.unsqueeze(1)[i][0])
                #    i+= 1



                val_loss += loss.item()
                val_predictions.extend(y_hat.cpu().detach().numpy())
                val_actuals.extend(y.cpu().numpy())
        val_loss /= len(val_loader)

        val_r2 = r2_score(val_actuals, val_predictions)
        val_mse = mean_squared_error(val_actuals, val_predictions)
        val_rmse = mean_squared_error(val_actuals, val_predictions, squared=False)

        print(f'Epoch {epoch + 1}/{epochs} | Train loss: {train_loss:.4f} R2: {train_r2:.4f} MSE: {train_mse:.4f} RMSE: {train_rmse:.4f} | Val loss: {val_loss:.4f} R2: {val_r2:.4f} MSE: {val_mse:.4f} RMSE: {val_rmse:.4f}')
    return model

model = CNN().to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

epochs = 50
model = train(model, train_loader, val_loader, criterion, optimizer, epochs, device)

Using device: cuda
Epoch 1/50 | Train loss: 1.1303 R2: -0.0613 MSE: 1.1287 RMSE: 1.0620 | Val loss: 0.9189 R2: -4.5035 MSE: 0.9189 RMSE: 0.9125
Epoch 2/50 | Train loss: 1.0395 R2: 0.0210 MSE: 1.0416 RMSE: 1.0201 | Val loss: 0.8552 R2: -4.0511 MSE: 0.8552 RMSE: 0.8739
Epoch 3/50 | Train loss: 1.0155 R2: 0.0399 MSE: 1.0207 RMSE: 1.0099 | Val loss: 0.8276 R2: -3.8340 MSE: 0.8276 RMSE: 0.8542
Epoch 4/50 | Train loss: 0.9786 R2: 0.0695 MSE: 0.9880 RMSE: 0.9938 | Val loss: 0.7900 R2: -3.5313 MSE: 0.7900 RMSE: 0.8251
Epoch 5/50 | Train loss: 0.9288 R2: 0.1085 MSE: 0.9452 RMSE: 0.9722 | Val loss: 0.7325 R2: -3.0989 MSE: 0.7325 RMSE: 0.7802
Epoch 6/50 | Train loss: 0.8658 R2: 0.1575 MSE: 0.8923 RMSE: 0.9446 | Val loss: 0.6468 R2: -2.5318 MSE: 0.6468 RMSE: 0.7173
Epoch 7/50 | Train loss: 0.8045 R2: 0.2047 MSE: 0.8419 RMSE: 0.9175 | Val loss: 0.5564 R2: -1.9939 MSE: 0.5564 RMSE: 0.6549
Epoch 8/50 | Train loss: 0.7539 R2: 0.2447 MSE: 0.8000 RMSE: 0.8944 | Val loss: 0.4769 R2: -1.5723 MSE: 0.4769 R

In [None]:
a = torch.Tensor([0,0,0])
b = torch.Tensor([2,2,0])
criterion(a,b)

In [129]:
def test(model, test_loader, device):
    test_predictions = []
    test_actuals = []
    model.eval()
    with torch.no_grad():
        for X, y in test_loader:
            X = X.to(device)
            y = y.to(device)
            y_hat = model(X)
            test_predictions.extend(y_hat.cpu().detach().numpy())
            test_actuals.extend(y.cpu().numpy())
    test_r2 = r2_score(test_actuals, test_predictions)
    test_mse = mean_squared_error(test_actuals, test_predictions)
    test_rmse = mean_squared_error(test_actuals, test_predictions, squared=False)
    print(f'Test R2: {test_r2:.4f} MSE: {test_mse:.4f} RMSE: {test_rmse:.4f}')

test(model, test_loader, device)

Test R2: -4.8369 MSE: 1.1608 RMSE: 1.0438
