In [None]:
%%capture
# Install the third-party packages used below; %%capture hides pip's output.
# radiant-mlhub provides the dataset download API and pandas-path the
# path-style accessor imported in the next cell.
!pip install radiant-mlhub
!pip install gdown
!pip install pandas-path

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import re
from glob import glob
import tarfile                                                  # for extracting tar.gz file
from pathlib import Path
import os
from radiant_mlhub import Dataset, Collection, client           # api for dataset download from radiant mlhub
# Path style access for pandas
from pandas_path import path
from PIL import Image as pil_image                              # for viewing image using pil

In [None]:
# Radiant MLHub API key.
# Credentials must never be committed in a notebook: take the key from the
# MLHUB_API_KEY environment variable and prompt for it only when it is missing.
# NOTE(review): the key previously hardcoded in this cell should be treated as
# leaked and revoked.
if not os.environ.get('MLHUB_API_KEY'):
    from getpass import getpass
    os.environ['MLHUB_API_KEY'] = getpass('Radiant MLHub API key: ')

dataset = Dataset.fetch('nasa_tropical_storm_competition')

# download directory is data_custom
download_dir = Path('./data_custom').resolve()

# download the dataset archives, then extract each one next to the archives
archive_paths = dataset.download(output_dir=download_dir)
for archive_path in archive_paths:
    print(f'Extracting {archive_path}...')
    with tarfile.open(archive_path) as tfile:
        tfile.extractall(path=download_dir)
print('Done')

In [None]:
# Remove the .tar.gz archives from ./data_custom (shell command; the extracted
# folders in the same directory are not matched by the glob).
!rm ./data_custom/*.tar.gz

In [None]:
%%capture
# Fetch the competition CSVs (train/test features and labels) into the current
# working directory; %%capture hides wget's progress output.
!wget https://radiant-mlhub.s3-us-west-2.amazonaws.com/nasa-tropical-storm-challenge/training_set_features.csv
!wget https://radiant-mlhub.s3-us-west-2.amazonaws.com/nasa-tropical-storm-challenge/test_set_features.csv
!wget https://radiant-mlhub.s3-us-west-2.amazonaws.com/nasa-tropical-storm-challenge/training_set_labels.csv
!wget https://radiant-mlhub.s3-us-west-2.amazonaws.com/nasa-tropical-storm-challenge/test_set_labels.csv

In [None]:
# Root folder of the extracted image data; this is the same directory the
# download cell extracts into. Used as the prefix of every image file path.
DATA_PATH = r"./data_custom"

In [None]:
# train data and labels
train_metadata = pd.read_csv("./training_set_features.csv")
train_labels = pd.read_csv("./training_set_labels.csv")
# normalise column names: replace spaces with underscores and lower-case them
train_metadata.columns = train_metadata.columns.str.replace(' ','_').str.lower()
train_labels.columns = train_labels.columns.str.replace(' ','_').str.lower()

In [None]:
# Show the (rows, columns) shape and the first rows of the training features.
print(train_metadata.shape)
train_metadata.head()

In [None]:
# Preview the first rows of the training labels.
train_labels.head()

In [None]:
# Summary statistics of the numeric label columns.
train_labels.describe()

In [None]:
# Merge train_metadata with train_labels on the image_id field.
# merge() defaults to an inner join, so only image_ids present in both frames
# are kept.
full_metadata = train_metadata.merge(train_labels, on="image_id")

In [None]:
# Order rows by image_id and rebuild a 0..n-1 index; later cells rely on this
# row order (cumcount per storm) and on positional access such as [0].
full_metadata = full_metadata.sort_values(by=['image_id']).reset_index(drop=True)
full_metadata.head()

In [None]:
# Build the path of every training image:
#   <DATA_PATH>/nasa_tropical_storm_competition_train_source/
#       nasa_tropical_storm_competition_train_source_<image_id>/image.jpg
full_metadata["file_name"] = DATA_PATH + r"/nasa_tropical_storm_competition_train_source" + r"/nasa_tropical_storm_competition_train_source_" + full_metadata.image_id + "/image.jpg"
full_metadata.head()

In [None]:
# Sample image file path (first row of the sorted training metadata).
full_metadata["file_name"][0]

In [None]:
# Sanity check: the first five training images must open without error.
# Each one is converted to RGB and its size is printed.
examples = []
for row_idx in range(5):
    sample_path = full_metadata.iloc[row_idx]["file_name"]
    examples.append(pil_image.open(sample_path).convert("RGB"))

for image in examples:
    print(image.size)

In [None]:
# Add a temporary column for number of images per storm.
# The count is mapped onto the rows by storm_id instead of being merged in, so
# re-running this cell overwrites the column rather than producing duplicate
# images_per_storm_x / images_per_storm_y columns that break the next cell.
images_per_storm = full_metadata.groupby("storm_id").size().to_frame("images_per_storm")
full_metadata["images_per_storm"] = full_metadata["storm_id"].map(
    images_per_storm["images_per_storm"]
)

In [None]:
# Rows are sorted by image_id, so cumcount() gives each image's position within
# its storm. Dividing by the storm's image count yields a fraction in [0, 1).
# Images below 0.8 go to training; the final 20% of each storm is validation.
position_in_storm = full_metadata.groupby("storm_id").cumcount()
full_metadata["pct_of_storm"] = position_in_storm / full_metadata.images_per_storm

helper_columns = ["images_per_storm", "pct_of_storm"]
is_train_row = full_metadata.pct_of_storm < 0.8
is_val_row = full_metadata.pct_of_storm >= 0.8

train = full_metadata[is_train_row].drop(columns=helper_columns)
val = full_metadata[is_val_row].drop(columns=helper_columns)

In [None]:
# Confirm pct of images in the validation set is approximately 20%
# (the value displayed is a percentage, not a fraction).
len(val) / len(full_metadata) * 100

In [None]:
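# Optional: subsample 10% of the data to speed up experimentation.
# Disabled here -- the full training and validation sets are used.
# Uncomment the two lines below to enable the subsampling.

# train = train.sample(frac=0.1, replace=False, random_state=1)
# val = val.sample(frac=0.1, replace=False, random_state=1)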

In [None]:
# Report the (rows, columns) shape of the two splits.
print(f"Train set shape:{train.shape} \nvalidation set shape: {val.shape}")

In [None]:
# Split each frame into features (everything except wind_speed) and the
# wind_speed target used for regression.
y_train = train["wind_speed"]
x_train = train.drop(columns="wind_speed")

y_val = val["wind_speed"]
x_val = val.drop(columns="wind_speed")

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms

from torch import nn

import warnings

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision.models as models
from tqdm import tqdm
from pytorch_lightning.callbacks import EarlyStopping


warnings.filterwarnings("ignore")

In [None]:
class DatasetWIND(Dataset):
    """Dataset of storm images described by a metadata frame.

    Every item is a dictionary with the image id and the transformed image
    tensor; a "label" entry is added when labels were supplied.

    Args:
        x_train: frame with one row per image; the "file_name" and "image_id"
            columns are read.
        y_train: optional labels aligned positionally with ``x_train``. Leave
            as ``None`` for inference.
    """

    def __init__(self, x_train, y_train=None):
        self.data = x_train
        self.label = y_train
        # Centre-crop to 128x128, convert to a tensor, then normalise with the
        # mean/std the torchvision pretrained models expect:
        # https://pytorch.org/docs/stable/torchvision/models.html
        crop = transforms.CenterCrop(128)
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize(
            mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
        )
        self.transform = transforms.Compose([crop, to_tensor, normalize])

    def __len__(self):
        """Number of images, i.e. rows of the metadata frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Load, transform and return the sample at position ``index``."""
        row = self.data.iloc[index]
        image = self.transform(pil_image.open(row["file_name"]).convert("RGB"))
        sample = {"image_id": row["image_id"], "image": image}
        if self.label is not None:
            sample["label"] = self.label.iloc[index]
        return sample

In [None]:
class RMSELoss(nn.Module):
    """Root mean square error: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, true):
        """Return sqrt(MSE(pred, true)) as a scalar tensor."""
        mean_squared_error = self.mse(pred, true)
        return mean_squared_error.sqrt()

In [None]:
class PretrainedWindModel(pl.LightningModule):
    """Pretrained ResNet-152 with a small regression head for wind speed.

    The constructor takes no arguments so that ``load_from_checkpoint`` can
    rebuild the model. Hyperparameters and data are read from notebook-level
    globals, which must exist before the model is instantiated or trained:
    ``hidden_size``, ``dropout``, ``lr``, ``batch_size``, ``num_workers``,
    ``x_train``, ``y_train``, ``x_val`` and ``y_val``.

    Metrics are reported with ``self.log`` under these names:
    ``train_loss``, ``avg_epoch_train_loss``, ``val_loss``, ``avg_val_loss``
    and ``avg_epoch_val_loss``. ``val_loss`` and ``avg_epoch_val_loss`` are the
    names monitored by the EarlyStopping and ModelCheckpoint callbacks.
    """

    def __init__(self):
        super().__init__()

        self.num_outputs = 1  # One prediction for regression
        # RMSELoss has no parameters or buffers, so keeping it as an attribute
        # does not change the checkpoint's state_dict.
        self.criterion = RMSELoss()
        self.model = self.prepare_model()

    def prepare_model(self):
        """Return a pretrained ResNet-152 whose classifier is replaced by a
        Linear -> ReLU -> Dropout -> Linear regression head."""
        res_model = models.resnet152(pretrained=True)
        # Read the backbone's feature width from the layer being replaced
        # (2048 for resnet152) instead of hard-coding it.
        # https://pytorch.org/hub/pytorch_vision_resnet/
        in_features = res_model.fc.in_features
        res_model.fc = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, self.num_outputs),
        )
        return res_model

    def forward(self, image):
        """Predict wind speed for a batch of image tensors; shape (batch, 1)."""
        return self.model(image)

    def _compute_loss(self, batch):
        """RMSE between the predictions and the labels of one batch."""
        # squeeze(-1) drops only the output dimension, so a batch holding a
        # single sample keeps its batch axis.
        outputs = self(batch["image"]).squeeze(-1)
        # Match the prediction's dtype and device rather than forcing
        # .cuda(), so the same code also runs on CPU.
        targets = batch["label"].type_as(outputs)
        return self.criterion(outputs, targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        # Lightning puts the module in train mode for this hook.
        loss = self._compute_loss(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        # Lightning puts the module in eval mode and disables gradients here.
        loss = self._compute_loss(batch)
        # Logged in a validation hook, this is aggregated over the epoch and
        # becomes the "val_loss" metric that EarlyStopping monitors.
        self.log("val_loss", loss)
        return {"val_loss": loss}

    def train_dataloader(self):
        train_dataset = DatasetWIND(x_train, y_train)
        # The metadata is sorted by image_id (storm after storm); shuffle so
        # that each batch mixes storms instead of following that order.
        return DataLoader(
            train_dataset,
            num_workers=num_workers,
            batch_size=batch_size,
            shuffle=True,
        )

    def val_dataloader(self):
        val_dataset = DatasetWIND(x_val, y_val)
        return DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=lr)

    def training_epoch_end(self, outputs):
        """Log the mean of the per-batch training losses for the epoch."""
        avg_train_loss = torch.stack(tuple(output["loss"] for output in outputs)).mean()
        # This hook must not return a value; report through self.log.
        self.log("avg_epoch_train_loss", avg_train_loss, prog_bar=True)

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-batch validation losses for the epoch.

        Note this is the mean of per-batch RMSE values, not the RMSE over the
        whole validation set.
        """
        avg_val_loss = torch.stack(tuple(output["val_loss"] for output in outputs)).mean()
        # self.log replaces the old "log"/"progress_bar" return dictionaries.
        # The metric names are unchanged: "avg_val_loss" for TensorBoard and
        # "avg_epoch_val_loss" for the progress bar and ModelCheckpoint.
        self.log("avg_val_loss", avg_val_loss)
        self.log("avg_epoch_val_loss", avg_val_loss, prog_bar=True)


In [None]:
# Required hparams
# These self-assignments are no-ops: they only record that the model reads
# x_train / y_train / x_val / y_val from the notebook namespace, and they raise
# NameError if the split cells have not been run.
x_train = x_train
y_train = y_train
x_val = x_val
y_val = y_val
# Optional hparams (read as globals by PretrainedWindModel, test() and the
# Trainer cell)
lr = 2e-4
hidden_size = 100
dropout = 0.1
max_epochs = 30
batch_size = 32
num_workers = 4
gradient_clip_val = 1
val_sanity_checks = 0
output_path = r"./plmodel/model-outputs/"
log_path = r"./plmodel/logs"

In [None]:
# Where final model will be saved
# NOTE: output_path and log_path are rebound from str to Path here.
output_path = Path(output_path)
output_path.mkdir(parents=True,exist_ok=True)

#  Where TensorBoard logs will be saved
log_path = Path(log_path)
log_path.mkdir(parents=True, exist_ok=True)
logger = pl.loggers.TensorBoardLogger(
    log_path, name="benchmark_model"
)

# early stopping
# Monitors 'val_loss' with a patience of 3. strict=False means a missing
# 'val_loss' metric does not raise.
# NOTE(review): the checkpoint below monitors "avg_epoch_val_loss"; confirm the
# model actually exposes both metric names.
early_stop = EarlyStopping('val_loss', patience=3, mode='min', strict = False, verbose = False)

# fast_dev_run = True

# NOTE(review): `gpus=` and passing a ModelCheckpoint through
# `checkpoint_callback=` are tied to older pytorch-lightning releases -- confirm
# against the installed version before upgrading.
trainer = pl.Trainer(gpus = 1,
            max_epochs= max_epochs,
            default_root_dir= output_path,
            logger= logger,
            callbacks=[early_stop],
            # keep the checkpoint with the lowest "avg_epoch_val_loss"
            checkpoint_callback=pl.callbacks.ModelCheckpoint(
                dirpath= output_path,
                monitor="avg_epoch_val_loss",
                mode="min",
                verbose=True,
            ),
            gradient_clip_val=gradient_clip_val,
            # 0 disables the validation sanity check before training
            num_sanity_val_steps=val_sanity_checks,
        )

# The model pulls its data loaders and hyperparameters from notebook globals,
# so fit() needs no further arguments.
model = PretrainedWindModel()
trainer.fit(model)

#### Saving the model using two methods
- PyTorch Lightning checkpoint (`.ckpt`), reloaded in Python with `load_from_checkpoint`
- TorchScript (can be loaded in non-Python environments)

In [None]:
# ------------------------------------ Method 1: PyTorch Lightning checkpoint --------------------------------
# weights_only=True is passed so the file is a weights-only checkpoint.
# The path is the same one the later load_from_checkpoint cells read.
trainer.save_checkpoint("./plmodel/model-outputs/final_model.ckpt", weights_only=True)

# ------------------------------------ Method 2: TorchScript -------------------------------------------------
script = model.to_torchscript()
# save for use in production environment (written to the working directory)
torch.jit.save(script, "model.pt")

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /plmodel/logs/

In [None]:
test_metadata = pd.read_csv("./test_set_features.csv")                            # test features
# normalise column names: replace spaces with underscores and lower-case them
test_metadata.columns = test_metadata.columns.str.replace(' ','_').str.lower()
# sort by image_id and rebuild a 0..n-1 index so [0] is the first image
test_metadata = test_metadata.sort_values(by=['image_id']).reset_index(drop=True)
# path of every test image, built the same way as for the training images
test_metadata["file_name"] = DATA_PATH + r"/nasa_tropical_storm_competition_test_source" + r"/nasa_tropical_storm_competition_test_source_" + test_metadata.image_id + "/image.jpg"

test_labels = pd.read_csv("./test_set_labels.csv")                                # test labels
# normalise column names: replace spaces with underscores and lower-case them
test_labels.columns = test_labels.columns.str.replace(' ','_').str.lower()

In [None]:
# Preview the test metadata, including the generated file_name column.
test_metadata.head()

In [None]:
def test(model, x_test):
    """Predict wind speed for every image listed in ``x_test``.

    Args:
        model: trained model; called on batches of image tensors.
        x_test: metadata frame with "file_name" and "image_id" columns, as
            expected by ``DatasetWIND``.

    Returns:
        DataFrame with one row per image and the columns ``image_id`` and
        ``pred_wind_speed``.

    Side effects:
        Moves ``model`` to the GPU when one is available (CPU otherwise) and
        puts it in eval mode. Batch size and worker count come from the
        notebook-level ``batch_size`` and ``num_workers``.
    """
    test_dataset = DatasetWIND(x_test)
    test_dataloader = DataLoader(test_dataset, num_workers=num_workers, batch_size=batch_size)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    model.eval()

    # Accumulate across ALL batches; building the frame from the loop variables
    # after the loop would keep only the final batch.
    image_ids = []
    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            images = batch["image"].to(device)
            preds = model(images)
            image_ids.extend(batch["image_id"])
            # reshape(-1) always yields a list, even for a final batch of one
            # image (squeeze() would turn it into a bare float).
            predictions.extend(preds.reshape(-1).tolist())

    results_df = pd.DataFrame({'image_id': image_ids, 'pred_wind_speed': predictions})
    return results_df

In [None]:
# Rebuild the model from the checkpoint written by the saving cell.
final_model = PretrainedWindModel.load_from_checkpoint("./plmodel/model-outputs/final_model.ckpt")

In [None]:
# Predict wind speed for the test images (image_id, pred_wind_speed).
test_pred = test(final_model, test_metadata)

In [None]:
# Attach the true labels to the predictions by image_id (inner join).
test_actual_pred = test_pred.merge(test_labels, on="image_id")
test_actual_pred.head()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# RMSE between true and predicted wind speed over the rows of test_actual_pred.
rmse = sqrt(mean_squared_error(test_actual_pred["wind_speed"], test_actual_pred["pred_wind_speed"]))
print(f"RMSE: {rmse:.4f}")

In [None]:
def predict_image(image, model):
    """Predict the wind speed for a single image file.

    Args:
        image: path (or file object) of the image to open with PIL.
        model: trained model; it is switched to eval mode by this call.

    Returns:
        0-dimensional NumPy array holding the predicted wind speed.
    """
    image = pil_image.open(image).convert("RGB")
    # Same preprocessing as DatasetWIND: centre crop, tensor conversion and
    # normalisation with the pretrained-model mean/std.
    test_transforms = transforms.Compose(
            [
                transforms.CenterCrop(128),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
                ),
            ]
        )
    # Inference must run in eval mode: otherwise dropout stays active and batch
    # norm uses the statistics of this single image.
    model.eval()
    # Put the input on whichever device the model lives on (it may have been
    # moved to the GPU earlier in the notebook).
    device = next(model.parameters()).device
    image = test_transforms(image).unsqueeze(0).to(device)
    with torch.no_grad():
        preds = model(image)
        # .cpu() so that .numpy() also works for a CUDA tensor
        preds_val = preds.squeeze().cpu().numpy()
    return preds_val

In [None]:
# Path of the first test image (used for the single-image prediction below).
test_metadata["file_name"][0]

In [None]:
# Predict the wind speed of one test image and compare it with its label.
model = PretrainedWindModel.load_from_checkpoint("./plmodel/model-outputs/final_model.ckpt")
sample_image = test_metadata["file_name"][0]
image_id = test_metadata["image_id"][0]
print(image_id)
# Look the true label up by image_id. Indexing the merged frame with [0] asks
# for a column named 0, which does not exist and raises KeyError.
sample_img_label = test_labels.loc[test_labels["image_id"] == image_id, "wind_speed"].iloc[0]
print(sample_img_label)
pred = predict_image(sample_image, model)
print(f"Predicted windspeed: {pred} | Actual Windspeed: {sample_img_label}")

In [None]:
# Run the exported TorchScript module on one training image to check that the
# saved "model.pt" loads and produces a prediction.
inp = './data_custom/nasa_tropical_storm_competition_train_source/nasa_tropical_storm_competition_train_source_abs_000/image.jpg'
image = pil_image.open(inp).convert("RGB")
test_transforms = transforms.Compose(
        [
            transforms.CenterCrop(128),
            transforms.ToTensor(),
            # All models expect the same normalization mean & std
            # https://pytorch.org/docs/stable/torchvision/models.html
            transforms.Normalize(
                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
            ),
        ]
    )
image = test_transforms(image)
image = image.unsqueeze(0)
# Load onto the CPU explicitly: the input tensor is on the CPU, while the
# module may have been saved from another device.
scripted_module = torch.jit.load("model.pt", map_location="cpu")
# Inference only, so skip gradient tracking.
with torch.no_grad():
    output = scripted_module(image)
output

In [None]:
from IPython.display import FileLink
from IPython.display import FileLinks

In [None]:
# Render download links for everything under ./plmodel (checkpoints and logs).
FileLinks(r'./plmodel')

In [None]:
%%capture
!zip -r {model_name+data_aug}.zip model_checkpoints Save_model