# Checkpoint 2: Mandatory Check-In

- Research Questions (1.5%).
- Feature Engineering (2%).
- Modeling Approaches (3%).
- Preliminary Results (1%).


## Import Libraries

In [1]:
import cv2
import re
import io
import os
import pickle
import zipfile
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

COLAB = False

# mount Colab drive and set library path
if COLAB:
  import sys
  from google.colab import drive
  drive.mount('/content/drive')
  path = '/content/drive/My Drive/Colab Notebooks/grad_project'
  data_dir = f'{path}/Data'
  sys.path.insert(0, f'{path}/Source') # for colab to see local libraries
else:
  data_dir = f'../Data'


from data_utils import read_files_from_zip, data2pd, show_balance, load_model_data, save_model_data
from feature_utils import crop_and_fill
from eda_utils import show_image


from tensorflow.keras import backend
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, Flatten, Dropout, Rescaling
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from random import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical

2024-04-24 10:26:56.300069: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Import data

In [2]:
%%time

# Set data source: the project archive inside data_dir
data_file = f'{data_dir}/sp24_grad_project_data.zip'

# Load test images (archive members matching the pattern below)
test_data_images = read_files_from_zip(data_file, r'(test)_images_([\w-]*)\.npz')

# Load train images and their labels, one pair per dataset directory
train_data_images = read_files_from_zip(data_file, r'satellite-image-data/([\w-]*)/(train)_images\.npz')
train_data_labels = read_files_from_zip(data_file, r'satellite-image-data/([\w-]*)/(train)_labels\.npy')

# Convert to DataFrames (labels are only available for the training data)
train_df = data2pd(train_data_images, train_data_labels)
test_df = data2pd(test_data_images)

# Clean up: the raw containers are not needed once the DataFrames exist
del test_data_images, train_data_images, train_data_labels

Importing data from zip: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it]


Found the following datasets:  ['hurricane-matthew', 'flooding-fire']


Importing data from zip: 100%|██████████| 3/3 [00:14<00:00,  4.80s/it]


Found the following datasets:  ['socal-fire', 'midwest-flooding', 'hurricane-matthew']


Importing data from zip: 100%|██████████| 3/3 [00:00<00:00, 442.95it/s]

Found the following datasets:  ['socal-fire', 'midwest-flooding', 'hurricane-matthew']
CPU times: user 15.1 s, sys: 1.45 s, total: 16.5 s
Wall time: 17.6 s





In [3]:
# Label counts per disaster type for the training data.
print('Train label balance:')
show_balance(train_df)

Train label balance:


type       label
fire       0        7204
           3        1064
           1          69
           2          43
flood      0        6734
           1         114
           2          97
           3          59
hurricane  1        5236
           0        2631
           3        1740
           2        1544
Name: count, dtype: int64

## Data Cleansing Plan

There are many potential enhancements; this seems a reasonable starting point.

 - Scale images to 180x180 (note: ResNet50's own default input size is 224x224)
 - Normalize pixel values from 0-255 to floats in 0-1
 - For the Type Classifier, encode level+type as a float between -1 and 1
 - For the Level Classifier, one-hot encode the level into 4 categories (levels 0-3)
 - When undersampling, sample without replacement within the interquartile range of image size
 - When oversampling, include the entire set, then sample with replacement and apply augmentation



### Model Task B

Data split and augmentation plan (Disaster Level Classification)

We'll use a mix of oversampling and undersampling...
 - oversample the minority class using replacement
 - undersample by randomly deleting rows from the majority class to match

 - Validation:
    - Sample without replacement N samples of each disaster level
 - Training:
    - Select a per-class sample size that balances dropping rows (undersampling) against augmenting rows (oversampling)

In [4]:
# Feature Engineering

# Add a size column
def get_size(img):
    """Return the pixel area of an image: first dimension times second dimension of its shape."""
    n_rows = img.shape[0]
    n_cols = img.shape[1]
    return n_rows * n_cols
# Store each image's pixel area (as computed by get_size) in a new 'size' column.
train_df['size'] = train_df['image'].apply(get_size)

# Show size feature and range
print(train_df.columns)
# Transposed summary: a single 'size' row with count/mean/std/min/quartiles/max as columns.
train_df[['size']].describe().T

Index(['index', 'type', 'image', 'label', 'size'], dtype='object')


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
size,26535.0,8169.871867,12364.717747,84.0,1936.0,4340.0,10240.0,410464.0


In [5]:
# Create dataset for Disaster Level Classification (Task B)

# Keep only the hurricane rows: per the balance table above, this is the
# disaster type whose four damage levels are the most evenly represented.
classify_level_df = train_df[(train_df.type == 'hurricane')]
show_balance(classify_level_df)

type       label
hurricane  1        5236
           0        2631
           3        1740
           2        1544
Name: count, dtype: int64

In [6]:
# Find a good validation split size.
# The 0.1945 factor is chosen so that the result lands on a round number
# (300 for the current data) for our puny human brains.
smallest_class_count = classify_level_df.groupby('type')['label'].value_counts().min()
validation_sample_size = int(smallest_class_count * .1945)
print(validation_sample_size)

300


In [7]:
# Find a good augmentation size.
# The 1.2955 factor is chosen so that the result lands on a round number
# (2000 for the current data) for our puny human brains.
min_class_count = classify_level_df.groupby('type')['label'].value_counts().min()
augmentation_size = int(min_class_count * 1.2955)
print(augmentation_size)

2000


In [8]:
# Create validation set for modelling.

# We would like our validation set to be representative of the
# population, so we only draw images whose pixel area lies within the
# interquartile range of train_df['size'].
# The bounds are computed from the data instead of being typed in by hand
# (they evaluate to 1936 and 10240 for the data summarised above).
size_lower, size_upper = train_df['size'].quantile([0.25, 0.75])
in_size_range = classify_level_df['size'].between(size_lower, size_upper)

# Fixed seed so the same validation rows are drawn on every run.
VALIDATION_SEED = 42

rows_before = classify_level_df.shape[0]
print('Before: ',classify_level_df.shape)

validation_parts = []
for T in classify_level_df.type.unique():
    for L in classify_level_df.label.unique():
        in_group = (classify_level_df['type'] == T) & (classify_level_df['label'] == L)
        # Sampling is without replacement, so this raises ValueError when a
        # group has fewer than validation_sample_size rows in the size range.
        validation_parts.append(
            classify_level_df[in_group & in_size_range].sample(
                validation_sample_size, random_state=VALIDATION_SEED))

# Concatenate once (latest group first, matching the previous row order)
# instead of growing a DataFrame inside the loop.
classify_level_valid_df = pd.concat(validation_parts[::-1])

# Drop the validation sample from the parent df.
# NOTE: this rebinding makes the cell non-idempotent; re-running it removes
# another batch of rows from classify_level_df.
classify_level_df = classify_level_df.drop(index=classify_level_valid_df.index)

print('After: ',classify_level_df.shape)

# Every sampled row, and nothing else, must have left the parent frame.
assert classify_level_df.shape[0] == rows_before - classify_level_valid_df.shape[0], \
    'Invalid parent data size, something looks fishy.'

Before:  (11151, 5)
After:  (9951, 5)


In [9]:
from feature_utils import augment_image

# Now augment data until balanced.
# We'll use a mix of oversampling and undersampling...
# - oversample the minority classes by sampling with replacement
# - undersample by randomly dropping rows from the majority classes to match

# Set a reasonable augmentation limit: every (type, label) group ends up
# with exactly this many rows.
sample_size = augmentation_size

# Fixed seed so the row selection is repeatable from run to run.
# NOTE(review): whether augment_image itself is deterministic is not visible
# from this notebook.
AUGMENTATION_SEED = 42

# The rows drawn with replacement are passed through augment_image.
# The same source row can be drawn more than once, so an augmentation may
# occasionally be duplicated.
# Anyway... Cool thing is that changing the seed gives as many distinct
# balanced training sets as we desire.

train_parts = []
for T in classify_level_df.type.unique():
    for L in classify_level_df.label.unique():
        # Select the group once instead of re-filtering for every use.
        group_df = classify_level_df[
            (classify_level_df['type'] == T) & (classify_level_df['label'] == L)]
        record_size = group_df.shape[0]
        if record_size > sample_size:
            # Majority class: undersample without replacement.
            train_parts.append(
                group_df.sample(sample_size, random_state=AUGMENTATION_SEED))
        else:
            # Minority class: keep every original row ...
            train_parts.append(group_df)
            # ... and top up to sample_size with augmented copies.
            extra_df = group_df.sample(
                sample_size - record_size, replace=True,
                random_state=AUGMENTATION_SEED)
            extra_df['image'] = extra_df['image'].apply(augment_image)
            train_parts.append(extra_df)

# Single concat (latest part first, matching the previous row order)
# instead of growing a DataFrame inside the loop.
classify_level_train_df = pd.concat(train_parts[::-1])
count_s = classify_level_train_df.groupby('type')['label'].value_counts()
display(count_s)

type       label
hurricane  0        2000
           1        2000
           2        2000
           3        2000
Name: count, dtype: int64

In [10]:
# Now let's recap our datasets for task B:
# the validation set first, then the balanced training set.
show_balance(classify_level_valid_df)
show_balance(classify_level_train_df)

type       label
hurricane  0        300
           1        300
           2        300
           3        300
Name: count, dtype: int64

type       label
hurricane  0        2000
           1        2000
           2        2000
           3        2000
Name: count, dtype: int64

In [11]:
# Validation-to-training rows per class, taken from the values computed above
# rather than retyped literals (300 / 2000 for the current data).
validation_sample_size / augmentation_size

0.15

---

## Feature Engineering

**Damage Level Classification**
 - The objective is to create a classifier that can automatically determine the level of building damage following a disaster, specifically for hurricanes.

For now we've only added size and used it for undersampling.
These are images so we'll let Gauss do his thing...
...try not to second guess Gauss.


## Modeling

Let's cook up the simplest model we can think of...
For us this is an autoencoder-style network: we encode the convolved image into a latent space, then decode it to our classes.

**Task B**
 - Reshape (180x180, normalize for positive definiteness)
 - Conv2d (180x180)
 - MaxPooling2D (60x60, Dense, ReLu)
 - Dropout
 - Flatten
 - Dense (4 classes) levels 0-3

Consider augmenting the data with 0-45 degree rotation and 100-120% scaling.

In [12]:
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from random import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical

Encode Labels and Resize Images for modeling.

Note: In general, changing aspect ratios has little effect on the ability of CNNs to learn.  We note that changing the aspect ratio increases the number of epochs needed to train to a given loss threshold.  This implies that simpler models might benefit from preserving the aspect ratio.

Things to consider:
 - A model might memorize the low-dimension aspect-ratio rather than high-dimensional image-features.
 - Consider the balance of compute, memory and time it takes for model to converge.
 - Train with cropped large images and zero-background centered small images.
 - Make a more interpretable model, for example use a 2D convolution for each color channel then apply SVD. Plot labels using PCA dimensions.

### Model Task B

In [13]:
from tensorflow.keras.utils import to_categorical

image_dim = 180

# Encode Labels
def encode_level(df, max_level=None):
    """Resize the images and encode the damage level as a scaled float.

    Args:
        df: DataFrame with an 'image' column (arrays accepted by cv2.resize)
            and a numeric 'label' column. A 'level_code' column holding the
            scaled label is written to df in place.
        max_level: Value the labels are divided by. Defaults to the largest
            absolute label present in df. Pass an explicit value (e.g. 3) so
            that several frames share one scale even when one of them does not
            contain the highest level.

    Returns:
        (X, Y): X is the array of images resized to (image_dim, image_dim);
        Y is the 'level_code' column as a float32 Series.
    """
    X = np.array([cv2.resize(img, (image_dim, image_dim)) for img in df['image'].to_list()])
    if max_level is None:
        max_level = df['label'].abs().max()
    # An empty frame (max is NaN) or all-zero labels (max is 0) would turn
    # every code into NaN; fall back to a divisor of 1 in those cases.
    if pd.isna(max_level) or max_level == 0:
        max_level = 1
    df['level_code'] = df['label'] / max_level
    Y = df['level_code'].astype('float32') # let's keep it linear
    return X,Y

# NOTE(review): X_train / Y_train are reassigned in the "New Model" section
# further down, and X_valid / Y_valid are not referenced again in the cells
# shown in this notebook.
X_valid, Y_valid = encode_level(classify_level_valid_df)
X_train, Y_train = encode_level(classify_level_train_df)

### New Model

In [14]:
# Start over from the full hurricane subset for the new model.
# NOTE(review): this rebinding brings back the rows that were removed for the
# validation split earlier, so classify_level_df again holds every hurricane row.
classify_level_df = train_df[(train_df.type == 'hurricane')]
show_balance(classify_level_df)
backend.clear_session()  # clear model

type       label
hurricane  1        5236
           0        2631
           3        1740
           2        1544
Name: count, dtype: int64

In [15]:
# Encode Labels

image_dim = 72 # 0.4 * 180 (note: 180//3 would be 60, not 72)

def encode_level(df):
    """Return (X, Y): images resized to image_dim x image_dim and one-hot labels over 4 classes."""
    target_shape = (image_dim, image_dim)
    resized = [cv2.resize(img, target_shape) for img in df['image'].to_list()]
    one_hot = to_categorical(df.label, num_classes=4)
    return np.array(resized), one_hot

# Encode every row of classify_level_df; this replaces any earlier X_train / Y_train.
X_train, Y_train = encode_level(classify_level_df)

# The frame is no longer needed once the arrays exist.
del(classify_level_df)

In [33]:
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Rescaling, BatchNormalization

# Load a previously saved model if there is one; train only when nothing is cached.
model_name = 'model_1_2_0'
model, history, notes = load_model_data(model_name, path=data_dir)


def build_level_model(input_dim):
    """Build and compile the small CNN used for damage-level prediction.

    Args:
        input_dim: Height and width in pixels of the square 3-channel inputs.

    Returns:
        A compiled Sequential model: batch normalisation on the input, four
        Conv2D/MaxPooling2D/Dropout stages, a 512-unit dense layer, and four
        linear outputs (one per damage level).
    """
    level_model = Sequential([
        Input(shape = (input_dim, input_dim, 3)),
        BatchNormalization(),
        Conv2D(filters = 16, kernel_size = 2, padding = "same", activation = "relu"),
        MaxPooling2D(pool_size = 2),
        Dropout(0.2),
        Conv2D(filters = 32, kernel_size = 2, padding = "same", activation = "relu"),
        MaxPooling2D(pool_size = 2),
        Dropout(0.1),
        Conv2D(filters = 32, kernel_size = 2, padding = "same", activation = "relu"),
        MaxPooling2D(pool_size = 2),
        Dropout(0.2),
        Conv2D(filters = 32, kernel_size = 2, padding = "same", activation = "relu"),
        MaxPooling2D(pool_size = 2),
        Dropout(0.2),
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.4),
        Dense(4, activation='linear'),
    ])
    # Linear outputs are regressed onto the one-hot targets with MSE.
    # Alternative: Dense(4, activation='softmax') with loss='categorical_crossentropy'.
    level_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return level_model


# `is None` rather than `== None`: identity is the intended check.
if model is None:
    # Define our number of splits
    n_splits = 5

    # Seeded so the fold assignment is the same on every run
    # (weight initialisation and augmentation remain random).
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Define our ImageDataGenerator for augmentation.
    # No datagen.fit(...) call is needed: it only computes statistics for the
    # featurewise_* / zca_whitening options, none of which are enabled here.
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True)

    # Loop over our folds; StratifiedKFold needs class ids, hence the argmax.
    for train_index, test_index in skf.split(X_train, Y_train.argmax(axis=1)):
        # Create the train and test sets
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        Y_train_fold, Y_test_fold = Y_train[train_index], Y_train[test_index]

        # A fresh model per fold.
        # NOTE(review): only the model from the last fold survives the loop and
        # is saved below; earlier folds are discarded.
        model = build_level_model(image_dim)

        # The batch size belongs to the generator; fit() must not be given
        # batch_size when its input already yields batches.
        model.fit(
            datagen.flow(X_train_fold, Y_train_fold, batch_size=32),
            validation_data=(X_test_fold, Y_test_fold),
            epochs=10)

    save_model_data(model_name, model, None, 'trained on k=5 folds', path=data_dir)

../Data/models/model_1_2_0
../Data/models/model_1_2_0/history.pkl
Gherkin injested.


In [34]:
# NOTE(review): X_train is the array the K-fold training cell draws its folds
# from, so scores computed from these predictions are not held-out estimates.
prediction = model.predict(X_train)
actual = Y_train



In [35]:
from sklearn.metrics import r2_score
# R^2 between the one-hot targets and the raw model outputs; with
# multi-column input sklearn averages the per-column scores by default.
r2_score(actual, prediction)

0.1474989825663362

In [36]:
from sklearn.metrics import classification_report
import numpy as np

def print_metrics(model, X, Y):
    """Print a per-class classification report for `model` on (X, Y).

    Both the model outputs and Y are reduced to class ids with argmax over
    axis 1, so Y must have one column per class.
    """
    class_names = ['0', '1', '2', '3']
    predicted_ids = np.argmax(model.predict(X), axis = 1)
    true_ids = np.argmax(Y, axis = 1)

    # precision / recall / f1 / support per class
    report = classification_report(true_ids, predicted_ids, target_names = class_names)
    print(report)

# NOTE(review): evaluated on X_train / Y_train, i.e. the data used for fitting.
print_metrics(model, X_train, Y_train)

              precision    recall  f1-score   support

           0       0.55      0.51      0.53      2631
           1       0.67      0.46      0.54      5236
           2       0.35      0.52      0.42      1544
           3       0.48      0.79      0.60      1740

    accuracy                           0.53     11151
   macro avg       0.51      0.57      0.52     11151
weighted avg       0.57      0.53      0.53     11151



In [82]:
# Predict on the hurricane test images, resized exactly like the training images.
level_df = test_df[test_df.type == 'hurricane']
X = np.array([
    cv2.resize(img, (image_dim, image_dim))
    for img in level_df['image'].to_list()
])
Y_pred = model.predict(X)



In [88]:
def custom_activation(y, n_classes=4):
    """Bin scores into integer class ids in the range 0 .. n_classes - 1.

    Each score is multiplied by n_classes and truncated to an integer, then
    clipped into the valid id range. Without the clip a score of exactly 1.0
    (or anything larger) maps to n_classes, and a score below
    -1 / n_classes maps to a negative id.

    Args:
        y: Array-like of scores, nominally in [0, 1].
        n_classes: Number of classes to bin into (default 4).

    Returns:
        Integer ndarray with the same shape as y.
    """
    class_ids = (np.asarray(y) * n_classes).astype(int)
    return np.clip(class_ids, 0, n_classes - 1)
    
# NOTE(review): this binning is superseded; Y_pred_custom is reassigned from
# Y_pred.argmax(axis=1) in the following cell.
Y_pred_custom = custom_activation(Y_pred) #.reshape(-1,)
# Range check of the binned values.
Y_pred_custom.min(), Y_pred_custom.max()

(0, 3)

In [89]:
# Final class id per test image: the column index of the largest model output.
Y_pred_custom= Y_pred.argmax(axis=1)


In [90]:
# Distribution of the predicted class ids: distinct values and how often each occurs.
unique_values, counts = np.unique(Y_pred_custom, return_counts=True)

# Print the results
print("Unique values:", unique_values)
print("Counts:", counts)

Unique values: [0 1 2 3]
Counts: [613 898 557 720]


In [91]:
# Single 'pred' column, one row per image in level_df order.
Y_pred_out = pd.DataFrame(Y_pred_custom, columns=['pred'])
# Written relative to the current working directory, without the index column.
Y_pred_out.to_csv('test_images_hurricane-matthew_predictions.csv', index=False)

In [92]:
Y_pred_out

Unnamed: 0,pred
0,2
1,2
2,2
3,0
4,0
...,...
2783,0
2784,3
2785,2
2786,2


## Notes and Resources

https://learningds.org/ch/19/class_pred.html

https://neptune.ai/blog/keras-loss-functions

https://www.h2kinfosys.com/blog/linear-classifier-with-tensorflow-keras/

Deep Learning with Python, by François Chollet

https://github.com/fchollet/deep-learning-with-python-notebooks/

Python Data Science Handbook, by Jake VanderPlas

https://github.com/jakevdp/PythonDataScienceHandbook