In [1]:
import os
import constants as c
import yaml
import numpy as np
import datetime
import s3fs
import boto3
import matplotlib.pyplot as plt

from matplotlib import gridspec
from sklearn.decomposition import PCA
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input
from models import ResnetTripletEmbedding, triplet_loss
from umap import UMAP
from tqdm import tqdm

from landcover import unpack_array

secrets = yaml.load(open(os.path.join(c.BASE_DIR, '.secrets.yml')), Loader=yaml.FullLoader)

2023-08-27 10:01:18.892138: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


### Script parameters

In [2]:
epochs         = 50
batch_size     = 32
embed_dim      = 16
num_filters    = 64
n_linear       = 64
n_conv_blocks  = 2
n_train_files  = 30
model_filename = "resnet-triplet-lc.keras"
s3_region      = "us-east-1"

### Load training data from S3

In [3]:
# Initialize s3fs using aws_aceess_key_id and aws_secret_access_key
fs = s3fs.S3FileSystem(
    key=secrets['aws_access_key_id'],
    secret=secrets['aws_secret_access_key'],
    client_kwargs={'region_name': s3_region}
)

s3_client = boto3.client(
    's3',
    aws_access_key_id=secrets['aws_access_key_id'],
    aws_secret_access_key=secrets['aws_secret_access_key'],
    region_name=s3_region
)


# Read all files in the bucket c.S3_BUCKET and key 'landcover' with file extension .npy
# and store them in a list
files = fs.ls(os.path.join(c.S3_BUCKET, 'landcover'))
files = [f for f in files if f.endswith('.npy')]
print('Found {} files'.format(len(files)))

arrays = []

files_to_read = files[0:n_train_files]
print(f"Preparing to read {len(files_to_read)} files")

for f in files_to_read:
    # Read each file in the list and append it to the arrays list
    print('....Reading {}'.format(f))
    arrays.append(np.load(fs.open(f)))

Found 281 files
Preparing to read 30 files
....Reading lql-data/landcover/lulc-patches-pairs-32x32-102420.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-103028.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-108132.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-110513.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-112374.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-11850.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-121111.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-128142.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-131464.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-141004.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-141960.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-142622.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-144478.npy
....Reading lql-data/landcover/lulc-patches-pairs-32x32-145292.npy
....Reading lql-data

### Convert from integer to one-hot

In [4]:
xs_one_hot = np.concatenate([unpack_array(xs) for xs in tqdm(arrays)], axis=0)
anchors, positives, negatives = xs_one_hot[:, 0], xs_one_hot[:, 1], xs_one_hot[:, 2]
labels = np.zeros((len(anchors), 1))
print(f"Loaded {len(arrays)} files; resulting stacked array has shape {xs_one_hot.shape}")

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [01:17<00:00,  2.57s/it]


### Model training

In [None]:
# Input shape for data is (H, W, C)
input_shape = xs_one_hot.shape[2:]

triplet_model, embedding_network = initalize_triplet(
    input_shape,
    n_conv_blocks,
    embed_dim,
    num_filters,
    n_linear
)

NameError: name 'xs_one_hot' is not defined

In [None]:
history = triplet_model.fit([anchors, positives, negatives], labels, epochs=epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Count model parameters

In [None]:
embedding_network.summary()

Model: "resnet_triplet_embedding"


__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 32, 32, 23)]         0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 32, 32, 32)           6656      ['input_1[0][0]']             
                                                                                                  
 conv2d_1 (Conv2D)           (None, 32, 32, 32)           9248      ['conv2d[0][0]']              
                                                                                                  
 batch_normalization (Batch  (None, 32, 32, 32)           128       ['conv2d_1[0][0]']            
 Normalization)                                                                                   
          

### Counting inactive dimensions

In [None]:
zs = embedding_network(xs_one_hot[0:1024, 0]).numpy()

# transform with pca
pca = PCA(n_components=embed_dim)
zs_pca = pca.fit_transform(zs)

# Print out the number of eigenvalues
# needed to reach threshold of variance explained
variance_threshold = 0.99
variance_explained = np.cumsum(pca.explained_variance_ratio_)
n_components = np.where(variance_explained > variance_threshold)[0][0]
print(f"Number of components needed to reach {variance_threshold} variance explained: {n_components}")

Number of components needed to reach 0.99 variance explained: 5


### Save model on S3

In [None]:
model_path = os.path.join('temp', model_filename)

embedding_network.save(model_path) 
s3_client.upload_file(model_path, c.S3_BUCKET, f"models/{model_filename}")
os.remove(model_path)

### Visualize embedding space in 2D / sniff check on embedding values

In [None]:
pca = PCA(n_components=embed_dim)
zs = embedding_network(xs_one_hot[0:1024, 0]).numpy()
zs_pca = pca.fit_transform(zs)

# Create a figure
fig = plt.figure(figsize=(10, 6))

# Create a GridSpec layout
gs = gridspec.GridSpec(2, 2, width_ratios=[1, 1], height_ratios=[1, 1])

# Create the subplots
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[0, 1])
ax2 = plt.subplot(gs[1, :])

# Plot of embeddings without PCA
ax0.imshow(zs[0:64])
ax0.set_ylabel("Different vectors")
ax0.set_xlabel("Vector dimensions")

# Plot of embeddings with PCA
ax1.imshow(zs_pca[0:64])
ax1.set_ylabel("Different vectors")
ax1.set_xlabel("Vector dimensions (PCA)")

# Plot of UMAP projection of embeddings
reducer = UMAP()
ws = reducer.fit_transform(zs)
hb = ax2.hexbin(ws[:, 0], ws[:, 1], cmap='viridis', bins='log')
ax2.set_title("Hexbin log density for\nUMAP projection of embeddings")
plt.colorbar(hb, ax=ax2, orientation='vertical')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.savefig("figures/embeddings.png")

NameError: name 'embedding_network' is not defined