# Machine learning-based framework for prospectivity mapping of critical minerals

### Ehsan Farahbakhsh<sup>1</sup>, Jack Maughan<sup>2</sup>, R. Ditmar M&uuml;ller<sup>1</sup>

<sup>1</sup>*EarthByte Group, School of Geosciences, University of Sydney, NSW 2006, Australia*

<sup>2</sup>*Datarock Pty Ltd., Level 3, 31 Queen Street, VIC 3000, Australia*

This notebook lets the user generate synthetic positive samples with SMOTE-GAN, an improved generative adversarial network (GAN) that combines the synthetic minority over-sampling technique (SMOTE) with a GAN.

### Libraries

In [None]:
from imblearn.over_sampling import SMOTE
from ipywidgets import interact
import matplotlib.pyplot as plt
import numpy as np
from numpy import genfromtxt
from numpy import ones
from numpy import zeros
from numpy.random import randint
from numpy.random import randn
import os
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

### SMOTE

In [None]:
# uncomment the target commodity
commodity = 'Co'
# commodity = 'Cr'
# commodity = 'Ni'

# read the features/labels table of the selected commodity
features_labels_file = f'./{commodity}/features_labels_{commodity}_nogeochem.csv'
features_labels = pd.read_csv(features_labels_file, index_col=False)

# every column with exactly two distinct values is recorded as categorical;
# the 'label' column is then taken out of that list
categorical_features_columns = [
    column
    for column in features_labels.columns
    if features_labels[column].nunique() == 2
]
categorical_features_columns.remove('label')

# remember the column names, then split the array:
# the last column becomes the labels, all other columns the features
features_list_label = features_labels.columns.tolist()
features_labels = features_labels.to_numpy()
features, labels = features_labels[:, :-1], features_labels[:, -1]

# SMOTE
smote = SMOTE(random_state=1)
X_sm, y_sm = smote.fit_resample(features, labels)
# re-attach the resampled labels as the last column
smote_samples = np.concatenate((X_sm, y_sm[:, None]), axis=1)
# keep only the rows labelled 1 and drop the label column again
is_positive = smote_samples[:, -1] == 1
X_positive = smote_samples[is_positive, :-1]

### GAN

In [None]:
# define the standalone discriminator model
def define_discriminator(n_inputs):
    """Build and compile the discriminator.

    The network has one hidden ReLU layer with ``int(n_inputs*0.5)`` units
    followed by a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and an accuracy metric.

    Args:
        n_inputs: number of input values per sample.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    hidden_units = int(n_inputs*0.5)
    hidden_layer = Dense(
        hidden_units,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=n_inputs,
    )
    output_layer = Dense(1, activation='sigmoid')
    model = Sequential([hidden_layer, output_layer])
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# define the standalone generator model
def define_generator(latent_dim, n_outputs):
    """Build the (uncompiled) generator.

    The network has one hidden ReLU layer with ``int(n_outputs*0.75)`` units
    followed by a linear output layer with ``n_outputs`` units.

    Args:
        latent_dim: number of values in each latent input vector.
        n_outputs: number of values produced per generated sample.

    Returns:
        The Keras ``Sequential`` model; it is not compiled here.
    """
    hidden_units = int(n_outputs*0.75)
    hidden_layer = Dense(
        hidden_units,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=latent_dim,
    )
    output_layer = Dense(n_outputs, activation='linear')
    return Sequential([hidden_layer, output_layer])

# define the combined generator and discriminator model for updating the generator
def define_gan(generator, discriminator):
    """Stack the generator on top of the discriminator and compile the result.

    Note that ``discriminator.trainable`` is set to ``False`` on the object
    that is passed in, before the combined model is built.

    Args:
        generator: model used as the first stage of the stack.
        discriminator: model used as the second stage of the stack.

    Returns:
        The combined ``Sequential`` model, compiled with binary cross-entropy
        loss and the Adam optimizer.
    """
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # generator first, discriminator second
    model = Sequential([generator, discriminator])
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# default sample count: number of rows minus twice the sum of the labels
# (for 0/1 labels this is the count of zeros minus the count of ones)
n_samples = int(features.shape[0] - 2*labels.sum())

# sample real data
def sample_real_data(n=n_samples):
    """Draw ``n`` random rows (with replacement) from ``X_positive``.

    Args:
        n: number of rows to draw; defaults to ``n_samples``.

    Returns:
        A tuple ``(rows, targets)`` where ``targets`` is an ``(n, 1)`` array
        of ones.
    """
    row_ids = randint(X_positive.shape[0], size=n)
    return X_positive[row_ids, :], ones((n, 1))

# generate points in the latent space as input for the generator
def generate_latent_points(latent_dim, n=n_samples):
    """Draw ``n`` latent vectors of length ``latent_dim`` from a standard normal.

    Args:
        latent_dim: number of values per latent vector.
        n: number of latent vectors; defaults to ``n_samples``.

    Returns:
        An array of shape ``(n, latent_dim)``.
    """
    # draw all values in one flat vector, then fold it into a batch
    flat_noise = randn(latent_dim * n)
    return flat_noise.reshape(n, latent_dim)

# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n=n_samples):
    """Run the generator on ``n`` freshly drawn latent vectors.

    Args:
        generator: model whose ``predict`` method maps latent vectors to samples.
        latent_dim: number of values per latent vector.
        n: number of samples to generate; defaults to ``n_samples``.

    Returns:
        A tuple ``(samples, targets)`` where ``targets`` is an ``(n, 1)`` array
        of zeros.
    """
    latent_batch = generate_latent_points(latent_dim, n)
    generated = generator.predict(latent_batch, verbose=0)
    return generated, zeros((n, 1))

# default number of training iterations
n_epochs = 20000
# output path of the table written at the end of training
smote_gan_file = f'./{commodity}/smote_gan_{commodity}_nogeochem.csv'

# evaluate the discriminator
def summarize_performance(epoch, generator, discriminator, latent_dim):
    """Score the discriminator and return one batch of real and fake samples.

    Args:
        epoch: current iteration index (not used inside this function).
        generator: model used to produce the fake samples.
        discriminator: compiled model to evaluate.
        latent_dim: number of values per latent vector.

    Returns:
        A tuple ``(x_real, x_fake, acc_real, acc_fake)``.
    """
    # one default-sized batch of real rows; the matching targets are not needed
    real_batch, _ = sample_real_data()
    # NOTE(review): this accuracy is computed on the module-level X_sm / y_sm,
    # not on the batch sampled above -- confirm that this is intended.
    _, real_accuracy = discriminator.evaluate(X_sm, y_sm, verbose=0)
    # one default-sized batch of generated rows with all-zero targets
    fake_batch, fake_targets = generate_fake_samples(generator, latent_dim)
    _, fake_accuracy = discriminator.evaluate(fake_batch, fake_targets, verbose=0)
    return real_batch, fake_batch, real_accuracy, fake_accuracy
    
# scatter plot real and fake data points
def _plot_real_vs_fake(x_real, x_fake):
    """Show real (red) and generated (blue) samples in a 2-D and a 3-D scatter plot."""
    # NOTE(review): feature columns 0, 14 and 17 are hard-coded and presumably
    # correspond to the axis labels below -- confirm if the feature set changes.
    # NOTE(review): 'Bouger' is probably meant to read 'Bouguer'; the label
    # text is left unchanged here.
    fig = plt.figure(figsize=[15, 6])
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.scatter(x_real[:, 0], x_real[:, 14], c='red')
    ax1.scatter(x_fake[:, 0], x_fake[:, 14], c='blue')
    ax1.set_xlabel('Total Magnetic Intensity')
    ax1.set_ylabel('Bouger Anomaly')
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')
    ax2.scatter(x_real[:, 0], x_real[:, 14], x_real[:, 17], c='red')
    ax2.scatter(x_fake[:, 0], x_fake[:, 14], x_fake[:, 17], c='blue')
    ax2.set_xlabel('Total Magnetic Intensity')
    ax2.set_ylabel('Bouger Anomaly')
    ax2.set_zlabel('Radiation Dose Rate')
    plt.show()


# append the generated samples to the original table and write it to disk
def _save_smote_gan_samples(x_fake):
    """Label ``x_fake`` with ones, append it to ``features_labels`` and save the table.

    Columns listed in ``categorical_features_columns`` are thresholded at 0.5
    to 0/1 for every row. The table is written to ``smote_gan_file`` and
    returned as a DataFrame.
    """
    fake_labels = ones((x_fake.shape[0], 1))
    labelled_fake = np.concatenate((x_fake, fake_labels), axis=1)
    combined = np.concatenate((features_labels, labelled_fake), axis=0)
    combined = pd.DataFrame(combined, columns=features_list_label)
    for column in categorical_features_columns:
        values = combined[column]
        # whole-column assignment: per-cell chained assignment
        # (frame[column][row] = ...) does not update the frame when pandas
        # Copy-on-Write is active, and is far slower otherwise
        combined[column] = (values > 0.5).astype(values.dtype)
    combined.to_csv(smote_gan_file, index=False)
    return combined


# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, n_epochs=n_epochs, n_batch=128, n_eval=1000):
    """Train the GAN, then save the original table extended with generated samples.

    Each iteration updates the discriminator on half a batch of real rows and
    half a batch of generated rows, then updates the generator through the
    combined model using targets of one. Every ``n_eval`` iterations the
    discriminator accuracies are printed and scatter plots are shown.

    Args:
        g_model: generator model.
        d_model: compiled discriminator model.
        gan_model: compiled combined model.
        latent_dim: number of values per latent vector.
        n_epochs: number of training iterations (at least 1).
        n_batch: batch size for the generator update; the discriminator is
            updated on ``int(n_batch / 2)`` real and fake rows each.
        n_eval: evaluate and plot every ``n_eval`` iterations.

    Returns:
        The DataFrame that was written to ``smote_gan_file``.

    Raises:
        ValueError: if ``n_epochs`` is smaller than 1.
    """
    if n_epochs < 1:
        raise ValueError('n_epochs must be at least 1')
    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    # generated samples to save; filled in when the last iteration is evaluated
    final_fake = None
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        x_real, y_real = sample_real_data(half_batch)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in the latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            # summarize discriminator performance
            x_real, x_fake, acc_real, acc_fake = summarize_performance(i, g_model, d_model, latent_dim)
            print(i+1, acc_real, acc_fake)
            _plot_real_vs_fake(x_real, x_fake)
            if i == n_epochs-1:
                # reuse the default-sized batch that was just evaluated and plotted
                final_fake = x_fake
    if final_fake is None:
        # the last iteration was not an evaluation step, so only the small
        # discriminator half-batch exists; generate a default-sized batch instead
        final_fake, _ = generate_fake_samples(g_model, latent_dim)
    return _save_smote_gan_samples(final_fake)

In [None]:
# number of feature columns; it sizes both networks
n_features = features.shape[1]
# size of the latent space
latent_dim = int(n_features*0.5)
# create the discriminator, the generator and the combined gan
discriminator = define_discriminator(n_features)
generator = define_generator(latent_dim, n_features)
gan_model = define_gan(generator, discriminator)
# train model
smote_gan_samples = train(
    generator,
    discriminator,
    gan_model,
    latent_dim,
)