In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from tqdm import tqdm
import cv2
import tifffile
from skimage.transform import resize
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Conv2D, Reshape, Input, Conv2DTranspose
from keras.layers import Activation, LeakyReLU, BatchNormalization, Dropout, Resizing
import dcgan as dcgan
import wgan as wgan
import prgan as prgan
from scores import scores
from scores import preprocess_array

try:
    from tensorflow.keras.optimizers import Adam
except:
    from keras.optimizers import Adam



In [None]:
# --- Global configuration ---------------------------------------------------
NOISE_DIM = 100        # length of each latent vector drawn for the DCGAN generator
BATCH_SIZE = 20        # number of latent vectors / images per training batch
STEPS_PER_EPOCH = 100  # NOTE(review): not referenced elsewhere in the visible notebook
EPOCHS = 50
STEPS = 5              # passed to dcgan.train / wgan.train together with EPOCHS
SEED = 40
WIDTH, HEIGHT, CHANNELS = 75, 75, 3
OPTIMIZER = Adam(0.0002, 0.5)  # positional args: learning rate, beta_1

# SEED was defined but never applied; seed NumPy and TensorFlow so the noise
# draws and the GAN training are repeatable across Restart & Run All.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the training set; 'inc_angle' entries that cannot be parsed as numbers
# are coerced to NaN.
train = pd.read_json('./data/train.json')
train['inc_angle'] = pd.to_numeric(train['inc_angle'],errors='coerce')

In [None]:
# NOTE(review): second, unprocessed load of the same file. `train2` is not
# referenced anywhere else in the visible notebook — confirm it is still needed.
train2 = pd.read_json('./data/train.json')


In [None]:
def get_stats(train,label=1):
    """Add per-image summary statistics of one band as new columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``'band_<label>'`` column whose cells hold the pixel
        values of one image (list or array).
    label : int or str, default 1
        Band suffix; also appended to every new column name.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object (it is modified in place) with the columns
        ``max``, ``maxpos``, ``min``, ``minpos``, ``med``, ``std``, ``mean``,
        ``p25_``, ``p75_`` and ``mid50_`` (each followed by ``label``) added.

    Notes
    -----
    The 25 % / 75 % points are the sorted values at index ``int(q * n)`` where
    ``n`` is the number of pixels in that image. For 75x75 images this is the
    same index the previous hard-coded ``int(q*75*75)`` produced, but it no
    longer raises IndexError for other image sizes.
    """
    print(train.shape)
    suffix = str(label)

    # Convert every band once instead of once per statistic.
    bands = [np.asarray(x).ravel() for x in train['band_' + suffix]]
    sorted_bands = [np.sort(band) for band in bands]

    # Column insertion order is kept identical to the original implementation.
    train['max' + suffix] = [np.max(band) for band in bands]
    train['maxpos' + suffix] = [np.argmax(band) for band in bands]
    train['min' + suffix] = [np.min(band) for band in bands]
    train['minpos' + suffix] = [np.argmin(band) for band in bands]
    train['med' + suffix] = [np.median(band) for band in bands]
    train['std' + suffix] = [np.std(band) for band in bands]
    train['mean' + suffix] = [np.mean(band) for band in bands]
    train['p25_' + suffix] = [band[int(0.25 * band.size)] for band in sorted_bands]
    train['p75_' + suffix] = [band[int(0.75 * band.size)] for band in sorted_bands]
    train['mid50_' + suffix] = train['p75_' + suffix] - train['p25_' + suffix]

    return train


In [None]:
# Add the summary-statistic columns for band 1, then for band 2. get_stats
# writes the columns into `train` and returns that same frame.
train = get_stats(train,1)
train = get_stats(train,2)

In [None]:
def plot_var(name,nbins=50):
    """Overlay class-wise histograms of one column of the global ``train`` frame.

    Parameters
    ----------
    name : str
        Column of ``train`` to plot.
    nbins : int, default 50
        Number of histogram bins; both classes share the same range
        (column minimum to column maximum).

    The legend labels were previously inverted: rows with ``is_iceberg == 1``
    are the icebergs (blue) and rows with ``is_iceberg == 0`` are the boats
    (red), matching how the `icebergs` / `ships` samples are selected elsewhere
    in this notebook.
    """
    minval = train[name].min()
    maxval = train[name].max()
    plt.hist(train.loc[train.is_iceberg==1,name],range=[minval,maxval],
             bins=nbins,color='b',alpha=0.5,label='Iceberg')
    plt.hist(train.loc[train.is_iceberg==0,name],range=[minval,maxval],
             bins=nbins,color='r',alpha=0.5,label='Boat')
    plt.legend()
    plt.xlim([minval,maxval])
    plt.xlabel(name)
    plt.ylabel('Number')
    plt.show()

In [None]:
# Print how many rows have is_iceberg == 1, then draw the class-wise
# histograms for a selection of the summary columns.
print(len(train.loc[train.is_iceberg==1]))
for col in ['inc_angle','min1','max1','std1','med1','mean1','mid50_1']:
    plot_var(col)

In [None]:
# Drop the identifier, the label and the raw band columns; the remaining
# columns feed the correlation matrix.
train_stats = train.drop(['id','is_iceberg','band_1','band_2'],axis=1)

In [None]:
# Pairwise correlation of the columns in train_stats, drawn as a heat map with
# the column names as tick labels on both axes.
corr = train_stats.corr()
fig = plt.figure(1, figsize=(10,10))
plt.imshow(corr,cmap='inferno')
labels = np.arange(len(train_stats.columns))  # one tick position per column
plt.xticks(labels,train_stats.columns,rotation=90)
plt.yticks(labels,train_stats.columns)
plt.title('Correlation Matrix of Global Variables')
cbar = plt.colorbar(shrink=0.85,pad=0.02)
plt.show()

In [None]:
# Fixed-seed samples of 300 rows per class (is_iceberg == 1 and is_iceberg == 0).
icebergs = train[train.is_iceberg==1].sample(n=300,random_state=123)
ships = train[train.is_iceberg==0].sample(n=300,random_state=456)

In [None]:
# Plot the first nine sampled icebergs in a 3x3 grid.
# NOTE(review): only one column is drawn — positional column 1, presumably
# band_1 given the column order of the JSON file; TODO confirm.
fig = plt.figure(1,figsize=(15,15))
for i in range(9):
    ax = fig.add_subplot(3,3,i+1)
    arr = np.reshape(np.array(icebergs.iloc[i,1]),(75,75))
    ax.imshow(arr,cmap='gist_gray')
    
plt.show()

In [None]:
def plot_results(images, n_cols=None):
    '''Visualizes fake images on a grid.

    Parameters
    ----------
    images : array-like
        Batch of images. Each image may be 2-D (already grayscale), have one
        trailing band, or have two trailing bands (shown as their mean).
    n_cols : int, optional
        Number of grid columns; defaults to one row holding every image.

    Raises
    ------
    ValueError
        If a 3-D image has a number of bands other than 1 or 2.

    Previously a single-band batch was squeezed to 2-D images first and then
    every image fell through to the ValueError branch; 2-D images are now
    plotted directly. An empty batch returns without plotting instead of
    dividing by zero.
    '''
    images = np.asarray(images)
    if len(images) == 0:
        return

    n_cols = n_cols or len(images)
    n_rows = (len(images) - 1) // n_cols + 1

    plt.figure(figsize=(n_cols * 2, n_rows * 2))  # 2x2 inches per grid cell

    for index, image in enumerate(images):
        if image.ndim == 2:
            gray = image
        elif image.shape[-1] == 1:
            gray = np.squeeze(image, axis=-1)
        elif image.shape[-1] == 2:
            # Convert to grayscale by taking the mean of the two bands
            gray = np.mean(image, axis=2)
        else:
            raise ValueError("Invalid number of image bands")

        plt.subplot(n_rows, n_cols, index + 1)
        plt.imshow(gray, cmap="inferno")
        plt.axis("off")


In [None]:
# Build the GAN training tensor from the raw band lists.
# NOTE(review): the selection on the next line is dead — X_train is overwritten
# a few lines further down, and the band arrays are built from the full `train`
# frame rather than from `icebergs`. Confirm which of the two was intended.
X_train = icebergs[['band_1', 'band_2']]
X_band_1=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_1"]])
X_band_2=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_2"]])

# adding a new channel: the per-pixel average of band_1 and band_2
X_train = np.concatenate([X_band_1[:, :, :, np.newaxis], X_band_2[:, :, :, np.newaxis],((X_band_1+X_band_2)/2)[:, :, :, np.newaxis]], axis=-1)
print(X_train.shape)

# Pull each channel back out of the stacked tensor ...
band_1 = np.array([np.array(image) for image in X_train[:, :, :, 0]])
band_2 = np.array([np.array(image) for image in X_train[:, :, :, 1]])
band_3 = np.array([np.array(image) for image in X_train[:, :, :, 2]])

# ... and flatten it to one long vector per channel.
band_1 = band_1.flatten().reshape(-1)
band_2 = band_2.flatten().reshape(-1)
band_3 = band_3.flatten().reshape(-1)

# Per-channel mean / std over the whole dataset.
# NOTE(review): these six values are not used by the min-max scaling below.
mean_1 = np.mean(band_1)
std_1 = np.std(band_1)

mean_2 = np.mean(band_2)
std_2 = np.std(band_2)

mean_3 = np.mean(band_3)
std_3 = np.std(band_3)

# Min-max scale each channel to [0, 1] using that channel's dataset-wide extremes.
normalized_band_1 = (band_1 - band_1.min()) / (band_1.max() - band_1.min())
normalized_band_2 = (band_2 - band_2.min()) / (band_2.max() - band_2.min())
normalized_band_3 = (band_3 - band_3.min()) / (band_3.max() - band_3.min())
#normalized_column_1 = (column_1 - mean_1) / std_1
#normalized_column_2 = (column_2 - mean_2) / std_2

# One column per channel, so the reshape below restores the channels-last layout.
X_train_normalized = np.column_stack((normalized_band_1, normalized_band_2, normalized_band_3))
#print(X_train[0])
# Reshape images 
X_train = X_train_normalized.reshape(-1, WIDTH,HEIGHT,CHANNELS)

# Keep only the first two channels (scaled band_1 and band_2)
X_train_array = np.array(X_train[:,:,:,:2])

print(X_train_array.shape)

# `dataset` is the full three-channel tensor; X_train_array is the two-channel one
dataset = X_train
print(X_train_array.dtype)

In [None]:
#Take a look at a iceberg
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
def plotmy3d(c, name):
    """Show the 2-D array `c` as an interactive Plotly surface titled `name`.

    The surface uses a black-to-white colour scale and is drawn on a fixed
    700x700 canvas with explicit margins.
    """
    surface = go.Surface(z=c, colorscale=[[0, 'black'], [1, 'white']])
    canvas_margin = dict(l=65, r=50, b=65, t=90)
    canvas = go.Layout(
        title=name,
        autosize=False,
        width=700,
        height=700,
        margin=canvas_margin,
    )
    py.iplot(go.Figure(data=[surface], layout=canvas))
# Surface plot of the band_1 image at index 12 of the training set.
plotmy3d(X_band_1[12,:,:], 'Real data')

In [None]:
from scipy import ndimage

def sample_images(noise, subplots, generator, prefix, figsize=(22,8), save=False):
    """Generate images from `noise`, show them on a grid and return them.

    Parameters
    ----------
    noise : array-like
        Input batch passed unchanged to ``generator.predict``.
    subplots : tuple(int, int)
        (rows, columns) of the display grid; must hold every generated image.
    generator : model
        Object exposing ``predict(noise)`` that returns a batch of images.
    prefix : str
        Prefix for every file written by this call.
    figsize : tuple, default (22, 8)
        Size of the grid figure.
    save : bool, default False
        When true, the finished grid is written once to
        ``<prefix>_gen_grid.png``.

    Returns
    -------
    list of numpy.ndarray
        One array per generated image, resized to the global (WIDTH, HEIGHT)
        if the generator produced a different spatial size.

    Side effects
    ------------
    For multi-channel output, the channel-mean of every image is written to
    ``<prefix>_gen_img<i>.png`` (inferno colour map).

    Changes from the previous version: the grid is saved once after the loop
    under a prefixed name (it used to be re-saved on every iteration as
    ``gen<i>``, so each model overwrote the previous model's files); the
    resize now handles any spatial size rather than only 64x64x2; the debug
    dump of whole image arrays, the unused in-loop imports and the no-op
    concatenate/transpose round trip are gone; the single-channel path now
    also contributes to the returned list.
    """
    generated_images = generator.predict(noise)
    print(generated_images.shape)

    plt.figure(figsize=figsize)
    samples = []
    for i, image in enumerate(generated_images):
        plt.subplot(subplots[0], subplots[1], i+1)
        if CHANNELS == 1:
            single_band = np.array(image).reshape((WIDTH, HEIGHT))
            plt.imshow(single_band, cmap='gray')
            samples.append(single_band[:, :, np.newaxis])
        else:
            image = np.array(image)  # private copy; the generator output stays untouched
            if image.shape[:2] != (WIDTH, HEIGHT):
                # Bilinear resize of the two spatial axes; channels are left alone.
                zoom_factors = (WIDTH / image.shape[0], HEIGHT / image.shape[1], 1)
                image = ndimage.zoom(image, zoom_factors, order=1)

            samples.append(image)
            combined_image = np.mean(image, axis=2)

            plt.imshow(combined_image, cmap='gist_gray')
            # Save the channel-mean image as PNG
            plt.imsave(prefix + '_gen_img' + str(i) + '.png', combined_image, cmap='inferno')

        plt.axis('off')

    plt.tight_layout()
    if save:
        plt.savefig(prefix + '_gen_grid.png')
    plt.show()
    return samples

In [None]:
# Fixed latent batch handed to dcgan.train: BATCH_SIZE vectors of length NOISE_DIM.
noise_dcgan = np.random.normal(0,1, size=(BATCH_SIZE, NOISE_DIM))
# NOTE(review): noise_wgan is not passed to wgan.train below, and it is
# reassigned in a later cell before it is used for sampling.
noise_wgan = tf.random.normal([32, 75*75*2])
# Build and train the DCGAN, then train the WGAN, both on X_train_array. Each
# training call also returns the generator loss values that are plotted later.
generator_dcgan, discriminator_dcgan, dcgan_model = dcgan.build(OPTIMIZER, NOISE_DIM)
generator_dcgan_loss_values = dcgan.train(generator_dcgan, discriminator_dcgan, dcgan_model, noise_dcgan, EPOCHS, STEPS, BATCH_SIZE, NOISE_DIM, X_train_array)
generator_wgan, generator_wgan_loss_values = wgan.train(EPOCHS, STEPS, BATCH_SIZE, NOISE_DIM, X_train_array)


In [None]:
# Persist the trained generators to the working directory.
# Save the generator_dcgan model as an h5 file
generator_dcgan.save('generator_dcgan.h5')

# Save the generator_wgan model as an h5 file
generator_wgan.save('generator_wgan.h5')

In [None]:
import importlib 
# Re-execute the prgan module so edits to it are picked up without restarting the kernel.
importlib.reload(prgan) 
# NOTE(review): this import is redundant — reload() already refreshed the bound module.
import prgan as prgan

# prgan.build receives the two-channel training tensor and returns the
# generator together with its loss values.
generator_prgan, generator_prgan_loss_values = prgan.build(X_train_array)

In [None]:
# Save the generator_prgan model as an h5 file
generator_prgan.save('generator_prgan.h5')

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm

from scipy import stats
import numpy as np
 
def outlier_removal(array):
    """Replace outlier rows by the column mean of the remaining rows.

    Parameters
    ----------
    array : array-like or pandas.DataFrame
        Values to clean. A DataFrame is copied (the caller's frame is not
        modified and its column labels are kept); anything else is converted
        with ``pd.DataFrame(np.array(array))``.

    Returns
    -------
    pandas.DataFrame
        Frame of the same shape as the input.

    Notes
    -----
    The fences are ``Q1 - 1.5*(Q3 - Q1)`` and ``Q3 + 1.5*(Q3 - Q1)`` where Q1
    and Q3 are the 30 % and 70 % quantiles (not the usual 25 % / 75 %).
    A row is an outlier when any of its values lies strictly outside the
    fences; the whole row is then set to NaN and filled with the per-column
    mean of the surviving rows.

    Fixes over the previous version: the input check compared the instance
    with the class (``array is not pd.DataFrame``), which is always true; and
    the comparisons were inclusive, so constant input (zero spread) was
    flagged entirely and came back as all-NaN.
    """
    if isinstance(array, pd.DataFrame):
        df = array.copy()
    else:
        df = pd.DataFrame(np.array(array))
    print(df.shape)

    # Calculate the upper and lower fences
    Q1 = df.quantile(0.30)
    Q3 = df.quantile(0.70)
    IQR = Q3 - Q1
    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR

    # Boolean mask of the rows holding at least one value outside the fences
    outlier_rows = ((df > upper) | (df < lower)).any(axis=1)

    # Blank the outlier rows, then fill them with the mean of the kept rows
    df.loc[outlier_rows, :] = np.nan
    df = df.fillna(df.mean())

    # Print the new shape of the DataFrame
    print("New Shape: ", df.shape)

    return df

# Clean the PRGAN and WGAN generator-loss curves and give each frame a readable column name.
df_generator_prgan_loss_values = outlier_removal(generator_prgan_loss_values)
df_generator_prgan_loss_values.rename(columns={df_generator_prgan_loss_values.columns[0]: 'PRGAN'}, inplace=True)

df_generator_wgan_loss_values = outlier_removal(generator_wgan_loss_values)
df_generator_wgan_loss_values.rename(columns={df_generator_wgan_loss_values.columns[0]: 'WGAN'}, inplace=True)

# NOTE(review): the DCGAN curve is used as recorded, without outlier removal.
array_generator_dcgan_loss_values = np.array(generator_dcgan_loss_values)
df_generator_dcgan_loss_values = pd.DataFrame(array_generator_dcgan_loss_values, columns=['DCGAN'])

# Only the first 50 PRGAN values are plotted next to the other two curves.
# NOTE(review): 50 equals EPOCHS in the config cell — confirm that is the intended link.
df_combined = pd.concat([df_generator_dcgan_loss_values, df_generator_wgan_loss_values, df_generator_prgan_loss_values[:50]], axis=1)

# Set Seaborn style
sns.set_style("ticks")
sns.set_context("paper")

# Shade the 95 % confidence band of a straight-line (OLS) fit for each curve
for column in df_combined.columns:
    model = sm.OLS(df_combined[column], sm.add_constant(df_combined.index))
    results = model.fit()
    predictions = results.get_prediction(sm.add_constant(df_combined.index))
    conf_int = predictions.conf_int(alpha=0.05)

    plt.fill_between(df_combined.index, conf_int[:, 0], conf_int[:, 1], alpha=0.3)

# `errorbar=` replaces the deprecated `ci=` argument of seaborn.lineplot.
sns.lineplot(data=df_combined, errorbar='sd', markers=False)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss evolution during the training')

# Adding a legend
plt.legend()

# Save, then display the plot
plt.savefig("chart-loss.pdf", format="pdf")
plt.show()


# Scatter plot of every cleaned PRGAN loss value against its 1-based position.
# The axis labels used to be leftovers from an unrelated example
# ("body mass index of people" / "bp of the people").
output = list(range(1, len(df_generator_prgan_loss_values) + 1))
fig, ax = plt.subplots(figsize = (10,6))
ax.scatter(output, df_generator_prgan_loss_values.iloc[:, 0])

# x-axis label
ax.set_xlabel('Iteration')

# y-axis label
ax.set_ylabel('PRGAN generator loss (outliers removed)')
plt.show()



In [None]:
from numpy.random import randn

# 100 latent vectors of length NOISE_DIM for the DCGAN generator, and 10 noise
# inputs of length 75*75*2 for the WGAN generator.
noise_dcgan = np.random.normal(0, 1, size=(100, NOISE_DIM))
noise_wgan = tf.random.normal([10, 75*75*2])

# Draw each batch on a 10x10 grid; sample_images also writes PNG files whose
# names start with the given prefix and returns the generated images as a list.
samples_dcgan = sample_images(noise_dcgan, (10,10), generator_dcgan, 'dcgan', (24,20), save=True)
samples_wgan = sample_images(noise_wgan, (10,10), generator_wgan, 'wgan', (24,20), save=True)


In [None]:
# Draw 1000 standard-normal values for the PRGAN generator.
noise_prgan = randn(100 * 10)
# reshape into a batch of inputs for the network: 10 vectors of length 100
noise_prgan = noise_prgan.reshape(10, 100)
samples_prgan = sample_images(noise_prgan, (10,10), generator_prgan, 'prgan', (24,20), save=True)

In [None]:
# Create a figure and a set of subplots
fig, axes = plt.subplots(4, 5, figsize=(25, 25))
# Titles for each row
titles = ["Real Images", "DCGAN", "WGAN", "PRGAN"]

# Adjust layout
plt.subplots_adjust(left=0.05, right=0.95, top=0.92, bottom=0.08, wspace=0.1, hspace=0.0)

# Iterate through rows and columns to plot images
for i in range(4):
    # Set the title for the entire row
    
    for j in range(5):
        if i == 0:
            axes[i, j].axis("off")
            fig.text(0.02, 0.83 - i * 0.25, titles[i], va='center', ha='center', rotation='vertical', fontsize=25)
            axes[i, j].imshow(X_train_array[j][:, :, 0], cmap='gray')
            if j == 0:
                axes[i, j].axis("on")
        elif i == 1:
            axes[i, j].axis("off")
            fig.text(0.02, 0.86 - i * 0.25, titles[i], va='center', ha='center', rotation='vertical', fontsize=25)
            axes[i, j].imshow(samples_dcgan[(j * 2) % len(samples_dcgan)][:, :, 0], cmap='gray')
            if j == 0:
                axes[i, j].axis("on")
        elif i == 2:
            axes[i, j].axis("off")
            fig.text(0.02, 0.9 - i * 0.25, titles[i], va='center', ha='center', rotation='vertical', fontsize=25)
            axes[i, j].imshow(samples_wgan[(j * 2) % len(samples_wgan)][:, :, 0], cmap='gray')
            if j == 0:
                axes[i, j].axis("on")
        elif i == 3:
            axes[i, j].axis("off")
            fig.text(0.02, 0.94 - i * 0.25, titles[i], va='center', ha='center', rotation='vertical', fontsize=25)
            axes[i, j].imshow(samples_prgan[(j * 2) % len(samples_prgan)][:, :, 0], cmap='gray')
            if j == 0:
                axes[i, j].axis("on")

        
        axes[i, j].tick_params(axis='both', which='both', labelsize=23, length=8)


plt.show()

In [None]:
# Stack each list of generated samples into a single array.
samples_dcgan_array = np.array(samples_dcgan)
samples_wgan_array = np.array(samples_wgan)
samples_prgan_array = np.array(samples_prgan)
# NOTE(review): this rebinds X_train_array from the two-channel slice created in
# the data-preparation cell to the full X_train tensor, so re-running the GAN
# training cells after this one would hand them a different array.
X_train_array = np.array(X_train)

# Pass the generated samples through the project's preprocess_array helper.
# The real images are not passed through it in this cell.
samples_dcgan_array = preprocess_array(samples_dcgan_array)
samples_wgan_array = preprocess_array(samples_wgan_array)
samples_prgan_array = preprocess_array(samples_prgan_array)


In [None]:
import scores
# Re-execute the scores module so edits to it are picked up; `importlib` itself
# is imported in an earlier cell.
importlib.reload(scores) 
# From here on the name `scores` refers to the function, not the module.
from scores import scores, calculate_bhattacharyya, calculate_chi_square


# NOTE(review): the shapes printed are for the first 30 items, but only the
# first 10 of each array are scored below.
print(X_train_array[:30].shape)
print(samples_dcgan_array[:30].shape)
#fid, fid_error, bc, bc_error, chi_square, chi_square_error, correlation, correlation_error, intersection, intersection_error
#    return fid, fid_error, bhattacharyya, bhattacharyya_error, chi_square, chi_square_error, correlation, correlation_error, intersection, intersection_error

# Each call is unpacked into ten values: a value and an error for FID,
# Bhattacharyya, chi-square, correlation and intersection, in that order.
dcgan_fid, dcgan_fid_error, dcgan_bc, dcgan_bc_error, dcgan_cs, dcgan_cs_error, dcgan_corr, dcgan_corr_error, dcgan_inter, dcgan_inter_error = scores(X_train_array[:10], samples_dcgan_array[:10])
wgan_fid, wgan_fid_error, wgan_bc, wgan_bc_error, wgan_cs, wgan_cs_error, wgan_corr, wgan_corr_error, wgan_inter, wgan_inter_error = scores(X_train_array[:10], samples_wgan_array[:10])
prgan_fid, prgan_fid_error, prgan_bc, prgan_bc_error, prgan_cs, prgan_cs_error, prgan_corr, prgan_corr_error, prgan_inter, prgan_inter_error = scores(X_train_array[:10], samples_prgan_array[:10])


In [None]:
# Baseline: score the first ten real images against the next ten real images.
baseline_fid, baseline_fid_error, baseline_bc, baseline_bc_error, baseline_cs, baseline_cs_error, baseline_corr, baseline_corr_error, baseline_inter, baseline_inter_error = scores(X_train_array[:10], X_train_array[10:20])

In [None]:
# Data for FID comparison
model_names = ['Baseline', 'DCGAN', 'WGAN', 'PRGAN']

# FID values and their errors are both rescaled by 1/1000 for plotting.
fid_means = [baseline_fid/1000, dcgan_fid/1000, wgan_fid/1000, prgan_fid/1000]
fid_errors = [baseline_fid_error/1000, dcgan_fid_error/1000, wgan_fid_error/1000, prgan_fid_error/1000]
print(fid_means)

bc_means = [baseline_bc, dcgan_bc, wgan_bc, prgan_bc]
bc_errors = [baseline_bc_error, dcgan_bc_error, wgan_bc_error, prgan_bc_error]
print(bc_means)

cs_means = [baseline_cs, dcgan_cs, wgan_cs, prgan_cs]
cs_errors = [baseline_cs_error, dcgan_cs_error, wgan_cs_error, prgan_cs_error]
print(cs_means)

# The WGAN entry used to be wgan_bc_error (copy-paste slip).
# NOTE(review): the correlation lists are computed but not plotted below.
corr_means = [baseline_corr, dcgan_corr, wgan_corr, prgan_corr]
corr_errors = [baseline_corr_error, dcgan_corr_error, wgan_corr_error, prgan_corr_error]
print(corr_means)

# The errors used to be divided by 1000 while the means were not, which shrank
# the error bars a thousand-fold relative to the plotted values.
inter_means = [baseline_inter, dcgan_inter, wgan_inter, prgan_inter]
inter_errors = [baseline_inter_error, dcgan_inter_error, wgan_inter_error, prgan_inter_error]
print(inter_means)


# Create a figure with four subplots (2 x 2)
fig, ((ax1, ax2), (ax3, ax5)) = plt.subplots(2, 2, figsize=(12, 10))

# Set larger font size
font_size = 16

# One entry per panel: axis, values, errors, y label, annotation, tick label size
panels = [
    (ax1, fid_means, fid_errors, 'Frechet Inception Distance (FID)', 'Lower FID is better', font_size - 2),
    (ax2, bc_means, bc_errors, 'Bhattacharyya Coefficient (BC)', 'Lower BC is better', font_size - 2),
    (ax3, cs_means, cs_errors, 'Chi-Square Coefficient (Chi-C)', 'Lower Chi-C is better', font_size),
    (ax5, inter_means, inter_errors, 'Intersection Coefficient (IC)', 'Higher IC is better', font_size),
]

# Each panel is a line chart across the models with error bars on top
for panel_ax, means, errors, y_label, note, tick_size in panels:
    sns.lineplot(x=model_names, y=means, marker='o', color='red', ax=panel_ax)
    panel_ax.errorbar(x=model_names, y=means, yerr=errors, fmt='none', color='black', capsize=5)
    panel_ax.set_ylabel(y_label, fontsize=font_size)
    panel_ax.text(0, 1.05, note, transform=panel_ax.transAxes, fontsize=font_size)
    panel_ax.tick_params(axis='both', which='major', labelsize=tick_size)

# Adjust the spacing between subplots, save, then show the plot. The second
# tight_layout()/show() pair that followed only produced an empty figure.
plt.tight_layout()
plt.savefig("results.pdf", format="pdf")
plt.show()

In [None]:
import pandas as pd
import numpy as np

def get_df_from_images(images, label=0):
    """Pack a batch of two-band images into a train.json-style DataFrame.

    Parameters
    ----------
    images : sequence of array-like
        Images of shape (height, width, channels) with at least two channels;
        channel 0 becomes ``band_1`` and channel 1 becomes ``band_2``.
    label : int or float, default 0
        Value written to the ``is_iceberg`` column of every row. In this
        notebook ``is_iceberg == 0`` selects the ships and ``== 1`` the
        icebergs. The column is stored as float, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (``image_1``, ``image_2``, ...), ``band_1`` and
        ``band_2`` (flattened 1-D arrays), ``inc_angle`` (all zeros) and
        ``is_iceberg``. ``id`` stays a regular column; the earlier
        ``set_index(..., inplace=False)`` call discarded its result and has
        been removed.
    """
    num_images = len(images)

    # Generate unique 1-based IDs, one per image
    image_ids = [f"image_{i}" for i in range(1, num_images + 1)]

    # Flatten each of the first two channels to a 1-D array per image
    band_1_values = [np.asarray(image)[:, :, 0].flatten() for image in images]
    band_2_values = [np.asarray(image)[:, :, 1].flatten() for image in images]

    return pd.DataFrame({
        'id': image_ids,
        'band_1': band_1_values,
        'band_2': band_2_values,
        'inc_angle': np.zeros(num_images),  # placeholder: no incidence angle exists for these images
        'is_iceberg': np.full(num_images, float(label)),
    })


In [None]:
# COMBINE REAL AND SYNTHETIC DATASETS
# First two channels of the real tensor concatenated with each model's
# generated samples along the batch axis.
# NOTE(review): the combined_train_* arrays are not referenced again in the
# visible notebook.
combined_train_dcgan = np.concatenate([X_train_array[:, :, :, :2], samples_dcgan], axis=0)
combined_train_wgan = np.concatenate([X_train_array[:, :, :, :2], samples_wgan], axis=0)
combined_train_prgan = np.concatenate([X_train_array[:, :, :, :2], samples_prgan], axis=0)

In [None]:
# Turn the generated samples into a train.json-style frame and save it.
# NOTE(review): only the DCGAN samples are used here, despite the generic name.
samples_gan_df = get_df_from_images(samples_dcgan)
samples_gan_df.to_json('samples_gan.json', orient='records')

# Add the same summary-statistic columns that the real training frame has.
samples_gan_df = get_stats(samples_gan_df, 1)
samples_gan_df = get_stats(samples_gan_df, 2)

print(len(samples_gan_df))

# Number of distinct labels in the real training data. The print used to
# reference an undefined name (distinct_values_count) and raised NameError.
num_classes = train['is_iceberg'].nunique()
print(f"Number of distinct values in 'is_iceberg': {num_classes}")

In [None]:
# Load the test set and list its columns.
test = pd.read_json('./data/test.json')
print(test.columns)

In [None]:
# CELL TO PERFORM THE DATA CLEANING AND SPLITTING
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, concatenate
from keras.optimizers import Adam
from keras.utils import to_categorical

# Extract image arrays, non-image features, and labels
X_image = train[['band_1', 'band_2']].values  # Image data
X_other_features = train.drop(['id', 'band_1', 'band_2', 'is_iceberg', 'inc_angle'], axis=1)  # Non-image features
y = train['is_iceberg'].values  # Labels

X_synthetic_image = samples_gan_df[['band_1', 'band_2']].values
X_synthetic_other_features = samples_gan_df.drop(['id', 'band_1', 'band_2', 'is_iceberg', 'inc_angle'], axis=1)  # Non-image features
y_synthetic = samples_gan_df['is_iceberg'].values  # Labels

num_other_features = len(X_other_features.columns)

# Preprocess image data (resize, normalize, etc.)
X_image = np.vstack([np.hstack(arr) for arr in X_image])
X_image = X_image.reshape(-1, 75, 75, 2)

X_synthetic_image = np.vstack([np.hstack(arr) for arr in X_synthetic_image])
X_synthetic_image = X_synthetic_image.reshape(-1, 75, 75, 2)

# Split the dataset into training and testing sets
X_image_train, X_image_test, X_other_train, X_other_test, y_train, y_test = train_test_split(
    X_image, X_other_features.values, y, test_size=0.2, random_state=42
)

# Concatenate the image arrays vertically
X_image_train = np.vstack((X_image_train, X_synthetic_image))

# Concatenate the other feature arrays vertically
X_other_train = np.vstack((X_other_train, X_synthetic_other_features))

# Concatenate the labels
y_train = np.concatenate((y_train, y_synthetic))
# Assuming y_test contains the true labels (0 or 1)
class_counts = np.bincount(y_test)

# Assuming class 0 represents "Iceberg" and class 1 represents "Ship"
iceberg_count = class_counts[0]
ship_count = class_counts[1]

print(f'Number of Iceberg samples in y_test: {iceberg_count}')
print(f'Number of Ship samples in y_test: {ship_count}')

In [None]:
# Define the CNN model for image data
# Define the CNN branch for image data
image_input = Input(shape=(75, 75, 2))
conv1 = Conv2D(32, (3, 3), activation='relu')(image_input)
pool1 = MaxPooling2D((2, 2))(conv1)
conv2 = Conv2D(64, (3, 3), activation='relu')(pool1)
pool2 = MaxPooling2D((2, 2))(conv2)
flatten = Flatten()(pool2)

# Define the dense branch for the scalar features
other_input = Input(shape=(num_other_features,))
dense1 = Dense(64, activation='relu')(other_input)

# Combine the two branches
merged = concatenate([flatten, dense1])

# Classification head: one softmax unit per class
output = Dense(num_classes, activation='softmax')(merged)

# Create the hybrid model
model = Model(inputs=[image_input, other_input], outputs=output)

# Compile the model (`learning_rate` replaces the deprecated `lr` argument)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Convert labels to one-hot encoding. The ndim guard keeps this cell safe to
# re-run: labels that are already one-hot are left as they are.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes)

# Keras takes the validation split from the END of the arrays, before any
# shuffling. The synthetic rows were appended last, so without this
# permutation the validation set would consist mostly of synthetic images
# and those images would hardly be trained on.
shuffle_order = np.random.default_rng(SEED).permutation(len(y_train))

# Train the model
model.fit(
    [X_image_train[shuffle_order], X_other_train[shuffle_order]],
    y_train[shuffle_order],
    epochs=10, batch_size=32, validation_split=0.1,
)

# Evaluate the model on the held-out real data
test_loss, test_acc = model.evaluate([X_image_test, X_other_test], y_test)
print(f'Test accuracy: {test_acc}')

In [None]:
# NOTE(review): repeats the column listing already printed when `test` was loaded.
print(test.columns)

In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test data: one row of class probabilities per sample
predictions = model.predict([X_image_test, X_other_test])
print(predictions[:5])

# Pick the most probable class per sample. Thresholding every softmax column
# at 0.5 produced a multilabel indicator matrix instead of one label each.
predicted_labels = np.argmax(predictions, axis=1)
# y_test is one-hot after the training cell; accept plain labels as well.
true_labels = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)

# is_iceberg == 0 is a ship and is_iceberg == 1 an iceberg (the names were swapped).
class_names = ["Ship (0)", "Iceberg (1)"]
# `labels` pins the row order so the names stay aligned even if a class is absent.
report = classification_report(true_labels, predicted_labels, labels=[0, 1], target_names=class_names)

print("Classification Report:")
print(report)