In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt

import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.activations import relu

In [None]:
# Read the training and test CSV files into DataFrames.
# NOTE(review): both paths point at the filesystem root; adjust them if the
# files live elsewhere.
training_file_path = '/training.csv'
training_data = pd.read_csv(training_file_path)

test_file_path = '/testing.csv'
test_data = pd.read_csv(test_file_path)

# Only the last bare expression of a cell is rendered, so a bare
# `training_data.columns` in the middle of the cell shows nothing.
# Print both column indexes explicitly instead.
print(training_data.columns)
print(test_data.columns)

In [None]:
# Split each DataFrame into a label vector (first column) and a feature
# table (every remaining column), then preview the first rows of each part.

# Training split
X_trains, y_trains = training_data.iloc[:, 1:], training_data.iloc[:, 0]
for preview in (X_trains, y_trains):
    print(preview.head())

# Test split
X_tests, y_tests = test_data.iloc[:, 1:], test_data.iloc[:, 0]
for preview in (X_tests, y_tests):
    print(preview.head())

In [None]:
# Scatter plot of the 2nd feature against the 3rd, one colour per class label.

# The 2nd and 3rd feature columns (positional indices 1 and 2).
feature_2 = X_trains.iloc[:, 1]
feature_3 = X_trains.iloc[:, 2]

# Distinct labels present in the training targets.
unique_labels = y_trains.unique()

# One random RGB triple per label. `random` is not seeded in this cell, so
# the colours change from run to run.
label_colors = {}
for label in unique_labels:
    label_colors[label] = [random.random() for _ in range(3)]

fig, ax = plt.subplots(figsize=(8, 6))

# Draw each class as its own series so that it gets its own legend entry.
for label in unique_labels:
    rows_with_label = y_trains == label
    ax.scatter(
        feature_2[rows_with_label],
        feature_3[rows_with_label],
        color=label_colors[label],
        label=label,
    )

ax.legend(title='Label')
ax.set_xlabel('Feature 2')
ax.set_ylabel('Feature 3')
ax.set_title('Scatter plot between Feature 2 and Feature 3')
plt.show()

In [None]:
# Class balance of the training labels: total row count and the share of
# rows carried by each label, expressed as a percentage.
total_data = y_trains.shape[0]
label_counts = y_trains.value_counts()
label_percentages = label_counts.div(total_data).mul(100)

print(f"Total number of data: {total_data}")
print("Percentage of each label:")
for class_label, share in label_percentages.items():
    print(f"{class_label}: {share:.2f}%")

In [None]:
# Convert the feature tables to plain NumPy arrays.
X_trains = np.array(X_trains)
X_tests = np.array(X_tests)

# Learn the label -> integer mapping from the training labels only.
# LabelEncoder.fit() replaces any previously learned classes rather than
# extending them, so the encoder must be fitted exactly once. Fitting it on
# the test labels afterwards would leave a mapping built from the test split
# alone and make transform(y_trains) fail for any class missing from it.
label_encoder = LabelEncoder()
y_trains = label_encoder.fit_transform(y_trains)

# Encode the test labels with the same mapping. A test label that never
# occurs in the training data raises ValueError here, which is the desired
# signal: the classifier has no output for such a class.
y_tests = label_encoder.transform(y_tests)

In [None]:
# Rebalance the training classes in two passes, both configured with the
# 'auto' sampling strategy and the same fixed seed:
#   1. SMOTE (Synthetic Minority Over-sampling Technique) over-samples the
#      minority classes;
#   2. RandomUnderSampler under-samples the majority class.
# NOTE(review): this runs before the cross-validation split further down, so
# synthetic rows may be derived from rows that later land in a validation
# fold -- confirm this is acceptable.
resampler_options = dict(sampling_strategy='auto', random_state=42)
smote = SMOTE(**resampler_options)
rus = RandomUnderSampler(**resampler_options)

# Apply the passes in order; each one replaces the training arrays.
for resampler in (smote, rus):
    X_trains, y_trains = resampler.fit_resample(X_trains, y_trains)

In [None]:
# Scale the features with a RobustScaler whose statistics come from the
# training data only; the test data is transformed with that same fitted
# scaler so no test information enters the scaling parameters.
scaler = RobustScaler()
X_trains = scaler.fit_transform(X_trains)
X_tests = scaler.transform(X_tests)

In [None]:
# Sanity-check the prepared arrays: for the training split and then the test
# split, print the shapes, the dtypes and the first 10 rows of X and y.
for split, X_split, y_split in (
    ("trains", X_trains, y_trains),
    ("tests", X_tests, y_tests),
):
    # Shapes
    print(f"Shape of X_{split}: {X_split.shape}")
    print(f"Shape of y_{split}: {y_split.shape}")

    # Data types
    print(f"Data type of X_{split}: {X_split.dtype}")
    print(f"Data type of y_{split}: {y_split.dtype}")

    # Leading rows
    print(f"First 10 rows of X_{split}: {X_split[:10]}")
    print(f"First 10 rows of y_{split}: {y_split[:10]}")

In [None]:
tf.random.set_seed(1234) # for consistent results


def _hidden_layer(units, name):
    """Return a ReLU Dense layer of `units` width with an L2(0.1) kernel penalty."""
    return Dense(
        units,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(0.1),
        name=name,
    )


# Fully connected classifier: 28 inputs -> 128 -> 64 -> 32 -> 6 linear outputs.
# The last layer is linear, i.e. the model emits raw scores rather than
# probabilities.
model = Sequential(
    [
        tf.keras.layers.InputLayer((28,)),
        _hidden_layer(128, "L1"),
        _hidden_layer(64, "L2"),
        _hidden_layer(32, "L3"),
        Dense(6, activation="linear", name="L4"),
    ],
    name="landCover_model",
)
model.summary()

In [None]:
# Inspect the kernel (W) and bias (b) shapes of every layer.
# The unpacking requires `model.layers` to hold exactly four layers.
layer1, layer2, layer3, layer4 = model.layers

# Each get_weights() call is unpacked into a (kernel, bias) pair.
(W1, b1), (W2, b2), (W3, b3), (W4, b4) = (
    layer.get_weights() for layer in (layer1, layer2, layer3, layer4)
)

for position, (kernel, bias) in enumerate(
    ((W1, b1), (W2, b2), (W3, b3), (W4, b4)), start=1
):
    print(f"W{position} shape = {kernel.shape}, b{position} shape = {bias.shape}")

In [None]:
# Stratified 5-fold cross-validation of `model`, followed by a final fit on
# the complete training set so that `model` is ready for the evaluation
# cells below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Snapshot the current weights so every fold starts from the same state.
# Without a reset, the single model instance would carry weights learned in
# earlier folds -- whose training splits contain the current fold's
# validation rows -- and the per-fold validation metrics would be inflated.
# NOTE: re-run the model-definition cell before re-running this cell,
# otherwise the snapshot captures already-trained weights.
initial_weights = model.get_weights()


def _reset_and_compile():
    """Restore the snapshot weights and compile `model` with a fresh Adam optimizer."""
    model.set_weights(initial_weights)
    model.compile(
        # The output layer is linear, so the loss consumes raw logits.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    )


# One Keras history dict (with 'loss' and 'val_loss') per fold.
loss_histories = []

# NOTE(review): X_trains was resampled before this split, so synthetic rows
# may share information with validation rows -- treat the fold metrics as
# optimistic until resampling is moved inside the loop.
for fold, (train_index, val_index) in enumerate(skf.split(X_trains, y_trains), start=1):
    print(f"Fold {fold}:")

    # Split the data into training and validation sets for the current fold
    X_train, X_val = X_trains[train_index], X_trains[val_index]
    y_train, y_val = y_trains[train_index], y_trains[val_index]

    # Start this fold from the untrained snapshot with fresh optimizer state
    _reset_and_compile()

    # Train the model
    history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))
    loss_histories.append(history.history)

    # The predicted class is the index of the largest logit
    predictions = model.predict(X_val)
    predicted_classes = np.argmax(predictions, axis=1)

    # Calculate macro-averaged precision, recall, and f1-score
    precision = precision_score(y_val, predicted_classes, average='macro')
    recall = recall_score(y_val, predicted_classes, average='macro')
    f1 = f1_score(y_val, predicted_classes, average='macro')

    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')

# Final model: retrain from the snapshot on all training rows. This history
# is deliberately not appended to `loss_histories`, which holds only the
# per-fold curves (each with a 'val_loss' entry).
_reset_and_compile()
model.fit(X_trains, y_trains, epochs=50, batch_size=64)

In [None]:
# One figure per fold, showing training and validation loss by epoch.
for fold_number, fold_history in enumerate(loss_histories, start=1):
    fig, ax = plt.subplots()

    # Training and validation curves recorded for this fold
    ax.plot(fold_history['loss'], label='Training loss')
    ax.plot(fold_history['val_loss'], label='Validation loss')

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title(f'Loss Curve for Fold {fold_number}')
    ax.legend()

    # Render this fold's figure before moving on to the next one
    plt.show()

In [None]:
# Single figure with every fold's loss curves laid end to end on a shared
# epoch axis: each fold starts where the previous one stopped.
fig, ax = plt.subplots()

epoch_offset = 0  # x position at which the current fold begins
for fold_number, fold_history in enumerate(loss_histories, start=1):
    training_curve = fold_history['loss']
    validation_curve = fold_history['val_loss']

    ax.plot(
        epoch_offset + np.arange(len(training_curve)),
        training_curve,
        label=f'Training loss - Fold {fold_number}',
    )
    ax.plot(
        epoch_offset + np.arange(len(validation_curve)),
        validation_curve,
        label=f'Validation loss - Fold {fold_number}',
    )

    # Shift the next fold to the right by this fold's epoch count
    epoch_offset += len(training_curve)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Overall Loss Curve for 5 Folds')
ax.legend()  # One entry per fold and curve type
plt.show()

In [None]:
# Evaluate the model on the training arrays (as resampled and scaled by the
# earlier cells). The predicted class is the index of the largest output.
predictions = model.predict(X_trains)
predicted_classes = predictions.argmax(axis=1)

# Accuracy and confusion matrix, then the macro-averaged scores.
accuracy = accuracy_score(y_trains, predicted_classes)
confusion_mat = confusion_matrix(y_trains, predicted_classes)
precision, recall, f1 = (
    scorer(y_trains, predicted_classes, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    f'Accuracy: {accuracy:.4f}',
    f'Confusion Matrix:\n{confusion_mat}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1-score: {f1:.4f}',
    sep='\n',
)

In [None]:
# Evaluate the model on the held-out test arrays. The predicted class is the
# index of the largest output.
predictions = model.predict(X_tests)
predicted_classes = np.argmax(predictions, axis=1)

# Macro averaging weights every class equally, regardless of its frequency.
macro = {'average': 'macro'}

accuracy = accuracy_score(y_tests, predicted_classes)
confusion_mat = confusion_matrix(y_tests, predicted_classes)
precision = precision_score(y_tests, predicted_classes, **macro)
recall = recall_score(y_tests, predicted_classes, **macro)
f1 = f1_score(y_tests, predicted_classes, **macro)

report_lines = [
    f'Accuracy: {accuracy:.4f}',
    f'Confusion Matrix:\n{confusion_mat}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1-score: {f1:.4f}',
]
for report_line in report_lines:
    print(report_line)