In [None]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, Normalizer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
import PIL.Image as Image

### Preprocessing

In [None]:
# Constants
# Size tuple handed to Image.resize for every image that gets loaded.
IMAGE_DIM = (200, 200)    # TODO hard-coded

In [None]:
# Load the training and test tables, using each file's `id` column as the index
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ("train.csv", "test.csv")
)

#### Image Preprocessing

In [None]:
# Column groups used to decide which preprocessing each column receives.
# Every column of train_df other than the target `species` is treated as numeric.
numeric_features = list(train_df.columns.drop('species'))
# No columns are currently listed as binary.
binary_features = list()

# Loads an image and applies some basic preprocessing
# to it (resizing, black and white)
def load_image(filename):
    """Open an image file, resize it to IMAGE_DIM and convert it to 1-bit mode.

    Parameters
    ----------
    filename : str or path-like
        Path of the image file to open.

    Returns
    -------
    PIL.Image.Image
        The resized image in mode '1' (black and white), converted
        without dithering.
    """
    # Image.open is lazy and keeps the underlying file open. The context
    # manager closes it as soon as resize() has read the pixel data, so
    # loading many images in a row does not exhaust file descriptors.
    with Image.open(filename) as img:
        return img.resize(IMAGE_DIM).convert(
            mode='1',   # black and white
            dither=Image.Dither.NONE,
        )

# Converts a Pillow image to a 1D numpy array of pixel data
def image_to_flat_array(img):
    """Return the pixel data of `img` as a one-dimensional numpy array."""
    pixels = np.array(img)
    return pixels.ravel()

# Loads images with specified indices into one-dimensional
# pixel data and concatenates it all into a single dataframe
def images_to_df(indices, image_dir='images'):
    """Load `<image_dir>/<id>.jpg` for every id and stack the pixels into a DataFrame.

    Parameters
    ----------
    indices : pandas Index or any iterable of ids
        Ids of the images to load; they also become the index of the result.
    image_dir : str, default 'images'
        Directory that contains the `<id>.jpg` files.

    Returns
    -------
    pandas DataFrame
        One row per image (indexed by its id) and one column per flattened
        pixel value. Column labels are the pixel positions as strings.
    """
    # Normalise to an Index up front. Passing a plain list to set_index()
    # would be read as a list of *column* keys, not as the new index values.
    indices = pd.Index(indices)

    imgs = [load_image(f'{image_dir}/{i}.jpg') for i in indices]

    # Converts each image to flat 1D representation (one row per image)
    pixels = np.asarray(imgs).reshape((len(imgs), -1))
    df = pd.DataFrame(pixels, index=indices)
    df.columns = df.columns.astype(str)     # Prevents some obscure errors later
    return df

In [None]:
# Build the pixel DataFrames for the ids of the training set and of the test set
img_train_df, img_test_df = map(images_to_df, (train_df.index, test_df.index))

In [None]:
# Cast every pixel column to int
# (presumably booleans coming from the 1-bit images — TODO confirm)
img_train_df, img_test_df = (
    pixel_df.astype(int) for pixel_df in (img_train_df, img_test_df)
)

In [None]:
# The image pixel columns go through the preprocessor untouched
passthrough_features = img_train_df.columns.tolist()

In [None]:
# Pipeline for the numeric columns: a single Normalizer step
# (rescales to unit norm)
numeric_pipeline = make_pipeline(Normalizer())

# Column transformer/preprocessor: the numeric columns run through the
# pipeline above, the image pixel columns are passed through unchanged
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    ('passthrough', passthrough_features),
)

#### Concatenated Dataframes & Column Transformations

In [None]:
# Concatenates the image data with the metadata into one dataframe
full_train_df = pd.concat([train_df, img_train_df], axis=1)
full_test_df = pd.concat([test_df, img_test_df], axis=1)

# We want to split X (big X because it is a matrix) and y from each other
full_X_train = full_train_df.drop(columns=['species'])
full_y_train = full_train_df['species']

# Funnily enough, the test has no ground truth...
full_X_test = full_test_df

# Fit the preprocessor on the training data only, then apply the same fitted
# transformation to the test data. The result is wrapped in a DataFrame that
# keeps the original `id` index, so the transformed rows stay label-aligned
# with full_y_train and test_df instead of falling back to 0..n-1.
full_X_train_transformed = pd.DataFrame(
    preprocessor.fit_transform(full_X_train),
    index=full_X_train.index,
)
full_X_test_transformed = pd.DataFrame(
    preprocessor.transform(full_X_test),
    index=full_X_test.index,
)

In [None]:
# Sanity check: (rows, columns) of the preprocessed training frame
print(full_X_train_transformed.shape)

### Models

In [None]:
# Let's define a pipeline for all preprocessing and processing.
# The seed is fixed so that a fresh "Restart & Run All" reproduces the same
# random baseline instead of drawing a new random_state on every run.
np.random.seed(42)

full_pipeline = make_pipeline(
    preprocessor,
    DummyClassifier(
        strategy="uniform",
        random_state=42,
    )
)

# NOTE: full_pipeline already contains the preprocessor, so it has to be
# scored on the *untransformed* full_X_train (not full_X_train_transformed):
# cv_scores = cross_val_score(full_pipeline, full_X_train, full_y_train, cv=5)

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        passed through unchanged to cross_validate

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", one entry
        per key of the cross_validate result
    """

    # Build the per-fold table once; both statistics are derived from it
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Walk both Series in parallel instead of indexing them with integers:
    # they are labelled by metric name, and integer keys on a label-indexed
    # Series are deprecated as positional lookups in recent pandas.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

##### Manual Dummy Classifier

In [None]:
# Dummy classifier that randomly guesses the species a leaf belongs to.
# The seed is fixed so the random guesses, and therefore the scores computed
# from them, are identical on every run of the notebook.
np.random.seed(42)
dummy = DummyClassifier(
    strategy="uniform",
    random_state=42,
)
dummy.fit(full_X_train_transformed, full_y_train)

In [None]:
# 5-fold cross-validation of the random-guess baseline on the preprocessed
# training data, scored with the estimator's default scorer
cv_scores = cross_val_score(dummy, full_X_train_transformed, full_y_train, cv=5)

In [None]:
# One score per cross-validation fold
print(cv_scores)

## CSV Output

In [None]:
# Class probabilities for every test sample:
# one row per test id, one column per class known to the fitted classifier
proba = pd.DataFrame(
    data=dummy.predict_proba(full_X_test_transformed),
    index=test_df.index,
    columns=dummy.classes_,
)


In [None]:
# Output to CSV for submission
# (the frame's index, i.e. the test ids, is written as the first column)
proba.to_csv('checkpoint1.csv')

# Testing with a deep learning (DL) model on the images

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Extracting only the image data from the transformed DataFrames.
# The column transformer emits the passthrough (pixel) columns last, so the
# trailing len(passthrough_features) columns are exactly the image data;
# deriving the width this way keeps the slice correct if IMAGE_DIM changes.
image_data_train = full_X_train_transformed.iloc[:, -len(passthrough_features):]
image_data_test = full_X_test_transformed.iloc[:, -len(passthrough_features):]

In [None]:
# Sanity check: shapes of the pixel-only train/test frames and of the label Series
print(image_data_train.shape)
print(image_data_test.shape)
print(full_y_train.shape)

In [None]:
# Relabel the pixel columns 0..n-1, taking n from each frame's own width
# rather than from a hard-coded pixel count
image_data_train.columns = range(image_data_train.shape[1])
image_data_test.columns = range(image_data_test.shape[1])

In [None]:
# Encode the species labels as integer class ids
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(full_y_train)

# Hold out 20% of the training data as a validation set, stratified by class
# so the split keeps the class proportions; the seed keeps it repeatable
X_train, X_val, y_train_encoded_split, y_val_encoded_split = train_test_split(
    image_data_train,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_train_encoded,
)

print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")


In [None]:
# Count missing values in the training features and in the encoded training labels
print("NaN values in X_train:", X_train.isna().sum().sum())
print("NaN values in y_train:", np.count_nonzero(np.isnan(y_train_encoded_split)))


In [None]:
import torch.nn.functional as F

class MLP(nn.Module):
    """Fully connected classifier: input_dim -> 1024 -> 512 -> 256 -> output_dim.

    Each of the three hidden layers is followed by a ReLU; the first two are
    additionally followed by dropout with p=0.5. The last layer's output is
    returned as-is (no activation is applied to it).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # First hidden block
        self.fc1 = nn.Linear(input_dim, 1024)
        self.dropout1 = nn.Dropout(p=0.5)
        # Second hidden block
        self.fc2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(p=0.5)
        # Third hidden layer and output layer
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, output_dim)

    def forward(self, x):
        hidden = self.dropout1(F.relu(self.fc1(x)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        hidden = F.relu(self.fc3(hidden))
        return self.fc4(hidden)


In [None]:
# Convert data to tensors: float32 features, int64 class ids
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded_split, dtype=torch.int64)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_encoded_split, dtype=torch.int64)

# Seed torch before the model is built so the weight initialisation (and the
# dropout masks drawn afterwards) are the same on every fresh run
torch.manual_seed(42)

# Define the model; the input width is read from the data instead of being
# hard-coded, so it follows any change of the image size
n_classes = full_y_train.nunique()
model = MLP(input_dim=X_train_tensor.shape[1], output_dim=n_classes)

# Define the loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Training parameters
epochs = 10
batch_size = 32

# Make sure dropout is active. The validation cell switches the model to eval
# mode and that state persists in the kernel, so re-running this cell after
# it would otherwise train with dropout disabled.
model.train()

# Training loop
for epoch in range(epochs):
    epoch_loss = 0.0

    for i in range(0, len(X_train_tensor), batch_size):
        # Get the mini-batch data; slicing clamps at the end of the tensor,
        # so the last batch is simply shorter
        inputs = X_train_tensor[i:i + batch_size]
        labels = y_train_tensor[i:i + batch_size]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch; weight it by the
        # batch size so the epoch figure is a true per-sample mean
        epoch_loss += loss.item() * inputs.size(0)

    # Print statistics: mean loss over the whole epoch, not just the last batch
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss / len(X_train_tensor)}")

print('Finished Training')

In [None]:
# Switch to evaluation mode
model.eval()

correct_predictions = 0
total_predictions = 0

# No gradients are needed while scoring
with torch.no_grad():
    for i in range(0, len(X_val_tensor), batch_size):
        # Last index (exclusive) of the current mini-batch
        end_idx = min(i + batch_size, len(X_val_tensor))
        inputs, labels = X_val_tensor[i:end_idx], y_val_tensor[i:end_idx]

        # Predicted class = position of the largest output per row
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)

        total_predictions += len(labels)
        correct_predictions += int((predicted == labels).sum())

accuracy = 100 * correct_predictions / total_predictions
print(f'Accuracy on the validation set: {accuracy:.2f}%')
