In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, Normalizer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
import PIL.Image as Image

### Preprocessing

In [2]:
# Notebook-wide configuration
# Size every image is resized to (passed to Image.resize).  TODO hard-coded
IMAGE_DIM = (200, 200)

In [3]:
# Load the training and test tables, both indexed by their 'id' column
train_df, test_df = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ("train.csv", "test.csv")
)

#### Image Preprocessing

In [4]:
# Column groups used to decide which preprocessing each column receives:
# every metadata column except the 'species' label is numeric, and the
# binary list starts out empty.
numeric_features = list(train_df.drop(columns='species').columns)
binary_features = []

def load_image(filename, size=None):
    """Load an image and apply basic preprocessing (resizing, black and white).

    Parameters
    ----------
    filename
        Path (or file object) handed to ``PIL.Image.open``.
    size : 2-tuple, optional
        Target size passed to ``Image.resize``. When omitted, the
        notebook-wide ``IMAGE_DIM`` constant is used (looked up at call time).

    Returns
    -------
    PIL.Image.Image
        A new image in mode ``'1'`` (black and white), converted without
        dithering.
    """
    if size is None:
        size = IMAGE_DIM

    # The context manager closes the underlying file as soon as the pixel data
    # has been processed; resize/convert return new images that do not depend
    # on the source file staying open.
    with Image.open(filename) as source:
        return source.resize(size).convert(
            mode='1',   # black and white
            dither=Image.Dither.NONE,
        )

def image_to_flat_array(img):
    """Return the pixel data of a Pillow image as a 1D numpy array."""
    pixels = np.array(img)
    return pixels.ravel()

def images_to_df(indices, image_dir='images'):
    """Load the images with the given ids into a single pixel dataframe.

    Each image ``<image_dir>/<id>.jpg`` is loaded with ``load_image`` and
    flattened to one row of pixel data.

    Parameters
    ----------
    indices
        Ids of the images to load (e.g. a dataframe index). They become the
        index of the returned dataframe.
    image_dir : str, optional
        Directory containing the ``<id>.jpg`` files. Defaults to ``'images'``.

    Returns
    -------
    pandas.DataFrame
        One row per id and one column per pixel, with string column names.
    """
    # Converts each image to flat 1D representation
    pixel_rows = [
        image_to_flat_array(load_image(f'{image_dir}/{i}.jpg'))
        for i in indices
    ]

    # The ids go in through `index=` rather than `set_index`, because
    # `set_index` would interpret a plain list of ids as *column labels* and
    # silently turn pixel columns into the index.
    df = pd.DataFrame(
               np.asarray(pixel_rows).reshape((len(pixel_rows), -1)),
               index=indices,
           )
    df.columns = df.columns.astype(str)     # Prevents some obscure errors later
    return df

In [5]:
# Build the flattened pixel tables for the ids of each dataset
img_train_df, img_test_df = (
    images_to_df(frame.index) for frame in (train_df, test_df)
)

# Register every pixel column of the training images with the preprocessor lists
binary_features = img_train_df.columns.tolist()

In [6]:
# Define the pipelines for each type of data we have
numeric_pipeline = make_pipeline(
    # Normalize all of them to unit norm
    Normalizer()
)
binary_pipeline = make_pipeline(
    OneHotEncoder()
)

# Define our column transformer/preprocessor itself
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (binary_pipeline, binary_features)
)

#### Concatenated Dataframes & Column Transformations

In [7]:
# Concatenates the image data with the metadata into one dataframe
full_train_df = pd.concat([train_df, img_train_df], axis=1)
full_test_df = pd.concat([test_df, img_test_df], axis=1)

# We want to split X (big X because it is a matrix) and y from each other
full_X_train = full_train_df.drop(columns=['species'])
full_y_train = full_train_df['species']

# Funnily enough, the test has no ground truth...
full_X_test = full_test_df

# Now we want to fit our preprocessor onto our data, so we can actually transform it (then cast it to a DF)
full_X_train_transformed = pd.DataFrame(preprocessor.fit_transform(full_X_train))
full_X_test_transformed = pd.DataFrame(preprocessor.fit_transform(full_X_test))

### Models

In [8]:
# Let's define a pipeline for all preprocessing and processing
# A fixed seed makes the random baseline reproducible under Restart & Run All
# (previously the seed was re-drawn from OS entropy on every run).
PIPELINE_SEED = 0
np.random.seed(PIPELINE_SEED)

full_pipeline = make_pipeline(
    preprocessor,
    DummyClassifier(
        strategy = "uniform",
        random_state = PIPELINE_SEED
    )
)

# Cross-validation is left disabled. NOTE: full_pipeline already contains the
# preprocessor, so it must be given the raw features (full_X_train), not the
# already-transformed frame.
# cv_scores = cross_val_score(full_pipeline, full_X_train, full_y_train, cv=5)

#### Manual Dummy Classifier

In [9]:
# Dummy classifier that randomly guesses the species a leaf belongs to
# A fixed seed keeps this baseline reproducible across kernel restarts
DUMMY_SEED = 0
np.random.seed(DUMMY_SEED)
dummy = DummyClassifier(
    strategy="uniform",
    random_state=DUMMY_SEED
)
# Reuse the X / y split defined earlier; y is passed as a 1-D Series
dummy.fit(full_X_train, full_y_train)