In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, Normalizer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
import PIL.Image as Image

### Preprocessing

In [2]:
# Constants
IMAGE_DIM = 200, 200    # TODO hard-coded

In [3]:
# Read in training and test data
train_df = pd.read_csv("CSV/train.csv", index_col='id')
test_df = pd.read_csv("CSV/test.csv", index_col='id')

#### Image Preprocessing

In [4]:
# Specify the type of each column for us to know what preprocessing needs to be done to each column
numeric_features = [i for i in train_df.drop(columns='species').columns]
binary_features = []

# Loads an image and applies some basic preprocessing
# to it (resizing, black and white)
def load_image(filename):
    return Image.open(filename) \
                .resize(IMAGE_DIM) \
                .convert(
                    mode='1',   # black and white
                    dither=Image.Dither.NONE,
                )

# Converts a Pillow image to a 1D numpy array of pixel data
def image_to_flat_array(img):
    return np.array(img).reshape((-1))

# Loads images with specified indices into one-dimensional
# pixel data and concatenates it all into a single dataframe
def images_to_df(indices):
    # TODO filepath hard-coded, change if needed
    imgs = [
        load_image(f'images/{i}.jpg') for i in indices
    ]

    # Converts each image to flat 1D representation
    df = pd.DataFrame(
               np.asarray(imgs).reshape((len(imgs), -1))
           ).set_index(indices)
    df.columns = df.columns.astype(str)     # Prevents some obscure errors later
    return df

In [5]:
# Load the data corresponding to each dataset
img_train_df = images_to_df(train_df.index)
img_test_df = images_to_df(test_df.index)

In [6]:
img_train_df = img_train_df.astype(int)
img_test_df = img_test_df.astype(int)

In [7]:
# Add the img_train columns to the preprocessor lists
passthrough_features = [i for i in img_train_df]

In [8]:
# Define the pipelines for each type of data we have
numeric_pipeline = make_pipeline(
    # Normalize all of them to unit norm
    Normalizer()
)

# Define our column transformer/preprocessor itself
preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    ('passthrough', passthrough_features)

)

#### Concatenated Dataframes & Column Transformations

In [9]:
# Concatenates the image data with the metadata into one dataframe
full_train_df = pd.concat([train_df, img_train_df], axis=1)
full_test_df = pd.concat([test_df, img_test_df], axis=1)

# We want to split X (big X because it is a matrix) and y from each other
full_X_train = full_train_df.drop(columns=['species'])
full_y_train = full_train_df['species']

# Funnily enough, the test has no ground truth...
full_X_test = full_test_df

# Now we want to fit our preprocessor onto our data, so we can actually transform it (then cast it to a DF)
full_X_train_transformed = pd.DataFrame(preprocessor.fit_transform(full_X_train))
full_X_test_transformed = pd.DataFrame(preprocessor.transform(full_X_test))

In [10]:
print(image_data_train.shape)
print(image_data_test.shape)
print(full_y_train.shape)

NameError: name 'image_data_train' is not defined

### Save the pre-processed data to csv for access in other jupyter notebooks

In [11]:
full_X_train_transformed.to_csv('CSV/pre-processed/full_X_train_transformed')
full_y_train.to_csv('CSV/pre-processed/full_y_train')


full_X_test_transformed.to_csv('CSV/pre-processed/full_X_test_transformed')


### Models

In [92]:
# Let's define a pipeline for all preprocessing and processing
np.random.seed()

full_pipeline = make_pipeline(
    preprocessor,
    DummyClassifier(
        strategy = "uniform",
        random_state = np.random.randint(0, 256)
    )
)

# cv_scores = cross_val_score(full_pipeline, full_X_train_transformed, full_y_train, cv=5)

In [93]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

##### Manual Dummy Classifier

In [94]:
# Dummy classifer that randomly guesses the species a leaf belongs to
np.random.seed()
dummy = DummyClassifier(
    strategy="uniform",
    random_state=np.random.randint(0, 256)
)
dummy.fit(full_X_train_transformed, full_y_train)

In [95]:
cv_scores = cross_val_score(dummy, full_X_train_transformed, full_y_train, cv=5)

In [96]:
print(cv_scores)

[0.03535354 0.         0.02020202 0.         0.01515152]


## CSV Output

In [97]:
# Predict probabilities for each class
proba = pd.DataFrame(
    dummy.predict_proba(full_X_test_transformed),
    columns=dummy.classes_,
    index=test_df.index   
)


In [98]:
# Output to CSV for submission
proba.to_csv('checkpoint1.csv')