# Leaf Classification

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the training and test tables; the first CSV column becomes the index.
read_options = {'index_col': 0}
df_train = pd.read_csv('../datasets/train.csv', **read_options)
df_test = pd.read_csv('../datasets/test.csv', **read_options)

In [None]:
# Display the (rows, columns) shape of the training and test tables.
tuple(frame.shape for frame in (df_train, df_test))

In [None]:
# Check each table for missing values.  Reducing the underlying NumPy array
# always yields a single bool per table, whereas np.any() applied to a DataFrame
# defers to DataFrame.any(), whose result has not been consistent across pandas
# versions (per-column Series vs. scalar).
df_train.isnull().values.any(), df_test.isnull().values.any()

In [None]:
# Display the column names of the training table as a plain list.
list(df_train.columns)

In [None]:
# Number of distinct values in the 'species' column.
len(df_train['species'].unique())

In [None]:
# Number of training rows per species, ordered by species label.
species_counts = df_train['species'].value_counts()
species_counts.sort_index()

In [None]:
def plot_distributions(data, var_groups=None):
    """Plot the distribution of every variable in a data set as horizontal box plots.

    Input:
        data        Input data set.
        var_groups  Groups of variables (names) to plot together in the same figure.
                    When None, all columns of `data` are drawn in a single figure.

    Output:
        (None)

    NOTE(review): this relies on `sns.plt`; newer seaborn releases reportedly no
    longer expose that attribute -- confirm against the pinned seaborn version.
    """
    # Pair every frame to draw with the number of variables it holds.  The
    # generator keeps the per-group column selection lazy, so earlier groups are
    # still plotted even if a later group names a missing column.
    if var_groups is None:
        panels = [(data, data.shape[1])]
    else:
        panels = ((data[group], len(group)) for group in var_groups)

    for frame, n_vars in panels:
        # Half an inch per variable, but never a zero-height figure (which
        # happens for groups of fewer than two variables).
        height = max(1, n_vars // 2)
        sns.plt.figure(figsize=(10, height))
        sns.boxplot(data=frame, orient='horizontal')
        sns.plt.show()

In [None]:
def plot_correlations(data, ignore_vars=None):
    """Plot the correlation map between every pair of numeric variables in a data set.

    Input:
        data         Input data set.
        ignore_vars  Names of variables to ignore (default: ignore nothing).

    Output:
        corr  The correlation matrix.

    NOTE(review): this relies on `sns.plt`; newer seaborn releases reportedly no
    longer expose that attribute -- confirm against the pinned seaborn version.
    """
    # Avoid a shared mutable default argument.
    if ignore_vars is None:
        ignore_vars = []

    # Restrict to numeric/boolean columns explicitly so that a leftover text
    # column cannot make corr() fail on pandas versions that no longer drop
    # non-numeric data silently.
    numeric = data.drop(ignore_vars, axis=1).select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()

    sns.plt.figure(figsize=(10, 8))
    sns.heatmap(corr)

    return corr

In [None]:
# Column names of the three feature families; each is numbered 1 through 64.
feature_ids = range(1, 65)

margin_vars = ['margin%d' % idx for idx in feature_ids]
shape_vars = ['shape%d' % idx for idx in feature_ids]
texture_vars = ['texture%d' % idx for idx in feature_ids]

# One entry per figure drawn by plot_distributions.
var_groups = [
    margin_vars,
    shape_vars,
    texture_vars,
]

In [None]:
# Draw one box-plot figure per feature family of the training table.
plot_distributions(data=df_train, var_groups=var_groups)

In [None]:
# Heat map of the feature correlations; the 'species' label column is left out.
corr = plot_correlations(data=df_train, ignore_vars=['species'])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Split the training table into a feature matrix and a label vector; every
# column of the test table is used as a feature.
train_features = df_train.drop('species', axis=1)
X_train, y_train = train_features.values, df_train['species'].values

X_test = df_test.values

In [None]:
# Feature standardizer (default settings) applied to both the training and the
# test feature matrices.
scaler = StandardScaler()

In [None]:
# Learn the per-feature scaling statistics from the training set only, then
# apply those same statistics to the test set.  Re-fitting on the test data
# would standardize it differently from the data the models are trained on.
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Fit the label encoder on the training species names; it is used below to turn
# them into integer ids and, later, to map predictions back to names.
encoder = LabelEncoder().fit(y_train)

In [None]:
# Convert the training labels to the integer ids assigned by the fitted encoder;
# the bare name on the last line displays the encoded array in the notebook.
y_train_enc = encoder.transform(y_train)
y_train_enc

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:
# Untuned model templates, keyed by the short name that is reused for the
# parameter grids and the output file names.
models = dict(
    logreg=LogisticRegression(
        multi_class='multinomial',
        solver='newton-cg',
        random_state=42,
    ),
    mlp=MLPClassifier(
        activation='tanh',
        solver='lbfgs',
        random_state=42,
    ),
)

In [None]:
# Hyper-parameter candidates for the grid search, keyed like `models`.
param_grids = {
    'logreg': {
        'C': [1000, 2000, 4000, 6000],
    },
    'mlp': {
        # One hidden layer of the given width.  Each entry is a 1-tuple: the
        # trailing comma matters, because `(300)` is just the integer 300.
        'hidden_layer_sizes': [(300,), (600,), (900,)],
    },
}

In [None]:
# 5-fold stratified splitter shared by the grid search and the learning curves.
# The seed only takes effect when shuffling is enabled; recent scikit-learn
# releases reject `random_state` combined with the default shuffle=False.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Tune every model template with a cross-validated grid search (scored by
# negative log loss) and keep the best estimator found under the same key.
banner = '-------------------------------------------------------------------------------'

# Iterate over a snapshot because the dict values are replaced inside the loop.
for key, estimator in list(models.items()):
    print(banner)
    print('Training model: %s' % key)
    print(banner)

    gs = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[key],
        scoring='neg_log_loss',
        cv=kfold,
        n_jobs=1,
        verbose=1,
    )
    gs.fit(X_train_sc, y_train_enc)

    # Replace the template with the winner selected by the search.
    models[key] = gs.best_estimator_

    print('Best score: %g' % gs.best_score_)
    print('Best parameters:', gs.best_params_)
    print()

In [None]:
from copy import deepcopy

In [None]:
def plot_learning_curves(estimators, title, X, y, scoring=None, cv=None,
                         train_sizes=np.linspace(0.1, 1, 5), ylim=None, n_jobs=-1):
    """Plots training and cross-validation scores for increasingly bigger
    portions of the data set, showing the learning progress.

    Each estimator gets one colour: a dashed line for the training score, a
    solid line for the cross-validation score, and a shaded band of one
    standard deviation (across folds) around each line.

    Inputs:
        estimators    List of models to evaluate.
        title         Graph title.
        X             Training/validation samples (independent variables).
        y             Training/validation targets (dependent variables).
        scoring       Scoring function that evaluates the models.
        cv            Cross-validation object or number of folds.
        train_sizes   Portions of the training set used in the evaluations.
        ylim          Limits of the displayed y values in the graph.
        n_jobs        Number of jobs passed on to `learning_curve`.

    Outputs:
        None

    NOTE(review): this relies on `sns.plt`; newer seaborn releases reportedly no
    longer expose that attribute -- confirm against the pinned seaborn version.
    """
    from sklearn.model_selection import learning_curve

    palette = sns.color_palette()

    for i, estimator in enumerate(estimators):
        # Store the returned sizes under a separate name so that the caller's
        # `train_sizes` is passed unchanged to every estimator instead of being
        # replaced by the first estimator's result.
        sizes, train_scores, valid_scores = learning_curve(
            estimator, X, y, cv=cv, scoring=scoring,
            train_sizes=train_sizes, n_jobs=n_jobs)

        # Scores have one row per training size and one column per fold.
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        valid_scores_mean = np.mean(valid_scores, axis=1)
        valid_scores_std = np.std(valid_scores, axis=1)

        # Wrap around the palette so that more estimators than colours do not
        # raise an IndexError.
        color = palette[i % len(palette)]
        name = type(estimator).__name__

        sns.plt.plot(sizes, train_scores_mean, 'o--', color=color,
                     label=name + ' (train)')
        sns.plt.fill_between(sizes, train_scores_mean - train_scores_std,
                             train_scores_mean + train_scores_std,
                             alpha=0.1, color=color)

        sns.plt.plot(sizes, valid_scores_mean, 'o-', color=color,
                     label=name + ' (cv)')
        sns.plt.fill_between(sizes, valid_scores_mean - valid_scores_std,
                             valid_scores_mean + valid_scores_std,
                             alpha=0.1, color=color)

    if ylim is not None:
        sns.plt.ylim(*ylim)

    sns.plt.title(title)
    sns.plt.grid(True)
    sns.plt.legend(loc='best')
    sns.plt.show()

In [None]:
# Plot accuracy learning curves for deep copies of the current models, so the
# originals stored in `models` are not touched by the evaluation.
model_copies = list(map(deepcopy, models.values()))

plot_learning_curves(
    model_copies,
    'Learning Curves',
    X_train_sc,
    y_train_enc,
    scoring='accuracy',
    cv=kfold,
    train_sizes=np.linspace(0.1, 1, 10),
    n_jobs=1,
)

In [None]:
import os

# Write two CSV files per model: the class-probability table
# (submission_<key>.csv) and the predicted species names (prediction_<key>.csv).
# Create the output directory up front so that to_csv() cannot fail on a
# missing folder after all the training work has been done.
os.makedirs('../submissions', exist_ok=True)

for key, model in models.items():
    print('Predicting with model: %s' % key)

    y_test_proba = model.predict_proba(X_test_sc)
    y_test_label = encoder.inverse_transform(model.predict(X_test_sc))

    # predict_proba() orders its columns like model.classes_, so the column
    # names are derived from that attribute rather than from the training labels.
    df_test_proba = pd.DataFrame(y_test_proba,
                                 index=df_test.index,
                                 columns=encoder.inverse_transform(model.classes_))
    df_test_label = pd.DataFrame(y_test_label,
                                 index=df_test.index,
                                 columns=['species'])

    df_test_proba.to_csv('../submissions/submission_%s.csv' % key, index_label='id')
    df_test_label.to_csv('../submissions/prediction_%s.csv' % key, index_label='id')