In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [2]:
# Read the leaf-classification training and test tables from local CSV files.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine that has the files at exactly these locations.
train_csv = '/Users/dominicdebiaso/Development/datasets/kaggle_leaf_train.csv'
test_csv = '/Users/dominicdebiaso/Development/datasets/kaggle_leaf_test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

### Data Processing

In [3]:
# Preview the first five training rows to check the column layout.
df_train.head(n=5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [4]:
## Split data
# Features: every column except the row identifier (and, for training, the label).
X_train = df_train.drop(labels=['id', 'species'], axis=1)
# Target: the species names as a flat 1-D array.
y_train = np.asarray(df_train['species'])
# The test table has no label column, so only the identifier is removed.
X_test = df_test.drop(labels=['id'], axis=1)

# Sanity check: both feature matrices should have the same number of columns.
for split in (X_train, y_train, X_test):
    print(split.shape)

(990, 192)
(990,)
(594, 192)


In [5]:
## Scale features
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Convert target feature into numerical format
# `le` is kept so that the integer codes can be mapped back to species names
# (le.classes_) when the submission table is built.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

In [6]:
## Dimensionality Reduction
# PCA re-expresses the correlated input features as orthogonal, linearly
# uncorrelated principal components. The first component captures the largest
# share of the variance. (Uncorrelated is not the same as statistically
# independent in general.)
pca = PCA(
    svd_solver='full',   # run the exact full SVD
    n_components='mle',  # Minka's MLE guesses the number of components to keep
)
var_reduction = pca.fit_transform(X_train)
for matrix in (X_train, var_reduction):
    print(matrix.shape)
# The MLE estimate drops a single component here, so almost nothing is reduced.
# NOTE(review): var_reduction is not used by the later cells visible in this
# notebook; the models below are fitted on the scaled X_train instead.

(990, 192)
(990, 191)


### Model Building

In [33]:
## Logistic Regression
# Generalized liner model (target values are expected to be in linear combination of input variables) for
# classification, where the logistic (ie. sigmoid) function is fitted on data to describe
# probability for an outcome at each trial
lreg = LogisticRegression()
param_grid = {
    'C':np.arange(0.1,5.0,0.5).tolist(),
    'solver':['newton-cg', 'lbfgs', 'sag'],
    'multi_class':['ovr', 'multinomial']
}
lreg_grid = GridSearchCV(lreg, param_grid, cv=10, scoring='neg_log_loss') #logloss to evaluate confidence of preds
                                                                          #value closer to zero reduces residual error
%time lreg_grid.fit(X_train, y_train)
print(lreg_grid.best_score_)
print(lreg_grid.best_params_)

-0.056488023139
{'multi_class': 'multinomial', 'C': 4.6, 'solver': 'lbfgs'}


In [34]:
## K-Nearest Neighbors
# Find predefined number of training samples closest in distance to the new point,
# and predict the label form these
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors':range(5,50,5),
    'leaf_size':range(10,80,10)
}
knn_grid = GridSearchCV(knn, param_grid, cv=10, scoring='neg_log_loss')
%time knn_grid.fit(X_train, y_train)
print(knn_grid.best_score_)
print(knn_grid.best_params_)

-0.194581643404
{'n_neighbors': 5, 'leaf_size': 10}


In [35]:
## Random Forest
# A high number of decision trees are built and trained and the mode of the results is given in the output
rfc = RandomForestClassifier()
param_grid = {
    'n_estimators':range(50,700,200),
    'max_depth':range(1,5,2),
    'min_samples_split':range(1,50,15),
    'min_samples_leaf':range(1,50,15)  #smaller values tend to capture noise
}
rfc_grid = GridSearchCV(rfc, param_grid, cv=10, scoring='neg_log_loss')
%time rfc_grid.fit(X_train, y_train)
print(rfc_grid.best_score_)
print(rfc_grid.best_params_)

-3.23041839142
{'min_samples_split': 31, 'n_estimators': 450, 'max_depth': 3, 'min_samples_leaf': 46}


In [57]:
## Ensemble with VotingClassifier
# Combine the best estimator found by each grid search. With voting='soft' the
# members' predicted class probabilities are combined using the weights below.
ensemble_members = [
    ('logistic', lreg_grid.best_estimator_),
    ('knn', knn_grid.best_estimator_),
    ('random_forest', rfc_grid.best_estimator_),
]
# Weights follow the member order above; [0.7, 0.2, 0.1] was an earlier
# alternative that is no longer used.
eclf = VotingClassifier(estimators=ensemble_members, voting='soft', weights=[20, 4, 1])
# Mean negated log loss over 10 folds (closer to zero is better).
fold_scores = cross_val_score(eclf, X_train, y_train, cv=10, scoring='neg_log_loss')
fold_scores.mean()

-0.10011466839774015

In [65]:
# Calibrate the ensemble, predict class probabilities for the test set and
# build the submission table (one row per test id, one column per species).
#
# The isinstance guard keeps this cell idempotent: `eclf` is rebound to the
# calibrated wrapper, so without the guard every re-run of the cell would wrap
# the previous calibrator in yet another CalibratedClassifierCV.
if not isinstance(eclf, CalibratedClassifierCV):
    eclf = CalibratedClassifierCV(eclf, cv=10)
eclf.fit(X_train, y_train)
y_pred_proba = eclf.predict_proba(X_test)
# Columns are labelled with the original species names via the label encoder.
df_submission = pd.DataFrame(y_pred_proba, index=df_test['id'], columns=le.classes_)
# df_submission.to_csv('/Users/dominicdebiaso/Desktop/kaggle_leaf_classification_ensemble.csv')