In [None]:
import pandas as pd
# Training data: the first CSV column becomes the row index.
input_data = pd.read_csv('train.csv', index_col=0)

# Submission features: read the same way, then one-hot encode via get_dummies.
submission_raw = pd.read_csv('test.csv', index_col=0)
X_submission = pd.get_dummies(submission_raw)

### Let's see what the data looks like

In [None]:
# Preview the first rows of the training frame.
input_data.head()

#### Is there any missing data?

In [None]:
# Count missing values per column (isna is the same check as isnull).
input_data.isna().sum()

##### One problem has been solved by itself  :)

#### How are the types distributed?

In [None]:
# Number of rows per class label in the 'type' column.
input_data['type'].value_counts()

## Data Preprocessing

### Encoding class labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels in 'type' as integers.
# Work on a copy: a plain `le_data = input_data` would alias the frame, so the
# assignment below would overwrite input_data['type'] in place and re-running
# this cell would refit the encoder on already-encoded integers.
class_le = LabelEncoder()
le_data = input_data.copy()
le_data['type'] = class_le.fit_transform(input_data['type'])
le_data.head()

### Encoding nominal features

In [None]:
# One-hot encode the frame with get_dummies, then move the 'type' column to
# the last position: pop removes it, the assignment re-appends it at the end.
le_data = pd.get_dummies(le_data)
typ = le_data.pop('type')
le_data['type'] = typ
le_data.head()

### Partitioning the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are the first 10 columns; the target is the last column.
X, y = le_data.iloc[:, :10], le_data.iloc[:, -1]

# Hold out 30% of the rows for testing.
# - random_state pins the split so that the hyper-parameters chosen further
#   down the notebook refer to the same partition on every run.
# - stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1)

In [None]:
# Class balance of the training partition.
y_train.value_counts()

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# the same fitted scaler to both partitions.
stdsc_1 = StandardScaler().fit(X_train)
X_train_stand = stdsc_1.transform(X_train)
X_test_stand = stdsc_1.transform(X_test)

## Feature extraction: PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

%matplotlib inline

# Fit PCA on the standardized training features and project both partitions.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_stand)
X_test_pca = pca.transform(X_test_stand)

# Scree plot: individual explained-variance ratio per component (bars) and the
# cumulative ratio (step line). The ratios are shown here rather than as a
# bare expression, which would not be displayed from the middle of a cell.
component_ids = range(1, X_train_pca.shape[1] + 1)
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.show()


## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit LDA on the standardized training features (supervised: uses y_train),
# then project both partitions with the fitted model.
lda = LDA().fit(X_train_stand, y_train)
X_train_lda = lda.transform(X_train_stand)
X_test_lda = lda.transform(X_test_stand)

In [None]:
# Explained-variance ratio of each fitted LDA component.
lda.explained_variance_ratio_

## Definition of some useful functions

In [None]:
# Cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

def stratified_k_fold_val(estimator, X, y, cv=10):
    """Cross-validate `estimator` on (X, y) and print the score summary.

    Runs `cross_val_score` with `cv` folds on all available cores, then prints
    the mean and standard deviation of the per-fold scores. Returns nothing.

    Parameters
    ----------
    estimator : estimator object passed through to `cross_val_score`.
    X, y : features and targets passed through to `cross_val_score`.
    cv : cross-validation setting passed through to `cross_val_score`
        (default 10).
    """
    fold_scores = cross_val_score(estimator, X, y, cv=cv, n_jobs=-1)
    summary = (np.mean(fold_scores), np.std(fold_scores))
    print('mean CV accuracy %.3f, std CV %.3f' % summary)
    
    
# Validation curve
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, param_name, param_range, X, y, cv=10, xscale='log'):
    """Plot training and validation scores as one hyper-parameter varies.

    Calls `validation_curve` for `param_name` over `param_range` and draws,
    for both the training and the validation scores, the mean across folds as
    a line and +/- one standard deviation as a shaded band.

    Parameters
    ----------
    estimator : estimator object passed through to `validation_curve`.
    param_name : str
        Name of the hyper-parameter being varied; also used as the x-axis label.
    param_range : sequence
        Values of the hyper-parameter to evaluate (x-axis positions).
    X, y : features and targets passed through to `validation_curve`.
    cv : cross-validation setting passed through to `validation_curve`
        (default 10).
    xscale : str, default 'log'
        Scale handed to `plt.xscale`. The default reproduces the previous
        hard-coded behaviour; pass 'linear' for evenly spaced ranges.
    """
    train_scores, test_scores = validation_curve(estimator=estimator,
                                                 X=X,
                                                 y=y,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv,
                                                 n_jobs=-1)

    # One entry per curve: fold scores plus the line styling for plt.plot.
    curves = (
        (train_scores, dict(color='blue', marker='o', markersize=5,
                            label='training accuracy')),
        (test_scores, dict(color='green', linestyle='--', marker='s', markersize=5,
                           label='validation accuracy')),
    )
    for fold_scores, line_style in curves:
        # Rows are parameter values, columns are folds: aggregate across folds.
        mean = np.mean(fold_scores, axis=1)
        std = np.std(fold_scores, axis=1)
        plt.plot(param_range, mean, **line_style)
        plt.fill_between(param_range, mean + std, mean - std,
                         alpha=0.15, color=line_style['color'])

    plt.grid()
    plt.xscale(xscale)
    plt.legend(loc='lower right')
    # Label the axis with the actual parameter so consecutive plots can be
    # told apart.
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    plt.ylim([0.2, 1.0])
    plt.show()

    
# Learning curve
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv=10):
    """Plot training and validation scores against the training-set size.

    Calls `learning_curve` with ten evenly spaced training-size fractions from
    0.1 to 1.0 and draws, for both the training and the validation scores, the
    mean across folds as a line and +/- one standard deviation as a band.

    Parameters
    ----------
    estimator : estimator object passed through to `learning_curve`.
    X, y : features and targets passed through to `learning_curve`.
    cv : cross-validation setting passed through to `learning_curve`
        (default 10).
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator, X=X, y=y, train_sizes=fractions,
        cv=cv, n_jobs=-1, random_state=1)

    def draw_curve(fold_scores, **line_style):
        # Rows are training sizes, columns are folds: aggregate across folds.
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.plot(train_sizes, centre, **line_style)
        plt.fill_between(train_sizes, centre + spread, centre - spread,
                         alpha=0.15, color=line_style['color'])

    draw_curve(train_scores, color='blue', marker='o', markersize=5,
               label='training accuracy')
    draw_curve(test_scores, color='green', linestyle='--', marker='s', markersize=5,
               label='validation accuracy')

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.2, 1.0])
    plt.show()
    

# recursive elimination
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


def recursive_elimination(estimator, X, y, step=1, cv=3):
    """Run recursive feature elimination with cross-validation and report it.

    Fits an `RFECV` selector (accuracy scoring, `StratifiedKFold(cv)` splits),
    plots the cross-validation score of every evaluated feature subset, and
    prints the selected number of features and the feature ranking.

    Parameters
    ----------
    estimator : estimator object passed through to `RFECV`.
    X, y : features and targets the selector is fitted on.
    step : passed through to `RFECV` (default 1).
    cv : int
        Number of folds for `StratifiedKFold` (default 3).
    """
    rfecv = RFECV(estimator=estimator, step=step, cv=StratifiedKFold(cv), scoring='accuracy')
    rfecv.fit(X, y)

    # `grid_scores_` no longer exists on RFECV in newer scikit-learn releases;
    # the mean score per subset is exposed through `cv_results_` instead.
    # Fall back to the old attribute so older installations keep working.
    if hasattr(rfecv, 'cv_results_'):
        mean_scores = rfecv.cv_results_['mean_test_score']
    else:
        mean_scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores
    # NOTE(review): the x positions are 1..len(scores); that presumably matches
    # the feature count only when step == 1 - confirm before using other steps.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()

    print("Optimal number of features : %d" % rfecv.n_features_)
    print('ranking: %s' % (rfecv.ranking_))

    
# grid search    
from sklearn.model_selection import GridSearchCV

def grid_search(X, y, X_te, y_te, n_neighbors, weights, p):
    """Grid-search a scaled KNN pipeline and report how the best setting does.

    Builds a StandardScaler -> KNeighborsClassifier pipeline, searches the
    given grid with 10-fold cross-validation (accuracy), then configures the
    pipeline with the best parameters, cross-validates it on (X, y), refits it
    on (X, y) and prints its score on (X_te, y_te). Returns nothing.

    `Pipeline`, `StandardScaler` and `KNeighborsClassifier` are looked up when
    the function is called, so they must have been imported by then.

    Parameters
    ----------
    X, y : data used for the search, the cross-validation and the final fit.
    X_te, y_te : held-out data used only for the final score.
    n_neighbors, weights, p : sequences of candidate values for the
        corresponding KNeighborsClassifier parameters.
    """
    knn = Pipeline([('scl', StandardScaler()),
                    ('clf', KNeighborsClassifier(n_jobs=-1))])

    param_grid = [{'clf__n_neighbors': n_neighbors,
                   'clf__weights': weights,
                   'clf__p': p}]

    gs = GridSearchCV(estimator=knn,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs.fit(X, y)

    # Apply the winning parameters to the pipeline. Plain attribute assignment
    # such as `knn.clf__p = ...` only creates an unused attribute on the
    # Pipeline object; nested 'step__param' names must go through set_params,
    # otherwise the scores below are those of a default KNN.
    knn.set_params(**gs.best_params_)

    print('best params:', gs.best_params_)
    print('best score:', gs.best_score_)
    stratified_k_fold_val(knn, X, y)
    knn.fit(X, y)
    print('test set score:', knn.score(X_te, y_te))

# Classifiers

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Candidate values for the KNN hyper-parameter searches below.
n_neighbors = list(range(1, 5))       # 1..4 neighbours
weights = ['uniform', 'distance']
p = [1, 2]                            # Minkowski power: 1 = Manhattan, 2 = Euclidean

## Classic

### set up features

In [None]:
# "Classic" feature set: only the first four standardized columns.
X_train_cla, X_test_cla = X_train_stand[:, :4], X_test_stand[:, :4]

### Grid search - classic

In [None]:
# Search the KNN grid on the classic features; report CV and hold-out scores.
grid_search(X_train_cla, y_train, X_test_cla, y_test,
            n_neighbors=n_neighbors, weights=weights, p=p)

### Def knn - classic

In [None]:
# Hand-set KNN configuration used for the classic feature set.
knn = KNeighborsClassifier(n_neighbors=4, weights='distance', p=2, n_jobs=-1)

### n_neighbors - best param - classic

In [None]:
# Validation curve over the number of neighbours (classic features).
plot_validation_curve(estimator=knn,
                      param_name='n_neighbors',
                      param_range=n_neighbors,
                      X=X_train_cla,
                      y=y_train,
                      cv=10)

### p - best param - classic

In [None]:
# Pin the neighbour count, then sweep the Minkowski power p.
knn.set_params(n_neighbors=4)
plot_validation_curve(estimator=knn, param_name='p', param_range=p,
                      X=X_train_cla, y=y_train, cv=10)

### weights - best param - classic

In [None]:
# Pin p, then cross-validate once per weighting scheme.
knn.set_params(p=2)

for w in weights:
    knn.set_params(weights=w)
    stratified_k_fold_val(knn, X_train_cla, y_train)

### Learning curve with best params - classic

In [None]:
# Use distance weighting and plot the learning curve on the classic features.
knn.set_params(weights='distance')
plot_learning_curve(estimator=knn, X=X_train_cla, y=y_train)

### Test - classic

In [None]:
# Fit on the training partition and score on the hold-out partition
# (fit returns the estimator itself, so the calls can be chained).
knn.fit(X_train_cla, y_train).score(X_test_cla, y_test)

### Submission - classic

In [None]:
# Refit on every labelled row, using the first four feature columns only.
# NOTE(review): X and X_submission are not standardized here, whereas the
# tuning cells above worked on X_train_stand - confirm this is intended.
X_cla = X.iloc[:, :4]
X_submission_cla = X_submission.iloc[:, :4]

knn.fit(X_cla, y)

# Predict integer classes, map them back to the original labels, and write
# them out with the submission index under the column name 'id'.
predict = class_le.inverse_transform(knn.predict(X_submission_cla))
submission = pd.Series(predict, index=X_submission.index, name='type')
submission.to_csv('knn.csv', header=True, index_label='id')

## PCA

In [None]:
# Wider neighbour range for the PCA-projected features.
n_neighbors = list(range(1, 9))       # 1..8 neighbours
weights = ['uniform', 'distance']
p = [1, 2]

grid_search(X_train_pca, y_train, X_test_pca, y_test,
            n_neighbors=n_neighbors, weights=weights, p=p)

### Def knn - PCA

In [None]:
# Hand-set KNN configuration used for the PCA-projected features.
knn = KNeighborsClassifier(n_neighbors=6, weights='uniform', p=2, n_jobs=-1)

### n_neighbors - best param - PCA

In [None]:
# Validation curve over the number of neighbours (PCA features).
plot_validation_curve(estimator=knn,
                      param_name='n_neighbors',
                      param_range=n_neighbors,
                      X=X_train_pca,
                      y=y_train,
                      cv=10)

### p - best param - PCA

In [None]:
# Pin the neighbour count, then sweep the Minkowski power p.
knn.set_params(n_neighbors=3)
plot_validation_curve(estimator=knn, param_name='p', param_range=p,
                      X=X_train_pca, y=y_train, cv=10)

### weights - best param - PCA

In [None]:
# Pin p, then cross-validate once per weighting scheme.
knn.set_params(p=1)

for w in weights:
    knn.set_params(weights=w)
    stratified_k_fold_val(knn, X_train_pca, y_train)

### Learning curve with best params - PCA

In [None]:
# Use distance weighting and plot the learning curve on the PCA features.
knn.set_params(weights='distance')
plot_learning_curve(estimator=knn, X=X_train_pca, y=y_train)

### Test - PCA

In [None]:
# Fit on the training partition and score on the hold-out partition
# (fit returns the estimator itself, so the calls can be chained).
knn.fit(X_train_pca, y_train).score(X_test_pca, y_test)

### Submission - PCA

## LDA

In [None]:
# Neighbour range for the LDA-projected features; weights and p are reused
# from the cells above.
n_neighbors = list(range(1, 9))       # 1..8 neighbours

grid_search(X_train_lda, y_train, X_test_lda, y_test,
            n_neighbors=n_neighbors, weights=weights, p=p)

### Def knn - LDA

In [None]:
# Hand-set KNN configuration used for the LDA-projected features.
knn = KNeighborsClassifier(n_neighbors=6, weights='uniform', p=2, n_jobs=-1)

### n_neighbors - best param - LDA

In [None]:
# Validation curve over the number of neighbours (LDA features).
plot_validation_curve(estimator=knn,
                      param_name='n_neighbors',
                      param_range=n_neighbors,
                      X=X_train_lda,
                      y=y_train,
                      cv=10)

### p - best param - LDA

In [None]:
# Pin the neighbour count, then sweep the Minkowski power p.
knn.set_params(n_neighbors=6)
plot_validation_curve(estimator=knn, param_name='p', param_range=p,
                      X=X_train_lda, y=y_train, cv=10)

### weights - best param - LDA

In [None]:
# Pin p, then cross-validate once per weighting scheme.
knn.set_params(p=1)

for w in weights:
    knn.set_params(weights=w)
    stratified_k_fold_val(knn, X_train_lda, y_train)

### Learning curve with best params - LDA

In [None]:
# Use distance weighting and plot the learning curve on the LDA features.
knn.set_params(weights='distance')
plot_learning_curve(estimator=knn, X=X_train_lda, y=y_train)

### Test - LDA

In [None]:
# Fit on the training partition and score on the hold-out partition
# (fit returns the estimator itself, so the calls can be chained).
knn.fit(X_train_lda, y_train).score(X_test_lda, y_test)

### Submission - LDA