In [None]:
import pandas as pd
# Labelled training rows; the first CSV column becomes the row index.
input_data = pd.read_csv('train.csv', index_col=0)

# Submission rows are read the same way and then passed through pd.get_dummies.
X_submission = pd.read_csv('test.csv', index_col=0).pipe(pd.get_dummies)

### Let's see what the data looks like

In [None]:
# Show the first rows of the raw training frame.
input_data.head()

#### Is there any missing data?

In [None]:
# Per-column count of missing entries.
input_data.isna().sum()

##### One problem has been solved by itself  :)

#### How are the types distributed?

In [None]:
# Number of rows per value of the 'type' column (bracket access instead of
# attribute-style column lookup).
input_data['type'].value_counts()

## Data Preprocessing

### Encoding class labels

In [None]:
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()

# Work on a copy. A plain assignment would only alias `input_data`, so the
# string labels in its 'type' column would be overwritten with integer codes;
# re-running this cell would then refit the encoder on integers and
# `class_le.inverse_transform` would return codes instead of the label names.
le_data = input_data.copy()
le_data['type'] = class_le.fit_transform(input_data['type'])
le_data.head()

### Encoding nominal feature

In [None]:
# Run the frame through pd.get_dummies, then move 'type' to the last position:
# pop() removes the column in place and returns it, and assigning it back
# appends it after all remaining columns.
le_data = pd.get_dummies(le_data)
typ = le_data.pop('type')
le_data['type'] = typ
le_data.head()

### Partitioning the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the encoded target (no hard-coded column
# count); the target is selected by name rather than by position.
X, y = le_data.drop(columns='type'), le_data['type']

# Fixed seed: later cells hard-code hyper-parameters and feature indices that
# were chosen from this split, so the split has to be reproducible.
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

In [None]:
# Rows per encoded class in the training partition.
y_train.value_counts()

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler statistics are learned from the training partition only and then
# applied unchanged to both partitions.
stdsc_1 = StandardScaler().fit(X_train)
X_train_stand = stdsc_1.transform(X_train)
X_test_stand = stdsc_1.transform(X_test)

## Feature extraction with PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

%matplotlib inline

# Fit PCA on the standardized training partition and project both partitions.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_stand)
X_test_pca = pca.transform(X_test_stand)

# A bare `pca.explained_variance_ratio_` in the middle of a cell is never
# displayed, so the ratios are bound to a name and shown through the plot.
explained = pca.explained_variance_ratio_
# One x position per principal component (1-based), computed once.
component_ids = range(1, X_train_pca.shape[1] + 1)

plt.bar(component_ids, explained, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(component_ids, np.cumsum(explained), where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()


## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# LDA is fitted with the training labels on the standardized training
# partition; the fitted transformer then projects both partitions.
lda = LDA().fit(X_train_stand, y_train)
X_train_lda = lda.transform(X_train_stand)
X_test_lda = lda.transform(X_test_stand)

In [None]:
# Explained-variance ratios of the fitted LDA transformer.
lda.explained_variance_ratio_

## Definition of some useful functions

In [None]:
# Cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

def stratified_k_fold_val(estimator, X, y, cv=10):
    """Cross-validate `estimator` on (X, y) and print a one-line summary.

    The fold scores come from `cross_val_score` run with `n_jobs=-1`. No
    `scoring` argument is passed, so whichever score `cross_val_score` uses by
    default for the estimator is what gets reported under the label
    'accuracy'.

    Parameters
    ----------
    estimator : estimator object handed to `cross_val_score`.
    X, y : data and targets handed to `cross_val_score`.
    cv : value forwarded as `cv` (default 10).

    Returns
    -------
    None. The mean and standard deviation of the fold scores are printed.
    """
    fold_scores = cross_val_score(estimator=estimator, X=X, y=y, cv=cv, n_jobs=-1)
    summary = (np.mean(fold_scores), np.std(fold_scores))
    print('mean CV accuracy %.3f, std CV %.3f' % summary)
    
    
# Validation curve
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, param_name, param_range, X, y, cv=10):
    """Plot training and validation scores of `estimator` over a parameter range.

    `validation_curve` is run for `param_name` over `param_range`. For each of
    the two returned score arrays the per-value mean is drawn as a line and a
    band of one standard deviation around it is shaded. The x axis is
    log-scaled and the y axis is limited to [0.2, 1.0].

    Parameters
    ----------
    estimator : estimator object handed to `validation_curve`.
    param_name : name of the parameter that is varied.
    param_range : values tried for `param_name`; also used as x coordinates.
    X, y : data and targets handed to `validation_curve`.
    cv : value forwarded as `cv` (default 10).

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    fit_scores, val_scores = validation_curve(
        estimator=estimator,
        X=X,
        y=y,
        param_name=param_name,
        param_range=param_range,
        cv=cv,
        n_jobs=-1,
    )

    # (scores, colour, extra line options, legend text) for each curve, in draw order.
    series = (
        (fit_scores, 'blue', {'marker': 'o'}, 'training accuracy'),
        (val_scores, 'green', {'linestyle': '--', 'marker': 's'}, 'validation accuracy'),
    )
    for scores, colour, line_opts, legend_text in series:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(param_range, centre, color=colour, markersize=5, label=legend_text, **line_opts)
        plt.fill_between(param_range, centre + spread, centre - spread, alpha=0.15, color=colour)

    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter')
    plt.ylabel('Accuracy')
    plt.ylim([0.2, 1.0])
    plt.show()

    
# Learning curve
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv=10):
    """Plot training and validation scores of `estimator` against training-set size.

    `learning_curve` is run at ten training-size fractions evenly spaced from
    0.1 to 1.0, with `random_state=1`. For each of the two returned score
    arrays the per-size mean is drawn as a line and a band of one standard
    deviation around it is shaded. The y axis is limited to [0.2, 1.0].

    Parameters
    ----------
    estimator : estimator object handed to `learning_curve`.
    X, y : data and targets handed to `learning_curve`.
    cv : value forwarded as `cv` (default 10).

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator=estimator,
        X=X,
        y=y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv,
        n_jobs=-1,
        random_state=1,
    )

    # (scores, colour, extra line options, legend text) for each curve, in draw order.
    series = (
        (fit_scores, 'blue', {'marker': 'o'}, 'training accuracy'),
        (val_scores, 'green', {'linestyle': '--', 'marker': 's'}, 'validation accuracy'),
    )
    for scores, colour, line_opts, legend_text in series:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, color=colour, markersize=5, label=legend_text, **line_opts)
        plt.fill_between(sizes, centre + spread, centre - spread, alpha=0.15, color=colour)

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.2, 1.0])
    plt.show()
    

# recursive elimination
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


def recursive_elimination(estimator, X, y, step=1, cv=3):
    """Run cross-validated recursive feature elimination and report the result.

    An `RFECV` selector wrapping `estimator` is fitted on (X, y) using
    `StratifiedKFold(cv)` splits and accuracy scoring. The mean
    cross-validation score of each evaluated feature subset is plotted, then
    the selected number of features and the feature ranking are printed.

    Parameters
    ----------
    estimator : estimator object wrapped by `RFECV`.
    X, y : data and targets the selector is fitted on.
    step : value forwarded to `RFECV(step=...)` (default 1).
    cv : number of folds given to `StratifiedKFold` (default 3).

    Returns
    -------
    None. Results are plotted and printed only.
    """
    rfecv = RFECV(estimator=estimator, step=step, cv=StratifiedKFold(cv), scoring='accuracy')
    rfecv.fit(X, y)

    # `grid_scores_` no longer exists in recent scikit-learn releases; the
    # per-subset mean scores live in `cv_results_['mean_test_score']` there.
    # Fall back to the old attribute so older installations keep working.
    if hasattr(rfecv, 'cv_results_'):
        mean_scores = rfecv.cv_results_['mean_test_score']
    else:
        mean_scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores.
    # NOTE(review): the x positions 1..len(scores) equal the number of
    # selected features only when step == 1.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()

    print("Optimal number of features : %d" % rfecv.n_features_)
    print('ranking: %s' % (rfecv.ranking_))

    
# grid search    
from sklearn.model_selection import GridSearchCV

def grid_search(X, y, X_te, y_te, c, solver, multi_clas):
    """Grid-search a scaler + logistic-regression pipeline and report its scores.

    A 10-fold `GridSearchCV` (accuracy scoring) is run over the given
    candidate values. The best parameters and best score are printed, then a
    pipeline configured with those parameters is cross-validated on (X, y),
    refitted on (X, y) and scored on (X_te, y_te).

    Parameters
    ----------
    X, y : data and targets used for the search, cross-validation and final fit.
    X_te, y_te : held-out data and targets used for the final score.
    c : candidate values for the classifier's `C`.
    solver : candidate values for the classifier's `solver`.
    multi_clas : candidate values for the classifier's `multi_class`.

    Returns
    -------
    None. All results are printed.
    """
    lr = Pipeline([('scl', StandardScaler()),
                   ('clf', LogisticRegression())])

    param_grid = [{'clf__C': c,
                   'clf__solver': solver,
                   'clf__multi_class': multi_clas}]

    gs = GridSearchCV(estimator=lr,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs.fit(X, y)

    # Configure the pipeline with the winning hyper-parameters. set_params()
    # routes the 'clf__*' keys to the classifier step. Assigning attributes
    # such as `lr.clf__C = ...` would only create unused attributes on the
    # Pipeline object and leave the classifier at its defaults, so the scores
    # below would not reflect the search result.
    lr.set_params(**gs.best_params_)

    print('best params:', gs.best_params_)
    print('best score:', gs.best_score_)
    stratified_k_fold_val(lr, X, y)
    lr.fit(X, y)
    print('test set score:', lr.score(X_te, y_te))

# Classifiers

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Candidate values passed to grid_search() for the classifier's `C`.
c = [
    0.001,
    0.01,
    0.1,
    1.0,
    10,
    100,
]
# Candidate values for the classifier's `solver`.
solver = [
    'newton-cg',
    'lbfgs',
    'sag',
    'saga',
]
# Candidate values for the classifier's `multi_class`.
multi_class = [
    'ovr',
    'multinomial',
]

## Classic

### Grid search - classic

In [None]:
# Search on the raw (unscaled) partitions; the pipeline inside grid_search() does its own scaling.
grid_search(
    X=X_train,
    y=y_train,
    X_te=X_test,
    y_te=y_test,
    c=c,
    solver=solver,
    multi_clas=multi_class,
)

### Recursive elimination with best params - classic

In [None]:
# Hyper-parameters are hard-coded here (presumably copied from the grid-search
# output above; confirm after re-running the search).
re_clasic = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    C=0.1,
)
recursive_elimination(estimator=re_clasic, X=X_train_stand, y=y_train)

### Eliminate features and run grid search again - classic

In [None]:
# Keep only the first four feature columns of each partition, then repeat the
# grid search on the reduced feature set.
X_c, X_te_c = X_train.iloc[:, :4], X_test.iloc[:, :4]
grid_search(X_c, y_train, X_te_c, y_test, c, solver, multi_class)

### Learning curve with best params - classic

In [None]:
# Scaler + logistic regression with hard-coded hyper-parameters.
clasic = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(solver='newton-cg', multi_class='multinomial', C=0.1)),
])

plot_learning_curve(estimator=clasic, X=X_c, y=y_train)

### Submission - classic

In [None]:
# Refit the pipeline on every labelled row, restricted to the first four columns.
X_c_all = X.iloc[:, :4]
clasic.fit(X_c_all, y)

# Same positional slice of the submission features.
X_c_sub = X_submission.iloc[:, :4]

# Map the predicted integer codes back to the original labels and write them
# out, indexed like the submission frame.
predict_c = class_le.inverse_transform(clasic.predict(X_c_sub))
(
    pd.Series(predict_c, index=X_submission.index, name='type')
    .to_csv('lr_clasic.csv', header=True, index_label='id')
)

### AdaBoost - classic

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Base learner with the same hard-coded hyper-parameters as the classic pipeline.
clasic_2 = LogisticRegression(C=0.1, multi_class='multinomial', solver='newton-cg')

# The base learner is passed positionally: its keyword was `base_estimator` in
# older scikit-learn releases and is `estimator` in newer ones (where the old
# keyword is rejected), while the first positional slot is the same in both.
ada_boost_clasic = AdaBoostClassifier(clasic_2, n_estimators=10, random_state=1)
stratified_k_fold_val(ada_boost_clasic, X_c, y_train)

# Fit on the reduced training features and score on the matching test features.
ada_boost_clasic.fit(X_c, y_train)
print('test set score:', ada_boost_clasic.score(X_te_c, y_test))

## PCA

### Grid search - PCA

In [None]:
# Search on the PCA projections of both partitions.
grid_search(
    X=X_train_pca,
    y=y_train,
    X_te=X_test_pca,
    y_te=y_test,
    c=c,
    solver=solver,
    multi_clas=multi_class,
)

### Recursive elimination with best params - PCA

In [None]:
# Hyper-parameters are hard-coded here (presumably copied from the PCA
# grid-search output above; confirm after re-running the search).
re_pca = LogisticRegression(
    solver='newton-cg',
    multi_class='ovr',
    C=1.0,
)
recursive_elimination(estimator=re_pca, X=X_train_pca, y=y_train)

### Eliminate features and run grid search again - PCA

In [None]:
# Keep the same hard-coded subset of principal-component columns in both
# partitions, then repeat the grid search on the reduced set.
X_pca, X_te_pca = (
    projected[:, [0, 1, 2, 5, 6, 7, 8]]
    for projected in (X_train_pca, X_test_pca)
)
grid_search(X_pca, y_train, X_te_pca, y_test, c, solver, multi_class)

### Learning curve with best params - PCA

In [None]:
# Scaler + logistic regression with hard-coded hyper-parameters, evaluated on
# the reduced set of principal components.
pca_cla_1 = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(solver='newton-cg', multi_class='multinomial', C=0.1)),
])

plot_learning_curve(estimator=pca_cla_1, X=X_pca, y=y_train)

### Submission - PCA

In [None]:
# Refit the scaler and PCA on every labelled row.
stdsc_2 = StandardScaler()
X_stand_2 = stdsc_2.fit_transform(X)
pca_2 = PCA()
X_pca_all = pca_2.fit_transform(X_stand_2)

# Train on the same component columns the reduced PCA model was tuned and
# validated on. Previously the final fit and the prediction used every
# component, so the submitted model differed from the evaluated one.
pca_keep = [0, 1, 2, 5, 6, 7, 8]
pca_cla_1.fit(X_pca_all[:, pca_keep], y)

# Apply the fitted scaler and PCA to the submission features.
X_std_sub_pca = stdsc_2.transform(X_submission)
X_pca_sub = pca_2.transform(X_std_sub_pca)

# Map the predicted integer codes back to the original labels and write them out.
predict_pca = class_le.inverse_transform(pca_cla_1.predict(X_pca_sub[:, pca_keep]))
pd.Series(predict_pca, index=X_submission.index, name='type').to_csv('lr_pca.csv', header=True, index_label='id')

### AdaBoost - PCA

In [None]:
# Base learner with the same hard-coded hyper-parameters as the PCA pipeline.
pca_cla_2 = LogisticRegression(C=0.1, multi_class='multinomial', solver='newton-cg')

# The base learner is passed positionally: its keyword was `base_estimator` in
# older scikit-learn releases and is `estimator` in newer ones (where the old
# keyword is rejected), while the first positional slot is the same in both.
ada_boost_pca = AdaBoostClassifier(pca_cla_2, n_estimators=10, random_state=1)
stratified_k_fold_val(ada_boost_pca, X_pca, y_train)

# Fit on the reduced PCA training features and score on the matching test features.
ada_boost_pca.fit(X_pca, y_train)
print('test set score:', ada_boost_pca.score(X_te_pca, y_test))

## LDA

### Grid search - LDA

In [None]:
# Search on the LDA projections of both partitions.
grid_search(
    X=X_train_lda,
    y=y_train,
    X_te=X_test_lda,
    y_te=y_test,
    c=c,
    solver=solver,
    multi_clas=multi_class,
)

### Recursive elimination with best params - LDA

In [None]:
# Hyper-parameters are hard-coded here (presumably copied from the LDA
# grid-search output above; confirm after re-running the search).
re_lda = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    C=10,
)
recursive_elimination(estimator=re_lda, X=X_train_lda, y=y_train)

### Eliminate features and run grid search again - LDA

In [None]:
# No LDA columns are dropped here: the "reduced" inputs are simply the full
# LDA projections under new names.
X_lda, X_te_lda = X_train_lda, X_test_lda
grid_search(X_lda, y_train, X_te_lda, y_test, c, solver, multi_class)

### Learning curve with best params - LDA

In [None]:
# NOTE(review): this rebinds `lda`, which until now held the fitted
# LinearDiscriminantAnalysis transformer, to a scaler + logistic-regression
# pipeline. Earlier cells that read `lda` (e.g. `lda.explained_variance_ratio_`)
# will fail if re-run after this one.
lda = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(solver='newton-cg', multi_class='multinomial', C=10)),
])

plot_learning_curve(estimator=lda, X=X_lda, y=y_train)

### Submission - LDA

In [None]:
# Refit the scaler on every labelled row.
stdsc_3 = StandardScaler().fit(X)
X_stand_3 = stdsc_3.transform(X)

# Refit the LDA transformer on the standardized data, then train the `lda`
# pipeline on the resulting projection.
lda_2 = LDA().fit(X_stand_3, y)
X_lda_all = lda_2.transform(X_stand_3)
lda.fit(X_lda_all, y)

# Apply the fitted scaler and LDA transformer to the submission features.
X_std_sub_lda = stdsc_3.transform(X_submission)
X_lda_sub = lda_2.transform(X_std_sub_lda)

# Map the predicted integer codes back to the original labels and write them out.
predict_lda = class_le.inverse_transform(lda.predict(X_lda_sub))
(
    pd.Series(predict_lda, index=X_submission.index, name='type')
    .to_csv('lr_lda.csv', header=True, index_label='id')
)

### AdaBoost - LDA

In [None]:
# Base learner with the same hard-coded hyper-parameters as the LDA pipeline.
lda_cla_2 = LogisticRegression(C=10, multi_class='multinomial', solver='newton-cg')

# The base learner is passed positionally: its keyword was `base_estimator` in
# older scikit-learn releases and is `estimator` in newer ones (where the old
# keyword is rejected), while the first positional slot is the same in both.
ada_boost_lda = AdaBoostClassifier(lda_cla_2, n_estimators=10, random_state=1)
stratified_k_fold_val(ada_boost_lda, X_lda, y_train)

# Fit on the LDA training features and score on the matching test features.
ada_boost_lda.fit(X_lda, y_train)
print('test set score:', ada_boost_lda.score(X_te_lda, y_test))