In [None]:
import pandas as pd
# Training frame; the first CSV column becomes the row index.
input_data = pd.read_csv('train.csv', index_col=0)

# Submission features: read the same way, then passed through pd.get_dummies.
X_submission = pd.read_csv('test.csv', index_col=0).pipe(pd.get_dummies)

### Let's see what the data looks like

In [None]:
# Preview the first rows of the training frame.
input_data.head()

#### Is there any missing data?

In [None]:
# Number of missing entries in each column.
input_data.isna().sum()

##### One problem has been solved by itself  :)

#### How are the types distributed?

In [None]:
# How often each value of the `type` column occurs.
input_data['type'].value_counts()

## Data Preprocessing

### Encoding class labels

In [None]:
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()

# Work on a copy. A plain `le_data = input_data` is only an alias, so writing the
# encoded column would overwrite the string labels in `input_data`; re-running this
# cell would then fit the encoder on integer codes and `class_le.inverse_transform`
# could no longer map predictions back to the original class names.
le_data = input_data.copy()
le_data['type'] = class_le.fit_transform(input_data['type'])
le_data.head()

### I. Partitioning the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are the first five columns; the target is the last column.
X_org, y_org = le_data.iloc[:, :5], le_data.iloc[:, -1]

# 70/30 split. The seed is fixed so that the same rows land in each part on every
# run (an unseeded split makes every downstream score change between runs).
X_org_train, X_org_test, y_org_train, y_org_test = train_test_split(
    X_org, y_org, test_size=0.3, random_state=1)

### Encoding nominal features

In [None]:
# Expand the frame with pd.get_dummies.
le_data = pd.get_dummies(le_data)

# Move the target to the last position: take the column out, then append it again.
typ = le_data.pop('type')
le_data['type'] = typ
le_data.head()

### II. Partitioning the dataset into training and test sets

In [None]:
# Features are the first ten columns; the target is the last column.
X, y = le_data.iloc[:, :10], le_data.iloc[:, -1]

# 70/30 split. The seed is fixed so that the hyper-parameter plots and scores in
# the following cells are reproducible from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [None]:
# Class frequencies in the training part of the split.
y_train.value_counts()

## Definition of some useful functions

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

def stratified_k_fold_val(estimator, X, y, cv=10):
    """Cross-validate ``estimator`` on ``(X, y)`` and print a one-line summary.

    Scoring is delegated to ``cross_val_score`` (run with ``n_jobs=-1``); the
    mean and standard deviation of the returned scores are printed. Nothing is
    returned.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``cross_val_score``.
    X, y : array-like
        Data and targets forwarded to ``cross_val_score``.
    cv : int, default 10
        Forwarded as the ``cv`` argument of ``cross_val_score``.
    """
    fold_scores = cross_val_score(estimator=estimator, X=X, y=y, cv=cv, n_jobs=-1)
    summary = (np.mean(fold_scores), np.std(fold_scores))
    print('mean CV accuracy %.3f, std CV %.3f' % summary)
    
    
# Validation curve
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, param_name, param_range, X, y, cv=10):
    """Plot training and validation scores of ``estimator`` over ``param_range``.

    Scores come from ``validation_curve`` (run with ``n_jobs=-1``). For every
    parameter value the mean score across folds is drawn as a line and a band
    of one standard deviation around it is shaded. The x-axis is logarithmic
    and is labelled with ``param_name`` so that plots for different parameters
    can be told apart.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``validation_curve``.
    param_name : str
        Name of the estimator parameter that is varied.
    param_range : sequence
        Values tried for ``param_name``; also used as x coordinates.
    X, y : array-like
        Data and targets forwarded to ``validation_curve``.
    cv : int, default 10
        Forwarded as the ``cv`` argument of ``validation_curve``.
    """
    train_scores, test_scores = validation_curve(estimator=estimator,
                                                 X=X,
                                                 y=y,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv,
                                                 n_jobs=-1)

    # Rows correspond to parameter values, columns to CV folds.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                     alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s',
             markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    # Label the axis with the parameter actually being varied instead of a
    # generic placeholder, so each figure stands on its own.
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    plt.ylim([0.2, 1.0])
    plt.show()

    
# Learning curve
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv=10):
    """Plot training and validation scores of ``estimator`` against training-set size.

    ``learning_curve`` is evaluated at ten relative training sizes evenly spaced
    from 0.1 to 1.0 (``n_jobs=-1``, ``random_state=1``). For each size the mean
    score across folds is drawn as a line with a shaded band of one standard
    deviation around it.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    X, y : array-like
        Data and targets forwarded to ``learning_curve``.
    cv : int, default 10
        Forwarded as the ``cv`` argument of ``learning_curve``.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator=estimator,
        X=X,
        y=y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv,
        n_jobs=-1,
        random_state=1,
    )

    def draw_curve(scores, label, color, **line_style):
        # Mean line plus a +/- one standard deviation band; folds are on axis 1.
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, color=color, markersize=5, label=label, **line_style)
        plt.fill_between(sizes, centre + spread, centre - spread, alpha=0.15, color=color)

    draw_curve(fit_scores, 'training accuracy', 'blue', marker='o')
    draw_curve(val_scores, 'validation accuracy', 'green', linestyle='--', marker='s')

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.2, 1.0])
    plt.show()
    

# recursive elimination
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


def recursive_elimination(estimator, X, y, step=1, cv=3):
    """Run recursive feature elimination with cross-validation and report the result.

    An ``RFECV`` selector (accuracy scoring, ``StratifiedKFold(cv)`` splitter) is
    fitted on ``(X, y)``. The mean cross-validation score per candidate feature
    count is plotted, then the selected number of features and the feature
    ranking are printed. Nothing is returned.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``RFECV``.
    X, y : array-like
        Data and targets the selector is fitted on.
    step : int or float, default 1
        Forwarded as the ``step`` argument of ``RFECV``.
    cv : int, default 3
        Number of folds passed to ``StratifiedKFold``.
    """
    rfecv = RFECV(estimator=estimator, step=step, cv=StratifiedKFold(cv), scoring='accuracy')
    rfecv.fit(X, y)

    # `grid_scores_` was deprecated and later removed from RFECV; current
    # releases expose the per-step mean score through `cv_results_`. Fall back
    # to the old attribute only when `cv_results_` is not available.
    cv_results = getattr(rfecv, 'cv_results_', None)
    if cv_results is not None:
        mean_scores = cv_results['mean_test_score']
        # Newer releases also record the feature count of each step, which is
        # the correct x-axis when `step` is not 1.
        n_features = cv_results.get('n_features')
    else:
        mean_scores = rfecv.grid_scores_
        n_features = None
    if n_features is None:
        n_features = range(1, len(mean_scores) + 1)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    # Scoring is 'accuracy', i.e. a fraction, not a count of correct predictions.
    plt.ylabel("Cross validation score (accuracy)")
    plt.plot(n_features, mean_scores)
    plt.show()

    print("Optimal number of features : %d" % rfecv.n_features_)
    print('ranking: %s' % (rfecv.ranking_))

    
# grid search    
from sklearn.model_selection import GridSearchCV

def grid_search(X, y, X_te, y_te, n_estimators, max_features, max_depth):
    """Grid-search a random forest, then re-evaluate it with the best parameters.

    A ``RandomForestClassifier`` (entropy criterion) is tuned with a 10-fold
    ``GridSearchCV`` on accuracy over the given candidate lists. The best
    parameter combination and its score are printed, the forest is
    cross-validated again via ``stratified_k_fold_val``, refitted on ``(X, y)``
    and finally scored on ``(X_te, y_te)``. Nothing is returned.

    Parameters
    ----------
    X, y : array-like
        Data used for the search, the extra cross-validation and the final fit.
    X_te, y_te : array-like
        Data used only for the final ``score`` call.
    n_estimators, max_features, max_depth : list
        Candidate values for the forest parameters of the same name.
    """
    forest = RandomForestClassifier(criterion='entropy', n_jobs=-1)

    search_space = [{
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
    }]

    search = GridSearchCV(estimator=forest,
                          param_grid=search_space,
                          scoring='accuracy',
                          cv=10,
                          n_jobs=-1)
    search.fit(X, y)
    best = search.best_params_

    # Copy the winning values of the three searched parameters onto the forest.
    forest.set_params(n_estimators=best['n_estimators'],
                      max_features=best['max_features'],
                      max_depth=best['max_depth'])

    print('best params:', best)
    print('best score:', search.best_score_)
    stratified_k_fold_val(forest, X, y)
    forest.fit(X, y)
    print('test set score:', forest.score(X_te, y_te))

# Classifiers

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Candidate values explored for each random-forest hyper-parameter.
n_estimators = [5, 10, 20, 50, 70]
max_features = list(range(1, 8))
max_depth = list(range(1, 5))

### Grid search

In [None]:
# Tune on the training part; the held-out part is used only for the final score.
grid_search(X=X_train, y=y_train, X_te=X_test, y_te=y_test,
            n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

### Define the random forest

In [None]:
# Baseline forest; individual parameters are revisited in the cells below.
rf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=4,
    max_features=3,
    random_state=1,  # fixed seed for repeatable fits
    n_jobs=-1,
)

### Depth - best params

In [None]:
# Validation curve over the candidate tree depths.
plot_validation_curve(estimator=rf, param_name='max_depth', param_range=max_depth,
                      X=X_train, y=y_train, cv=10)

### Features - best params

In [None]:
# Fix the depth, then sweep the number of features considered per split.
rf.set_params(max_depth=3)

plot_validation_curve(estimator=rf, param_name='max_features', param_range=max_features,
                      X=X_train, y=y_train, cv=10)

### N estimators - best params

In [None]:
# Fix the feature count, then sweep the number of trees.
rf.set_params(max_features=3)

plot_validation_curve(estimator=rf, param_name='n_estimators', param_range=n_estimators,
                      X=X_train, y=y_train, cv=10)

### Learning curve with best params - classic

In [None]:
# Fix the number of trees, then draw the learning curve on the training part.
rf.set_params(n_estimators=10)

plot_learning_curve(estimator=rf, X=X_train, y=y_train)

### CV

In [None]:
# Cross-validated accuracy of the configured forest on the training part.
stratified_k_fold_val(estimator=rf, X=X_train, y=y_train)

### Test

In [None]:
# Fit on the training part and show the score on the held-out part.
rf.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Feature importances of the forest fitted in the previous cell.
rf.feature_importances_

### Submission - classic

In [None]:
# Refit on all labelled rows before predicting the submission set.
rf.fit(X, y)

# The training and submission frames were one-hot encoded independently, so their
# columns are not guaranteed to match. Align the submission frame to the training
# feature columns (same set, same order); a dummy column missing from the test
# file is filled with 0.
X_submission_aligned = X_submission.reindex(columns=X.columns, fill_value=0)

# Map the integer predictions back to the original class names and write them out.
predict = class_le.inverse_transform(rf.predict(X_submission_aligned))
pd.Series(predict, index=X_submission.index, name='type').to_csv('rf.csv', header=True, index_label='id')