In [7]:
# ==========================
# LOADING DATASETS
# ==========================
import pandas as pd
import numpy as np

df_train = pd.read_csv('./Databases/train.csv', sep=";")
df_test = pd.read_csv('./Databases/test.csv', sep=";", header=None)

# ==========================
# SUPPORT LIBRARIES
# ==========================
import os.path
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import matthews_corrcoef, accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer

# ==========================
# SUPPORT FUNCTIONS 
# ==========================

def processDF(df):
  myLE = LabelEncoder()
  for columnName in df.columns:
    if columnName == "duration":
      df.drop(columnName, 1)
    elif df[columnName].dtype == np.object:
      myLE.fit(df[columnName].drop_duplicates())
      df[columnName] = myLE.transform(df[columnName])
  return df 

def predictCSV(model, df, path):
    df_pred = pd.DataFrame({'Id':range(1, len(df)+1), 'prediction': model.predict(df)})
    df_pred.to_csv(path, index=False)
    return

# ==========================
# DATA PREPROCESSING
# ==========================

# remove 'duration' and encode features
df_train = processDF(df_train)
df_test = processDF(df_test)

# Set up X and Y datasets
X = df_train.iloc[:,0:20]
Y = df_train["y"]

# Generate test and split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=69)

# Classifiers
clfs = {
    'mlp' : MLPClassifier(hidden_layer_sizes=(100,20,50)),
    'knn' : KNeighborsClassifier(n_neighbors=7),
    'rfc' : RandomForestClassifier(n_estimators=50),
    'gbc' : GradientBoostingClassifier(n_estimators=30)
}
pipe = Pipeline([
    ('scaling', None),
    ('scaling2', None),
    ('reduce_dim',None),
    ('skb',None),
    ('classify',None)
])
param_grid = [
    {
        'scaling': [MinMaxScaler(), MaxAbsScaler(), RobustScaler()],
        'scaling2': [RobustScaler(), Normalizer()],
        'reduce_dim': [PCA(svd_solver='randomized')],
        'skb': [SelectKBest(f_classif, k=8)],
        'classify': [clfs['mlp']]
    },
    {
        'scaling': [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(), Normalizer()],
        'reduce_dim': [PCA(svd_solver='randomized')],
        'skb': [SelectKBest(f_classif)],
        'skb__k': range(7,9),
        'classify': [clfs['knn'],clfs['rfc'],clfs['gbc']]
    },
    {
        'scaling': [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(), Normalizer()],
        'reduce_dim': [PCA(svd_solver='randomized')],
        'skb': [SelectKBest(f_classif)],
        'skb__k': range(7,9),
        'classify' : [VotingClassifier([\
                                       ('mlp',clfs['mlp']),('knn',clfs['knn']),\
                                       ('rfc',clfs['rfc']),('gbc',clfs['gbc'])\
                                       ], voting='hard')]
    },
    {
        'scaling': [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(), Normalizer()],
        'reduce_dim': [PCA(svd_solver='randomized')],
        'skb': [SelectKBest(f_classif, k=9)],
        'classify' : [VotingClassifier([\
                                       ('mlp',clfs['mlp']),('knn',clfs['knn']),\
                                       ('rfc',clfs['rfc']),('gbc',clfs['gbc'])\
                                       ], voting='soft')]
    }
]

# ==========================
# CLASSIFICATION MODEL
# ==========================
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid, scoring=make_scorer(matthews_corrcoef))
grid.fit(X,Y)

# Best model for the data
model = grid.best_estimator_

# Test model
model.fit(X_train, Y_train)
print(matthews_corrcoef(Y_test, grid.predict(X_test)))
print(matthews_corrcoef(Y, cross_val_predict(model, X, Y, cv=10)))

# Build the prediction
testing_ds = df_test
model.fit(X,Y)
predictCSV(model, testing_ds, 'prediction.csv')

0.502026608115
0.511678734609
