In [7]:
import numpy as np
import pandas as pd
import sklearn as learn
import tensorflow as tf
    

In [94]:
class Model:
    """Small helper that loads a CSV file and prepares it for scikit-learn.

    The instance only stores a display name and the path of the CSV file;
    the file itself is read again on every method call that needs data.
    """

    def __init__(self, model_name, file_name):
        """Remember the model's name and the CSV path. Nothing is read here.

        Args:
            model_name: Free-form label for this model.
            file_name: Path (or anything ``pd.read_csv`` accepts) of the data.
        """
        self.model_name = model_name
        self.file_name = file_name

    def create_dataset(self):
        """Read ``self.file_name`` with ``pd.read_csv`` and return the DataFrame."""
        return pd.read_csv(self.file_name)

    def get_columns_name(self):
        """Return the dataset's column names as a list, in file order."""
        return list(self.create_dataset().columns)

    def create_features_labels(self, index_of_labels=-1):
        """Split the dataset into a feature frame and a label series.

        Args:
            index_of_labels: Positional index of the label column
                (default ``-1``, i.e. the last column).

        Returns:
            ``(features, labels)`` where ``labels`` is the selected column and
            ``features`` holds every other column, in their original order.
        """
        df = self.create_dataset()
        labels = df.iloc[:, index_of_labels]
        # Drop by name instead of using Index.difference(): difference()
        # returns a *sorted* index, which silently reordered the feature
        # columns alphabetically and made them disagree with
        # get_columns_name().
        features = df.drop(labels.name, axis=1)
        return features, labels

    def create_train_test_set(self, ratio=0.2, index_of_labels=-1, random_state=None):
        """Split features and labels into train and test subsets.

        Args:
            ratio: Passed to ``train_test_split`` as ``test_size``
                (default ``0.2``).
            index_of_labels: Positional index of the label column, forwarded
                to ``create_features_labels`` (default ``-1``).
            random_state: Optional seed forwarded to ``train_test_split`` so
                the split can be reproduced. ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        # sklearn.cross_validation was removed in scikit-learn 0.20; prefer
        # the current location and fall back only on very old installs.
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split
        X, y = self.create_features_labels(index_of_labels)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, y, test_size=ratio, random_state=random_state)
        return X_train, X_test, Y_train, Y_test
    
    

    

In [95]:
# Wrap train.csv in a Model instance labelled "ania".
a = Model(model_name="ania", file_name="train.csv")

In [97]:
# Load the CSV behind `a` into a DataFrame (the file is read from disk here).
df = a.create_dataset()



In [113]:
# Split into train/test features and labels with the method's default ratio (0.2).
# No seed is passed, so the split (and every score below) can differ between runs.
X_train, X_test, Y_train, Y_test = a.create_train_test_set()

In [110]:
# Construct some pipelines
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import svm



def _scaled_pca_pipeline(classifier):
    """Return a Pipeline: standard scaling -> 2-component PCA -> `classifier`.

    A fresh scaler and PCA instance is created on every call, so the
    resulting pipelines share no state.
    """
    steps = [
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', classifier),
    ]
    return Pipeline(steps)


# Same preprocessing for all three; only the final estimator differs.
pipe_lr = _scaled_pca_pipeline(LogisticRegression(random_state=42))

pipe_svm = _scaled_pca_pipeline(svm.SVC(random_state=42))

pipe_dt = _scaled_pca_pipeline(tree.DecisionTreeClassifier(random_state=42))

In [111]:
# Show the logistic-regression pipeline's repr to inspect its steps and parameters.
pipe_lr

Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [122]:

pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Dictionary of pipelines and classifier types for ease of reference
# (keys are positions in `pipelines`).
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, Y_train)

# Score each fitted pipeline exactly once. Every .score() call re-runs
# prediction over the whole test set, so the values are reused below instead
# of being recomputed for the printout and again for the comparison.
test_scores = [fitted.score(X_test, Y_test) for fitted in pipelines]

# Compare accuracies
for idx, acc in enumerate(test_scores):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], acc))

# Identify the most accurate model on test data.
# max() returns the first maximum, so ties resolve to the earliest pipeline,
# and best_pipe is always a pipeline object (never a placeholder string),
# even if every accuracy is 0.0.
# NOTE(review): choosing the winner on the test split makes best_acc an
# optimistic estimate; consider a validation split or cross-validation.
best_clf = max(range(len(test_scores)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])



Logistic Regression pipeline test accuracy: 0.932
Support Vector Machine pipeline test accuracy: 0.932
Decision Tree pipeline test accuracy: 0.917
Classifier with best accuracy: Logistic Regression
