In [133]:
import pandas as pd
import numpy as np
import pylab as pl
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from sklearn.neighbors import LocalOutlierFactor

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [134]:
# Load the raw data set; "train.csv" is a relative path, so it is resolved
# against the notebook's current working directory.
df_train = pd.read_csv("train.csv")

In [135]:
# Drop categorical (object-dtype) columns in which a single value accounts for
# more than 90% of all rows: such columns carry almost no information.
dominance_threshold = int(0.9 * df_train.index.size)

noninformative_columns = []
for name in df_train.columns:
    if df_train[name].dtype == 'object':
        s = df_train[name].value_counts()
        # value_counts() sorts by frequency (descending), so the first entry is
        # the most common value. It is indexed by the category labels, hence
        # positional access must go through .iloc; a column that is entirely
        # NaN yields an empty result and is simply kept.
        if not s.empty and s.iloc[0] > dominance_threshold:
            noninformative_columns.append(name)

df_train = df_train.drop(columns=noninformative_columns)
noninformative = len(noninformative_columns)
print(str(noninformative) + " признаков неинформативны, осталось " + str(len(df_train.columns)) + " признаков")

7 признаков неинформативны, осталось 35 признаков


In [136]:
# Encode the target column as integers ('-50000' -> 0, '50000+' -> 1), pull it
# out as the label vector and remove it from the feature frame.
df_train = df_train.replace(to_replace={'CLASS': {'-50000':0, '50000+':1}})
# The np.int alias was removed from NumPy; the builtin int is the equivalent dtype.
Y_train = df_train['CLASS'].values.astype(int)
df_train = df_train.drop(['CLASS'], axis=1)
print(Y_train)

[1 0 1 ..., 0 0 1]


In [137]:
from copy import deepcopy


def dummy_encode_categorical_columns(df, categorical_columns):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each column listed in ``categorical_columns`` is removed and replaced by
    its indicator columns, which are appended on the right and named
    ``"<column>: <value>"``. The input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    categorical_columns : iterable
        Labels of the columns to expand, processed in iteration order.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    encoded = deepcopy(df)
    for col in categorical_columns:
        indicators = pd.get_dummies(encoded[col], prefix=col, prefix_sep=': ')
        # Remove the source column and append its indicator columns at the end.
        encoded = pd.concat([encoded.drop(columns=[col]), indicators], axis=1)
    return encoded

In [138]:
# One-hot encode every remaining object-dtype column and build the feature matrix.
categorical_columns = set(df_train.columns[df_train.dtypes == 'object'])
# Iterating a set of strings yields a different order from one kernel session
# to the next, which would shuffle the feature columns of X_train between runs.
# Encode in the frame's own column order so the matrix layout is reproducible.
ordered_categorical_columns = [name for name in df_train.columns if name in categorical_columns]
df_no_cat = dummy_encode_categorical_columns(df_train, ordered_categorical_columns)
X_train=df_no_cat.values.astype(np.float32)
print(X_train.shape)

(10000, 306)


In [3]:
def cross_validate(X, y, clf, splits=4, sel_feat=False, del_anom=False):
    """Score ``clf`` with K-fold cross-validation and return per-fold accuracy.

    Parameters
    ----------
    X : array
        Feature matrix; indexed by row with the fold index arrays.
    y : array
        Labels aligned with the rows of ``X``.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refit on every fold, so it is left fitted on the last fold's data.
    splits : int, default 4
        Number of folds passed to ``KFold``.
    sel_feat : bool, default False
        If true, run recursive feature elimination (``RFECV`` around a
        ``DecisionTreeClassifier``) on each training fold and keep only the
        selected features in both the training and the test fold.
    del_anom : bool, default False
        If true, drop the training rows that ``LocalOutlierFactor`` does not
        label ``1`` before fitting. The test fold is never filtered.

    Returns
    -------
    list of float
        Accuracy on the held-out part of each fold, in fold order.
    """
    scores = []
    kf = KFold(n_splits=splits)
    # Split the data that was actually passed in, not a notebook-level global,
    # so the fold indices always match the rows of X and y.
    for train, test in kf.split(X):
        x_train, x_test, y_train, y_test = X[train], X[test], y[train], y[test]
        if sel_feat:
            estimator = DecisionTreeClassifier()
            selector = RFECV(estimator, step=0.1, n_jobs=-1)
            # The selector is fit on the training fold only; the test fold is
            # reduced to the same feature subset through the fitted selector.
            x_train = selector.fit_transform(x_train, y_train)
            x_test = selector.transform(x_test)

        if del_anom:
            lof = LocalOutlierFactor()
            # Keep only the rows labelled 1 by fit_predict.
            keep = lof.fit_predict(x_train)
            x_train = x_train[keep==1,:]
            y_train = y_train[keep==1]

        score = accuracy_score(y_test, clf.fit(x_train, y_train).predict(x_test))
        scores.append(score)
    return scores

    

In [2]:
# Compare 4-fold accuracy with and without per-fold feature selection for
# each classifier; the labels are printed right before the matching scores.
print("FEATURE SELECTION")
for clf, with_label, without_label in (
    (svm.SVC(),
     "svm.SVC, 4 fold cross validation, with feature selection",
     "svm.SVC, 4 fold cross validation, without feature selection"),
    (DecisionTreeClassifier(),
     "Decision tree, 4 fold cross validation, with feature selection",
     "Decision tree, 4 fold cross validation, without feature selection"),
):
    print(with_label)
    print(cross_validate(X_train, Y_train, clf, sel_feat=True))
    print(without_label)
    print(cross_validate(X_train, Y_train, clf))

FEATURE SELECTION


NameError: name 'svm' is not defined

In [142]:
# Compare 4-fold accuracy with and without outlier removal on the training
# folds for each classifier; labels are printed right before the scores.
print("ANOMALY DETECTION")

for clf, with_label, without_label in (
    (svm.SVC(),
     "svm.SVC, 4 fold cross validation, with anomaly detection",
     "svm.SVC, 4 fold cross validation, without anomaly detection"),
    (DecisionTreeClassifier(),
     "Decision tree, 4 fold cross validation, with anomaly detection",
     "Decision tree, 4 fold cross validation, without anomaly detection"),
):
    print(with_label)
    print(cross_validate(X_train, Y_train, clf, del_anom=True))
    print(without_label)
    print(cross_validate(X_train, Y_train, clf))





ANOMALY DETECTION
svm.SVC, 4 fold cross validation, with anomaly detection
[0.76080000000000003, 0.75639999999999996, 0.73839999999999995, 0.75960000000000005]
svm.SVC, 4 fold cross validation, without anomaly detection
[0.76400000000000001, 0.75800000000000001, 0.74239999999999995, 0.76519999999999999]
Decision tree, 4 fold cross validation, with anomaly detection
[0.7964, 0.8196, 0.81440000000000001, 0.81799999999999995]
Decision tree, 4 fold cross validation, without anomaly detection
[0.8004, 0.8256, 0.80720000000000003, 0.82320000000000004]
