In [1]:
import pandas as pd
import numpy as np

from os import listdir

from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

### util functions

In [2]:
def read_data(file):
    df = pd.read_csv(file)
    
    ## discard some useless columns
    X = df[df.columns[3:-1]].values
    X = X.astype(float)
    y = df[df.columns[-1]].values
    return X, y

def feature_selection(X, y):
    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    print("feature importance:", clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    X_selected = model.transform(X)
    print("shape of X after selection:", X_selected.shape)
    return X_selected

### def1

In [3]:
X, y = read_data('data/labeled/def1.csv')

## standarlize
X_scaled = preprocessing.scale(X)

## split train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y)
print(y_test.sum() / y_train.sum())

# param_grid = [{'gamma': np.arange(0.005, 0.015, 0.005), 'C': np.arange(2300.0, 2500.0, 50.0)}]
param_grid = [{'gamma': np.arange(0.001, 0.006, 0.001), 'C': np.arange(2300.0, 2400.0, 10.0)}]

clf = GridSearchCV(svm.SVC(), param_grid, cv=StratifiedKFold(), scoring='roc_auc')
clf.fit(X_train, y_train)

print("Best estimator found:", clf.best_estimator_)
print("Best parameters found:", clf.best_params_)
print("Best score found:", clf.best_score_)

0.257575757576
Best estimator found: SVC(C=2360.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0040000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best parameters found: {'C': 2360.0, 'gamma': 0.0040000000000000001}
Best score found: 0.553967637076


In [4]:
classifier = svm.SVC(C=2360, gamma=0.004, probability=True)
classifier.fit(X_train, y_train)
prob = classifier.predict_proba(X_test)[:, 1]
print("roc_auc_score:", roc_auc_score(y_test, prob))

roc_auc_score: 0.509022732599


### def2

In [None]:
X, y = read_data('data/labeled/def2.csv')

## standarlize
X_scaled = preprocessing.scale(X)

## split train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y)

# param_grid = [{'gamma': np.arange(0.005, 0.015, 0.005), 'C': np.arange(2300.0, 2500.0, 50.0)}]
param_grid = [{'gamma': np.arange(0.001, 0.006, 0.001), 'C': np.arange(2300.0, 2400.0, 10.0)}]

clf = GridSearchCV(svm.SVC(), param_grid, cv=StratifiedKFold(), scoring='roc_auc')
clf.fit(X_train, y_train)

print("Best estimator found:", clf.best_estimator_)
print("Best parameters found:", clf.best_params_)
print("Best score found:", clf.best_score_)

0.24882629108


In [42]:
classifier = svm.SVC(C=2360, gamma=0.005, probability=True)
classifier.fit(X_train, y_train)
prob = classifier.predict_proba(X_test)[:, 1]
print("roc_auc_score:", roc_auc_score(y_test, prob))

0.257575757576
Best estimator found: SVC(C=2370.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0030000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best parameters found: {'gamma': 0.0030000000000000001, 'C': 2370.0}
Best score found: 0.63469162007


### def3

In [None]:
X, y = read_data('data/labeled/def2.csv')

## standarlize
X_scaled = preprocessing.scale(X)

## split train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y)

# param_grid = [{'gamma': np.arange(0.005, 0.015, 0.005), 'C': np.arange(2300.0, 2500.0, 50.0)}]
param_grid = [{'gamma': np.arange(0.001, 0.006, 0.001), 'C': np.arange(2300.0, 2400.0, 10.0)}]

clf = GridSearchCV(svm.SVC(), param_grid, cv=StratifiedKFold(), scoring='roc_auc')
clf.fit(X_train, y_train)

print("Best estimator found:", clf.best_estimator_)
print("Best parameters found:", clf.best_params_)
print("Best score found:", clf.best_score_)

In [47]:
classifier = svm.SVC(C=2360, gamma=0.005, probability=True)
classifier.fit(X_train, y_train)
prob = classifier.predict_proba(X_test)[:, 1]
print("roc_auc_score:", roc_auc_score(y_test, prob))

### def4

In [None]:
X, y = read_data('data/labeled/def2.csv')

## standarlize
X_scaled = preprocessing.scale(X)

## split train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y)

# param_grid = [{'gamma': np.arange(0.005, 0.015, 0.005), 'C': np.arange(2300.0, 2500.0, 50.0)}]
param_grid = [{'gamma': np.arange(0.001, 0.006, 0.001), 'C': np.arange(2300.0, 2400.0, 10.0)}]

clf = GridSearchCV(svm.SVC(), param_grid, cv=StratifiedKFold(), scoring='roc_auc')
clf.fit(X_train, y_train)

print("Best estimator found:", clf.best_estimator_)
print("Best parameters found:", clf.best_params_)
print("Best score found:", clf.best_score_)

In [None]:
classifier = svm.SVC(C=2360, gamma=0.005, probability=True)
classifier.fit(X_train, y_train)
prob = classifier.predict_proba(X_test)[:, 1]
print("roc_auc_score:", roc_auc_score(y_test, prob))