In [33]:
import pandas as pd

from os import listdir

from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [38]:
files = listdir('data/labeled')
for file in files:
    df = pd.read_csv('data/labeled/' + file)
    
    ## discard first 2 columns
    X = df[df.columns[3:-1]].values
    y = df[df.columns[-1]].values
    
    ## use smote to deal with unbalance
    X_resampled, y_resampled = SMOTE().fit_sample(X, y)

    ## standarlize
    X_scaled = preprocessing.scale(X_resampled)

    ## feature selection
    clf = ExtraTreesClassifier()
    clf = clf.fit(X_scaled, y_resampled)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    X_selected = model.transform(X_scaled)
    print(X_selected.shape)
    
    ## split train sets and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y_resampled, test_size=0.2, random_state=1)
    
    ## svm
    classifier = svm.SVC(probability=True)
    classifier.fit(X_train, y_train)
        
    prob = classifier.predict_proba(X_test)[:, 1]
#     print(prob)
    print(file)
    print(roc_auc_score(y_test, prob))
    
    ## lr
#     classifier = LogisticRegression()
#     classifier.fit(X_train, y_train)
    
#     prob = classifier.predict_proba(X_test)[:, 1]
#     print(prob)
#     print(roc_auc_score(y_test, prob))

[ 0.308849    0.27254997  0.1341851   0.28441594]
(2556, 3)
def6.csv
0.70303058048
[ 0.34040458  0.26625408  0.10727896  0.28606238]
(2434, 3)
def4.csv
0.738443109731
[ 0.3321689   0.2454441   0.13639484  0.28599216]
(2432, 2)
def5.csv
0.686761428861
[ 0.32751737  0.25234424  0.12676424  0.29337415]
(2510, 3)
def1.csv
0.746185599746
[ 0.32440482  0.28796605  0.10921649  0.27841264]
(2144, 3)
def2.csv
0.656957013575
[ 0.3122735   0.30422817  0.11306807  0.27043026]
(2520, 3)
def3.csv
0.679957440101
[ 0.32671776  0.29840086  0.13617175  0.23870964]
(2644, 2)
def7.csv
0.837394537395
