In [103]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer 
from sklearn.svm import OneClassSVM
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


In [79]:
def get_dataDF():
    cat_features = [
    "Workclass",  "Education",
    "Education-Num", "Marital Status","Occupation", 
    "Relationship", "Race", "Sex",  "Country"]


    numeric_features = ["Age", "Capital Loss","Hours per week", "fnlwgt", "Capital Gain"]
    feature_names = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status","Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss","Hours per week", "Country"]

    df = pd.read_csv('./adult.data', header=None, names= feature_names + ['label'],index_col=None)
    labels = df.loc[:,'label'].values
    df = df.replace(to_replace=' ?', value=None)
    df = df.dropna()
    le = LabelEncoder()
    le.fit(labels)
    enc_labels = le.transform(labels)
    df.loc[:,'label'] = enc_labels
    label_classes = le.classes_

    return df, numeric_features, cat_features, label_classes

In [80]:
df, numeric_features, cat_features, label_classes = get_dataDF()

In [81]:
len(df), Counter(df['label'])

(32561, Counter({0: 24720, 1: 7841}))

In [82]:
categorical_names = {}
for feature in cat_features:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(df.loc[:, feature])
    df.loc[:, feature] = le.transform(df.loc[:, feature])
    categorical_names[feature] = le.classes_

In [83]:
categorical_names

{'Workclass': array([' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
       dtype=object),
 'Education': array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object),
 'Education-Num': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 'Marital Status': array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
        ' Married-spouse-absent', ' Never-married', ' Separated',
        ' Widowed'], dtype=object),
 'Occupation': array([' Adm-clerical', ' Armed-Forces', ' Craft-repair',
        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',

In [84]:
categorical_catLabels = {k: list(range(len(v))) for k,v in categorical_names.items()}

df_class0 = df.loc[df['label']==0]
df_class1 = df.loc[df['label']==1]
for cat, cat_lables in categorical_catLabels.items():
    df_class0[cat].astype(CategoricalDtype(cat_lables))
    df_class1[cat].astype(CategoricalDtype(cat_lables))
    

In [85]:
df.loc[df['label']==1]['Capital Gain'].describe(), df.loc[df['label']==0]['Capital Gain'].describe()

(count     7841.000000
 mean      4006.142456
 std      14570.378951
 min          0.000000
 25%          0.000000
 50%          0.000000
 75%          0.000000
 max      99999.000000
 Name: Capital Gain, dtype: float64,
 count    24720.000000
 mean       148.752468
 std        963.139307
 min          0.000000
 25%          0.000000
 50%          0.000000
 75%          0.000000
 max      41310.000000
 Name: Capital Gain, dtype: float64)

In [86]:
df.loc[df['label']==0]['Capital Loss'].describe(), df.loc[df['label']==1]['Capital Loss'].describe()

(count    24720.000000
 mean        53.142921
 std        310.755769
 min          0.000000
 25%          0.000000
 50%          0.000000
 75%          0.000000
 max       4356.000000
 Name: Capital Loss, dtype: float64,
 count    7841.000000
 mean      195.001530
 std       595.487574
 min         0.000000
 25%         0.000000
 50%         0.000000
 75%         0.000000
 max      3683.000000
 Name: Capital Loss, dtype: float64)

In [87]:
df_class0

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,label
0,39,6,77516,9,12,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,12,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,8,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,6,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,12,2,9,5,2,0,0,0,40,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32553,32,3,116138,12,13,4,12,1,1,1,0,0,11,35,0
32555,22,3,310152,15,9,4,10,1,4,1,0,0,40,38,0
32556,27,3,257302,7,11,2,12,5,4,0,0,0,38,38,0
32558,58,3,151910,11,8,6,0,4,4,0,0,0,40,38,0


In [88]:
df_class0["Workclass"] 

0        6
1        5
2        3
3        3
4        3
        ..
32553    3
32555    3
32556    3
32558    3
32559    3
Name: Workclass, Length: 24720, dtype: int64

In [75]:
pd.get_dummies(df["Workclass"],prefix='Workclass')

Unnamed: 0,Workclass_0,Workclass_1,Workclass_2,Workclass_3,Workclass_4,Workclass_5,Workclass_6,Workclass_7
0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0
32557,0,0,0,1,0,0,0,0
32558,0,0,0,1,0,0,0,0
32559,0,0,0,1,0,0,0,0


In [124]:
# For AD create train data
ad_train_df = df_class0.copy()
ad_test_df = df_class1.copy()
tmp = ad_train_df.append(ad_test_df,ignore_index=True)
for cat in cat_features:
    tmp = pd.concat(
        [tmp,
         pd.get_dummies(tmp[cat], 
                        prefix=cat
                       )],axis=1
    ).drop([cat],axis=1)  
    
ad_train_df = tmp.loc[tmp['label']==0]
ad_test_df = tmp.loc[tmp['label']==1]   

In [129]:
ad_test_df.head(10)

Unnamed: 0,Age,fnlwgt,Capital Gain,Capital Loss,Hours per week,label,Workclass_0,Workclass_1,Workclass_2,Workclass_3,...,Country_31,Country_32,Country_33,Country_34,Country_35,Country_36,Country_37,Country_38,Country_39,Country_40
0,0.479452,0.134036,0.0,0.0,0.44898,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.191781,0.022749,0.340934,0.0,0.5,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.342466,0.099947,0.125345,0.0,0.397959,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.273973,0.182135,0.0,0.0,0.806122,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.178082,0.087619,0.0,0.0,0.397959,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.315068,0.074359,0.0,0.0,0.397959,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,0.356164,0.190088,0.0,0.0,0.44898,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0.315068,0.123089,0.0,0.0,0.602041,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,0.534247,0.138932,0.0,0.0,0.397959,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0.506849,0.114048,0.0,0.0,0.602041,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [126]:
# ---------------------------------
# Set up Standardscalers for eachnumerical feature
# ---------------------------------
numFeature_Scalers = {}


for num_f in numeric_features:
    obj= MinMaxScaler()
    obj.fit(df_class0.loc[:,num_f].values.reshape([-1,1]))
    ad_test_df.loc[:,num_f]= obj.transform( ad_test_df.loc[:,num_f].values.reshape([-1,1])).reshape(-1)
    ad_train_df.loc[:,num_f]= obj.transform( ad_train_df.loc[:,num_f].values.reshape([-1,1])).reshape(-1)
    numFeature_Scalers[num_f] = obj

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [127]:
ad_test_df = ad_test_df.reset_index(drop=True)
ad_train_df = ad_train_df.reset_index(drop=True)


In [130]:
len(ad_test_df)

7841

In [131]:
ad_train_df = ad_train_df.sample(frac=1)
ad_test_df_C1 = ad_test_df.copy()
test_len = int(0.25*len(ad_test_df_C1))

In [132]:
ad_test_df_C0 = ad_train_df.head(test_len)
ad_train_df_C0 = ad_train_df.tail(len(ad_train_df)-test_len)

In [133]:
combined_test = ad_test_df_C0.append(ad_test_df_C1,ignore_index=True)

In [134]:
test_Y = combined_test['label'].values.reshape(-1)

In [135]:

LABEL_COL = 'label'
combined_test = combined_test.drop([LABEL_COL],axis=1) 
ad_train_df_C0 = ad_train_df_C0.drop([LABEL_COL],axis=1) 

In [136]:
len(ad_train_df_C0)

22760

In [137]:
ocsvm_obj = OneClassSVM()

In [138]:
ocsvm_obj.fit(
    ad_train_df_C0.values
)

OneClassSVM()

In [140]:
result = ocsvm_obj.predict(combined_test.values)

In [141]:
groundtruth = []
for _ in test_Y:
    if _ == 1: groundtruth.append(-1)
    else: groundtruth.append(1)
        

In [143]:
from sklearn.metrics import accuracy_score
accuracy_score(groundtruth, result)

0.5249464340373431

In [144]:
probs = ocsvm_obj.score_samples(combined_test.values)

In [146]:
from sklearn.metrics import precision_recall_fscore_support
probs

array([3115.54069858, 3435.53202754, 3441.72372696, ..., 3148.70379588,
       3455.24777154, 2772.07340454])

In [147]:
from sklearn.ensemble import IsolationForest

In [157]:
clf = IsolationForest().fit(ad_train_df_C0.values)

In [158]:
result = clf.predict(combined_test.values)

In [159]:
accuracy_score(groundtruth, result)

0.20018365472910926

In [161]:
min(result)

-1