In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc,roc_curve,roc_auc_score
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [3]:
df=pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
dfr=pd.get_dummies(df,columns=['workclass','marital-status','occupation','relationship','race',
                               'gender','native-country'])
dfr.drop(['education'],axis=1,inplace=True)
dfr.replace({'income':{'<=50K': 0,'>50K':1}},inplace=True)

In [5]:
dfr.drop(['income'],axis=1).corrwith(dfr['income']).sort_values(ascending=False)[:20]

marital-status_Married-civ-spouse    0.445853
relationship_Husband                 0.403791
educational-num                      0.332613
age                                  0.230369
hours-per-week                       0.227687
capital-gain                         0.223013
gender_Male                          0.214628
occupation_Exec-managerial           0.210938
occupation_Prof-specialty            0.188793
capital-loss                         0.147554
workclass_Self-emp-inc               0.139596
relationship_Wife                    0.120484
race_White                           0.083710
workclass_Federal-gov                0.062112
workclass_Local-gov                  0.034576
native-country_United-States         0.032551
workclass_Self-emp-not-inc           0.027190
occupation_Protective-serv           0.024873
occupation_Sales                     0.023977
native-country_India                 0.022361
dtype: float64

In [6]:
X=dfr.drop(['income'],axis=1)
y=dfr.income

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
def mod_eval(models):
    aucs=[]
    for item in models:
        print(item)
        item.fit(X_train,y_train)
        y_pred=item.predict(X_test)
        aucs.append(roc_auc_score(y_test,y_pred))
        print(roc_auc_score(y_test,y_pred))
    return aucs

In [9]:
mods = [
    GaussianNB(),
    LogisticRegression(),
    RandomForestClassifier(),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier()
]

In [10]:
evals=mod_eval(mods)

GaussianNB(priors=None, var_smoothing=1e-09)
0.6306159587222901
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.6177350517455096
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.7856535385765521
KNeighborsClassifier(algo