In [19]:
# Notebook modified from Kaggle: https://www.kaggle.com/shahishita23/income-classification-88-roc-auc-score

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Silencing warnings 

import warnings
warnings.filterwarnings('ignore')
pd.set_option('mode.chained_assignment',None) #Silencing the Setting with Copying Warning

%matplotlib inline

In [17]:
filepath = 'adult.csv'
df = pd.read_csv(filepath)

In [18]:
cols_df = pd.DataFrame(df.dtypes)
num_cols = list(cols_df[cols_df[0]=='int64'].index)
cat_cols = list(cols_df[cols_df[0]=='object'].index)[:-1] #excluding target column of income 
print('Numeric variables includes:','\n',num_cols)
print('\n')
print('Categorical variables includes','\n',cat_cols)

Numeric variables includes: 
 ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']


Categorical variables includes 
 ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']


In [62]:
from collections import Counter
Counter(df['race'])

Counter({'White': 25933,
         'Black': 2817,
         'Asian-Pac-Islander': 895,
         'Other': 231,
         'Amer-Indian-Eskimo': 286})

In [20]:
#Replacing '?' string with NaN values
df.replace(to_replace='?',value=np.nan,inplace=True)

In [22]:
df.dropna(axis=0,inplace=True)

In [23]:
df['income'].replace(to_replace='<=50K',value=0,inplace=True)
df['income'].replace(to_replace='>50K',value=1,inplace=True)

In [24]:
#Identifying categorical columns where more than 90% of observations belong only to one categroy

cat_drop = []
for i in cat_cols:
    if (df[i].value_counts(normalize=True)[0]) > 0.9:
        cat_drop.append(i)
        
print(cat_drop)

['native.country']


In [25]:
#Similarly for numerical columns

num_drop = []
for i in num_cols:
    if df[i].value_counts(normalize=True).iloc[0] > 0.9:
        num_drop.append(i)
        
print(num_drop)

['capital.gain', 'capital.loss']


In [26]:
X = df.drop(labels = cat_drop + num_drop + ['income'],axis=1)
y = df['income']

In [27]:
X.drop('education',axis=1,inplace=True)
X.drop('fnlwgt',axis=1,inplace=True)

In [28]:
#Listing all options other than private
to_replace = list(X['workclass'].unique())
to_replace.remove('Private')

#Placing all other categories under one bracket
X.replace(to_replace,'Non-Private',inplace=True)
X['workclass'].value_counts(normalize=True)*100

Private        73.887673
Non-Private    26.112327
Name: workclass, dtype: float64

In [29]:
#Let us consolidate all options where individuals were married at least once (i.e. all options other than never-married)
to_replace = list(X['marital.status'].unique())
to_replace.remove('Never-married')

#Placing all other categories under one bracket
X.replace(to_replace,'Married',inplace=True)

#Renaming the 'Never-married' category to 'Single'
X.replace('Never-married','Single',inplace=True)

In [31]:
#Separating the categorical variables in feature matrix that need to be encoded 

cols_X = pd.DataFrame(X.dtypes)
X_cat_cols = list(cols_X[cols_X[0]=='object'].index)
X_num_cols = list(cols_X[cols_X[0]=='int64'].index)

In [32]:
X = pd.get_dummies(data=X,prefix=X_cat_cols,drop_first=True)

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.25,random_state=101)

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [35]:
X_train[X_num_cols] = sc.fit_transform(X_train[X_num_cols])
X_val[X_num_cols] = sc.transform(X_val[X_num_cols])

In [36]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, roc_curve, log_loss, brier_score_loss

In [38]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(random_state=101)
log.fit(X_train,y_train)
log_y_pred = log.predict_proba(X_val)
log_roc = roc_auc_score(y_val,log_y_pred[:,-1])
print('ROC AUC score : ',log_roc)
print(log.get_params())

ROC AUC score :  0.8772318737812443
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 101, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [40]:
len(log_y_pred)

7541

In [51]:
attrlist = []

for i,row in X_val.iterrows():
    race = ""
    gender = ""
    if row['race_Asian-Pac-Islander'] == 1:
        race = "Asian"
    elif row['race_Black'] == 1:
        race = "Black"
    elif row['race_White'] == 1:
        race = "White"
    elif row['race_Other'] == 1:
        race = "Other"
    else:
        race = "NativeAm"
        
    if row['sex_Male'] == 1:
        gender = "Male"
    else:
        gender = "Female"
        
    attrlist.append(race+"_"+gender)

        

In [52]:
log_y_pred

array([[0.98805573, 0.01194427],
       [0.29615703, 0.70384297],
       [0.96698763, 0.03301237],
       ...,
       [0.37971341, 0.62028659],
       [0.33474661, 0.66525339],
       [0.19876634, 0.80123366]])

In [53]:
scores = []
for y in log_y_pred:
    scores.append(y[1])

In [54]:
incomedata = []
for att,sc in zip(attrlist,scores):
    incomedata.append({'score':sc, 'real_attr':att})
    
incomedata.sort(key=lambda x:x['score'], reverse=True)

In [55]:
incomedata

[{'score': 0.9745225421791006, 'real_attr': 'White_Male'},
 {'score': 0.9677706351887208, 'real_attr': 'White_Male'},
 {'score': 0.9643494676809221, 'real_attr': 'White_Male'},
 {'score': 0.9534162138443115, 'real_attr': 'White_Male'},
 {'score': 0.949975071427275, 'real_attr': 'White_Male'},
 {'score': 0.9491959088437405, 'real_attr': 'White_Male'},
 {'score': 0.9473007637012764, 'real_attr': 'White_Male'},
 {'score': 0.9447017430682298, 'real_attr': 'White_Male'},
 {'score': 0.9441750334991531, 'real_attr': 'White_Male'},
 {'score': 0.9418461852133313, 'real_attr': 'White_Male'},
 {'score': 0.9415769228871519, 'real_attr': 'White_Male'},
 {'score': 0.9387300935560305, 'real_attr': 'White_Male'},
 {'score': 0.9356657240724678, 'real_attr': 'White_Male'},
 {'score': 0.9353390560961274, 'real_attr': 'White_Male'},
 {'score': 0.9344333663016277, 'real_attr': 'White_Male'},
 {'score': 0.9309714195680526, 'real_attr': 'White_Female'},
 {'score': 0.9296937256191923, 'real_attr': 'White_Fema

In [56]:
import json
f = open('incomedatalist.json','w')
json.dump(incomedata,f)
f.close()

In [57]:
set([i['real_attr'] for i in incomedata])

{'Asian_Female',
 'Asian_Male',
 'Black_Female',
 'Black_Male',
 'NativeAm_Female',
 'NativeAm_Male',
 'Other_Female',
 'Other_Male',
 'White_Female',
 'White_Male'}