In [228]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Lets download the Adult data from UCI repository

In [229]:
import urllib.request

print('Downloading Adult data')

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
urllib.request.urlretrieve(url, 'adult-data.csv')

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
urllib.request.urlretrieve(url, 'adult-data-test.csv')

Downloading Adult data


('adult-data-test.csv', <http.client.HTTPMessage at 0x125076be0>)

In [230]:
col_names = ['age', 'workclass','fnlwgt','education','education-num','marital-status','occupation',
             'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','salary']

df = pd.read_csv('adult-data.csv', header=None, names = col_names)
df_test = pd.read_csv('adult-data-test.csv', header=None, names = col_names)
df_test = df_test.drop(df_test.index[0])


df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Missing value completion and y, X split

In [231]:
df = df.replace({' ?': None})
df = df.fillna(method = 'backfill')

df_test = df_test.replace({' ?': None})
df_test = df_test.fillna(method = 'backfill')

y = df.salary
y_test = df_test.salary

df = df.drop('salary',axis = 1)
df_test = df_test.drop('salary',axis = 1)

In [232]:
y = np.int8(y == ' >50K')
y_test = np.int8(y_test == ' >50K.')

## One hot vectorization of categorical variables

In [233]:
col_names = ['workclass','education','marital-status','occupation',
            'relationship','race','sex']

for name in col_names:
    one_hot = pd.get_dummies(df[name])
    df = df.drop(name,axis = 1)
    df = df.join(one_hot)
    
for name in col_names:
    one_hot = pd.get_dummies(df_test[name])
    df_test = df_test.drop(name,axis = 1)
    df_test = df_test.join(one_hot)
    
df = df.drop('native-country',axis = 1)
df_test = df_test.drop('native-country',axis = 1)

In [234]:
df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,...,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0


In [235]:
X = df.to_numpy()
X_test = df_test.to_numpy()

Now using LDA and other models on our X,y

In [236]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis().fit(X, y)

train_error = np.mean(model.predict(X) != y)
test_error = np.mean(model.predict(X_test) != y_test)
print("Train Error = ", round(100*train_error,1))
print("Test Error = ", round(100*test_error,1))

Train Error =  16.0
Test Error =  15.7


In [237]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=0).fit(X, y)

train_error = np.mean(model.predict(X) != y)
test_error = np.mean(model.predict(X_test) != y_test)
print("Train Error = ", round(100*train_error,1))
print("Test Error = ", round(100*test_error,1))

Train Error =  20.4
Test Error =  20.2


In [238]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=15, random_state=0).fit(X, y)

train_error = np.mean(model.predict(X) != y)
test_error = np.mean(model.predict(X_test) != y_test)
print("Train Error = ", round(100*train_error,1))
print("Test Error = ", round(100*test_error,1))

Train Error =  10.9
Test Error =  13.7
