# Prepare the data and build a model

In [None]:
# Install requirements
!pip install -r requirements.txt
!pip install -r ../requirements.txt

## Prepare the data

In [15]:
# Import packages
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd

import fatf_dash.census as census
from fatf_dash.census import census_names

In [4]:
# Download census
! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# ! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
# ! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
# ! wget http://archive.ics.uci.edu/ml/machine-learning-databases/adult/old.adult.names

--2022-03-22 20:19:37--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘adult.data’


2022-03-22 20:19:55 (219 KB/s) - ‘adult.data’ saved [3974305/3974305]



In [8]:
# Read in the data
# Passing `names` makes pandas treat the first line of the file as data and
# label the columns with the project's `census_names`. `skipinitialspace`
# drops the blank that follows each comma, so fields can be compared against
# bare markers such as '?'.
read_options = dict(names=census_names, skipinitialspace=True)
df = pd.read_csv('adult.data', **read_options)

In [9]:
# Check for missing values
# Per column: does at least one cell hold the '?' marker?
df.eq('?').any(axis=0)

  result = method(y)


age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
income            False
dtype: bool

In [10]:
# Remove missing data points
# For each affected column, collect the index labels of the rows that hold
# the '?' marker.
workclass_missing = df.index[df['workclass'].eq('?')].tolist()
occupation_missing = df.index[df['occupation'].eq('?')].tolist()
native_country_missing = df.index[df['native-country'].eq('?')].tolist()

# A row is discarded when it is flagged in any of the three columns.
all_missing = (
    set(workclass_missing)
    | set(occupation_missing)
    | set(native_country_missing)
)

# Drop the flagged rows from `df` itself (no copy is kept).
df.drop(index=all_missing, inplace=True)

In [11]:
# Check for missing values again; every column should now report False
df.eq('?').any(axis=0)

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
income            False
dtype: bool

In [12]:
# Display feature types (the dtype pandas inferred for every column of df)
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [13]:
# Get unique feature values
# Columns whose distinct values are collected; the dictionary below keeps
# this order.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income',
]

# column name -> sorted array of the distinct values found in that column
features_unique = {
    column: np.sort(df[column].unique())
    for column in categorical_columns
}

In [14]:
# Create feature mappings in both directions
# map_i_s: feature name -> {integer code: original value}
# map_s_i: feature name -> {original value: integer code}
# The integer code of a value is its position in the sorted unique values
# stored in `features_unique`.
map_i_s = {
    name: dict(enumerate(values))
    for name, values in features_unique.items()
}
map_s_i = {
    name: {label: code for code, label in enumerate(values)}
    for name, values in features_unique.items()
}

In [17]:
# Make sure that these are still up to date
def dict_comp(d1, d2):
    """Assert that two two-level dictionaries are identical.

    Both arguments map an outer key to an inner dictionary. The check
    passes only when the outer keys match, each pair of inner dictionaries
    has the same keys, and every inner value compares equal.

    Parameters
    ----------
    d1, d2 : dict of dict
        The nested dictionaries to compare. Keys on each level must be
        mutually sortable.

    Raises
    ------
    AssertionError
        On the first difference found; the message names the outer key
        (and inner key, where applicable) at which the two inputs diverge.
    """
    d1_keys, d2_keys = sorted(d1), sorted(d2)
    assert d1_keys == d2_keys, (
        'Outer keys differ: {!r} vs {!r}'.format(d1_keys, d2_keys))

    for key in d1_keys:
        inner_1, inner_2 = d1[key], d2[key]
        inner_1_keys, inner_2_keys = sorted(inner_1), sorted(inner_2)
        assert inner_1_keys == inner_2_keys, (
            'Inner keys differ for {!r}: {!r} vs {!r}'.format(
                key, inner_1_keys, inner_2_keys))

        for inner_key in inner_1_keys:
            assert inner_1[inner_key] == inner_2[inner_key], (
                'Values differ for {!r}[{!r}]: {!r} vs {!r}'.format(
                    key, inner_key, inner_1[inner_key], inner_2[inner_key]))

# Compare the mappings exposed by the `census` module with the ones rebuilt
# from the data in this notebook; dict_comp asserts on any difference.
dict_comp(census.map_i_s, map_i_s)
dict_comp(census.map_s_i, map_s_i)

In [18]:
# Map all of the string features to integer, making the data set numerical
# Columns that are already numeric are skipped so that re-running this cell is
# harmless: pushing integer codes through a string -> int dictionary a second
# time would silently turn the whole column into NaN.
for feature_name in map_s_i:
    if pd.api.types.is_numeric_dtype(df[feature_name]):
        continue
    encoded = df[feature_name].map(map_s_i[feature_name])
    # Series.map yields NaN for values that are absent from the mapping; fail
    # loudly instead of letting NaNs leak into the training data.
    assert encoded.notna().all(), (
        'Unmapped values in column {!r}'.format(feature_name))
    df[feature_name] = encoded

In [19]:
# Check the types again (lists the dtype of every column of df)
df.dtypes

age               int64
workclass         int64
fnlwgt            int64
education         int64
education-num     int64
marital-status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital-gain      int64
capital-loss      int64
hours-per-week    int64
native-country    int64
income            int64
dtype: object

In [20]:
# Remove ground truth from the data set
# `ground_truth` holds the 'income' column; `array` holds every other column,
# both as plain NumPy arrays.
ground_truth = df['income'].values
array = df.drop(columns=['income']).values

## Build a model

In [21]:
import joblib
import sklearn.linear_model
import sklearn.metrics

In [22]:
# Fit logistic regression
# `fit` returns the estimator itself, so construction and training can be
# chained; the bare `clf` on the last line displays the fitted model.
clf = sklearn.linear_model.LogisticRegression(solver='lbfgs').fit(
    array, ground_truth)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Check training set accuracy
# Predictions are made on the same rows the model was fitted on, so this is
# a training-set figure, not a held-out estimate.
train_predict = clf.predict(array)
sklearn.metrics.accuracy_score(y_true=ground_truth, y_pred=train_predict)

0.7840660433658245

In [24]:
# Save the data, ground truth and model
# NOTE: np.save appends '.npy' to any file name that does not already end in
# it, so the arrays land on disk as 'adult_num.pkl.npy' and
# 'adult_num_gt.pkl.npy' (NumPy binary format, despite the '.pkl' in the name).
np.save(file='adult_num.pkl', arr=array)
np.save(file='adult_num_gt.pkl', arr=ground_truth)
joblib.dump(value=clf, filename='log_reg.joblib')

['log_reg.joblib']