In [8]:
import pandas as pd
import numpy as np

In [9]:
# Column labels for the census file, in the order they are handed to
# pd.read_csv(names=...); 'income' is the last column.
census_names = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education-num',
    'marital-status', 'occupation', 'relationship',
    'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country',
    'income',
]

In [37]:
# Load the raw census file. Column labels come from `census_names`, and
# spaces that follow each delimiter are skipped while parsing.
df = pd.read_csv(
    './adult_data/adult.data',
    names=census_names,
    skipinitialspace=True,
)

In [58]:
# Report, per column, whether any cell holds the '?' placeholder.
# isin() tests membership rather than running `== '?'` on every column, so the
# integer columns are never pushed through a string-vs-number comparison.
df.isin(['?']).any(axis=0)

age               False
workclass          True
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation         True
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
income            False
dtype: bool

In [66]:
# Row labels whose value is the '?' placeholder, one list per checked column.
workclass_missing = df.loc[df['workclass'] == '?'].index.tolist()
occupation_missing = df.loc[df['occupation'] == '?'].index.tolist()
native_country_missing = df.loc[df['native-country'] == '?'].index.tolist()

# Every row label that has '?' in at least one of the three columns.
all_missing = (
    set(workclass_missing)
    | set(occupation_missing)
    | set(native_country_missing)
)

In [69]:
# Remove, in place, every row whose index label was collected in `all_missing`.
df.drop(labels=list(all_missing), axis='index', inplace=True)

In [70]:
# Re-check, per column, that no '?' placeholder is left in the frame.
# isin() tests membership rather than running `== '?'` on every column, so the
# integer columns are never pushed through a string-vs-number comparison
# (which NumPy reports with an elementwise-comparison warning).
df.isin(['?']).any(axis=0)

  result = method(y)


age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
income            False
dtype: bool

---

In [73]:
# Show the dtype of every column of `df`.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [90]:
# Sorted distinct values for each listed column, keyed by column name.
# Key order follows the tuple below.
features_unique = {
    column: np.sort(df[column].unique())
    for column in (
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
        'income',
    )
}

In [91]:
# Two lookup tables per feature:
#   map_i_s[feature][index] -> value   (position in the sorted unique values)
#   map_s_i[feature][value] -> index   (the inverse mapping)
map_i_s = {}
map_s_i = {}
for feature_name, categories in features_unique.items():
    index_to_label = dict(enumerate(categories))
    map_i_s[feature_name] = index_to_label
    map_s_i[feature_name] = {
        label: index for index, label in index_to_label.items()
    }

In [96]:
# Replace each mapped column's values with their integer index from map_s_i.
for feature_name, label_to_index in map_s_i.items():
    encoded = df[feature_name].map(label_to_index)
    df[feature_name] = encoded

In [98]:
# Show the dtype of every column of `df` after the value-to-index mapping.
df.dtypes

age               int64
workclass         int64
fnlwgt            int64
education         int64
education-num     int64
marital-status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital-gain      int64
capital-loss      int64
hours-per-week    int64
native-country    int64
income            int64
dtype: object

---

In [104]:
# Feature matrix: every column except 'income'.
array = df.drop(columns=['income']).values
# Label vector: the 'income' column on its own.
ground_truth = df.loc[:, 'income'].values

---

In [115]:
import sklearn.linear_model
import sklearn.metrics
import joblib

In [110]:
# Fit a logistic-regression classifier on the full feature matrix and labels.
# NOTE(review): the features are passed as-is (no scaling step is visible) and
# max_iter is left at its default - confirm that lbfgs actually converges.
clf = sklearn.linear_model.LogisticRegression(
    solver='lbfgs',
)
clf.fit(X=array, y=ground_truth)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [111]:
# Predict labels for the same matrix the classifier was fitted on.
train_predict = clf.predict(X=array)

In [112]:
# Fraction of rows where the prediction equals the label (training data).
sklearn.metrics.accuracy_score(
    y_true=ground_truth,
    y_pred=train_predict,
)

0.7840660433658245

---

In [113]:
# Persist the feature matrix with np.save.
# NOTE: np.save appends '.npy' when the file name does not already end with
# it, so the file on disk is './adult_data/adult_num.pkl.npy'.
np.save(file='./adult_data/adult_num.pkl', arr=array)

In [114]:
# Persist the label vector with np.save.
# NOTE: np.save appends '.npy' when the file name does not already end with
# it, so the file on disk is './adult_data/adult_num_gt.pkl.npy'.
np.save(file='./adult_data/adult_num_gt.pkl', arr=ground_truth)

In [116]:
# Serialize the fitted classifier to disk; the cell displays the list of
# file names that joblib.dump returns.
joblib.dump(value=clf, filename='./adult_data/log_reg.joblib')

['./adult_data/log_reg.joblib']

In [124]:
# Draw 10 distinct row positions from `array` (sampling without replacement).
# A dedicated, seeded generator keeps the selection identical across kernel
# restarts; the unseeded global np.random state gave different rows each run.
sample_rng = np.random.RandomState(0)
sample_rng.choice(array.shape[0], 10, replace=False)

array([ 8567, 14528,  6236,  2962, 26274, 10925,  1148,  8223, 22340,
       14193])