In [1]:
import pandas as pd
import numpy as np
import adult_data_functions as af
from sklearn import preprocessing
from sklearn.utils import resample
import matplotlib.pyplot as plt

In [2]:
# One-hot encoder for the categorical feature columns; handed to af.preprocess below.
# NOTE(review): scikit-learn's default handle_unknown="error" makes transform raise on
# categories not seen during fit — confirm this is the desired behaviour for new data.
categorical_feature_encoder = preprocessing.OneHotEncoder()

# Three independent label encoders, one per attribute that gets an integer code.
sex_encoder, race_encoder, income_encoder = (
    preprocessing.LabelEncoder() for _ in range(3)
)

In [3]:
# Seed NumPy's global RNG before preprocessing.
# NOTE(review): presumably this makes the drop_prop_male_poor subsampling inside
# af.preprocess reproducible — confirm in adult_data_functions.
np.random.seed(0)

# With encoder_fit_boolean=True and drop_prop_male_poor set, af.preprocess hands back
# two dictionaries: the training split and the "excessive" split.
train_data_dict, excessive_data_dict = af.preprocess(
    adult_dt_path="data/adult.data",
    categorical_feature_encoder=categorical_feature_encoder,
    sex_encoder=sex_encoder,
    race_encoder=race_encoder,
    income_encoder=income_encoder,
    encoder_fit_boolean=True,
    drop_prop_male_poor=0.7,
)

In [4]:
# Relative frequency of each encoded sex label in the training split
# (normalize=True yields proportions rather than raw counts).
pd.Series(train_data_dict["sex"]).value_counts(normalize=True)

1    0.509785
0    0.490215
dtype: float64

In [5]:
# Relative frequency of each encoded income label in the training split
# (normalize=True yields proportions rather than raw counts).
pd.Series(train_data_dict["income-label"]).value_counts(normalize=True)

0    0.643137
1    0.356863
dtype: float64

In [6]:
# Number of entries in the training split (length of its "sex" field).
len(train_data_dict["sex"])

21972

In [7]:
# Number of entries in the "excessive" split that af.preprocess returned
# alongside the training split.
len(excessive_data_dict["sex"])

10589

In [8]:
# Preprocess the held-out file with the same encoder objects used for the training data.
# encoder_fit_boolean=False here (it was True for the training call).
# NOTE(review): presumably this means the encoders are only applied, not re-fitted —
# confirm in adult_data_functions.
# No drop_prop_male_poor is passed, and a single dictionary is bound.
test_data_dict = af.preprocess(
    adult_dt_path="data/adult.test",
    categorical_feature_encoder=categorical_feature_encoder,
    sex_encoder=sex_encoder,
    race_encoder=race_encoder,
    income_encoder=income_encoder,
    encoder_fit_boolean=False,
)

# Fit Logistic Model
## Income

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, f1_score, recall_score, make_scorer

In [10]:
# Build one dense feature matrix per split (train, excessive, test — in that order):
# the categorical block is densified with .toarray() and placed to the left of the
# continuous columns.
logistic_x_train_mat, logistic_x_exc_mat, logistic_x_test_mat = (
    np.hstack([
        split_dict["categorical-features"].toarray(),
        split_dict["continuous-features"].to_numpy(),
    ])
    for split_dict in (train_data_dict, excessive_data_dict, test_data_dict)
)

In [11]:
# Baseline logistic regression for the income label, using library defaults apart
# from a fixed random_state.
naive_logistic_model = LogisticRegression(random_state=0)
# fit() is left as the cell's last expression, so the fitted estimator's repr is displayed.
naive_logistic_model.fit(X=logistic_x_train_mat, y=train_data_dict["income-label"])

LogisticRegression(random_state=0)

In [12]:
# Proportion of training rows assigned to each predicted income label
# (normalize=True yields proportions rather than raw counts).
pd.Series(naive_logistic_model.predict(logistic_x_train_mat)).value_counts(normalize=True)

0    0.856226
1    0.143774
dtype: float64

In [13]:
# Training-set metrics for the income model, with label 1 as the positive class.
af.score_summary(
    y_true=train_data_dict["income-label"],
    y_pred=naive_logistic_model.predict(logistic_x_train_mat),
    pos_label=1,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.716639,0.755619,0.304425,0.434


In [14]:
# Metrics for the income model on the excessive split.
# pos_label=0 here, unlike the training and test cells, which score label 1.
# NOTE(review): presumably the excessive split holds only the rows removed via
# drop_prop_male_poor, i.e. a single income class — confirm in adult_data_functions.
af.score_summary(
    y_true=excessive_data_dict["income-label"],
    y_pred=naive_logistic_model.predict(logistic_x_exc_mat),
    pos_label=0,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.937577,1.0,0.937577,0.967783


In [15]:
# Test-set metrics for the income model, with label 1 as the positive class.
af.score_summary(
    y_true=test_data_dict["income-label"],
    y_pred=naive_logistic_model.predict(logistic_x_test_mat),
    pos_label=1,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.790001,0.613263,0.300572,0.40342


In [16]:
# Same test-set predictions of the income model, now scored with label 0 as the
# positive class.
af.score_summary(
    y_true=test_data_dict["income-label"],
    y_pred=naive_logistic_model.predict(logistic_x_test_mat),
    pos_label=0,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.790001,0.813143,0.941375,0.872573


## Sex

In [17]:
# Logistic regression predicting the encoded sex label from the same training
# feature matrix used for the income model.
# NOTE: this rebinds naive_logistic_model, so the income model fitted earlier is no
# longer reachable under this name once the cell has run. Re-running earlier
# income-scoring cells afterwards would silently use this sex model instead.
# NOTE(review): verify that logistic_x_train_mat does not itself contain the sex
# column — the feature construction lives in adult_data_functions.
naive_logistic_model = LogisticRegression(random_state=0)
# fit() is left as the cell's last expression, so the fitted estimator's repr is displayed.
naive_logistic_model.fit(X=logistic_x_train_mat, y=train_data_dict["sex"])

LogisticRegression(random_state=0)

In [18]:
# Proportion of training rows assigned to each predicted sex label
# (normalize=True yields proportions rather than raw counts).
pd.Series(naive_logistic_model.predict(logistic_x_train_mat)).value_counts(normalize=True)

1    0.67677
0    0.32323
dtype: float64

In [19]:
# Training-set metrics for the sex model, with label 1 as the positive class.
af.score_summary(
    y_true=train_data_dict["sex"],
    y_pred=naive_logistic_model.predict(logistic_x_train_mat),
    pos_label=1,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.581695,0.567586,0.753504,0.647463


In [20]:
# Test-set metrics for the sex model, with label 1 as the positive class.
af.score_summary(
    y_true=test_data_dict["sex"],
    y_pred=naive_logistic_model.predict(logistic_x_test_mat),
    pos_label=1,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.614029,0.708988,0.714733,0.711849


In [21]:
# Same test-set predictions of the sex model, now scored with label 0 as the
# positive class.
af.score_summary(
    y_true=test_data_dict["sex"],
    y_pred=naive_logistic_model.predict(logistic_x_test_mat),
    pos_label=0,
)

Unnamed: 0,accuracy,precision,recall,f1
0,0.614029,0.419089,0.412286,0.415659
