## Data organization

## Data loading

In [1]:
pip install shap

Note: you may need to restart the kernel to use updated packages.




In [1]:
from shap.datasets import adult

X, y = adult()
print(X.shape)
print(y.shape)
print(type(X))
print(type(y))

(32561, 12)
(32561,)
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [2]:
print(X)
print(y)

        Age  Workclass  Education-Num  Marital Status  Occupation  \
0      39.0          7           13.0               4           1   
1      50.0          6           13.0               2           4   
2      38.0          4            9.0               0           6   
3      53.0          4            7.0               2           6   
4      28.0          4           13.0               2          10   
...     ...        ...            ...             ...         ...   
32556  27.0          4           12.0               2          13   
32557  40.0          4            9.0               2           7   
32558  58.0          4            9.0               6           1   
32559  22.0          4            9.0               4           1   
32560  52.0          5            9.0               2           4   

       Relationship  Race  Sex  Capital Gain  Capital Loss  Hours per week  \
0                 0     4    1        2174.0           0.0            40.0   
1              

In [3]:
print(X['Age'])
print(X['Age'].shape)
print(X['Age'].values)
print(type(X['Age'].values))
print(X['Age'].values.reshape(-1,1))
print(X['Age'].values.reshape(-1,1).shape)

0        39.0
1        50.0
2        38.0
3        53.0
4        28.0
         ... 
32556    27.0
32557    40.0
32558    58.0
32559    22.0
32560    52.0
Name: Age, Length: 32561, dtype: float32
(32561,)
[39. 50. 38. ... 58. 22. 52.]
<class 'numpy.ndarray'>
[[39.]
 [50.]
 [38.]
 ...
 [58.]
 [22.]
 [52.]]
(32561, 1)


In [4]:
numerical_columns = ['Age','Education-Num','Capital Gain','Capital Loss','Hours per week'] 
categorical_columns = ['Workclass','Marital Status','Occupation','Relationship','Race','Sex','Country']

## Conversion of categorical data

In [5]:
import pandas as pd # for one-hot encoding
from sklearn.preprocessing import StandardScaler # for normalization

In [6]:
# Normalization of numerical data
for column in numerical_columns:
    scaler = StandardScaler()
    X[column] = scaler.fit_transform(X[column].values.reshape(-1,1))
    #X[column] = scaler.fit_transform(X[column])
    
      
print(X)
print(type(X))

            Age  Workclass  Education-Num  Marital Status  Occupation  \
0      0.030671          7       1.134739               4           1   
1      0.837109          6       1.134739               2           4   
2     -0.042642          4      -0.420060               0           6   
3      1.057047          4      -1.197459               2           6   
4     -0.775768          4       1.134739               2          10   
...         ...        ...            ...             ...         ...   
32556 -0.849080          4       0.746039               2          13   
32557  0.103983          4      -0.420060               2           7   
32558  1.423610          4      -0.420060               6           1   
32559 -1.215643          4      -0.420060               4           1   
32560  0.983734          5      -0.420060               2           4   

       Relationship  Race  Sex  Capital Gain  Capital Loss  Hours per week  \
0                 0     4    1      0.148453 

In [7]:
# Data type change of categorical data 
# 진짜 categorical data 라는 것을 지정해주는 작업

for column in categorical_columns:
    X[column] = X[column].astype('category')

print(X)
print(X['Country'].values)

            Age Workclass  Education-Num Marital Status Occupation  \
0      0.030671         7       1.134739              4          1   
1      0.837109         6       1.134739              2          4   
2     -0.042642         4      -0.420060              0          6   
3      1.057047         4      -1.197459              2          6   
4     -0.775768         4       1.134739              2         10   
...         ...       ...            ...            ...        ...   
32556 -0.849080         4       0.746039              2         13   
32557  0.103983         4      -0.420060              2          7   
32558  1.423610         4      -0.420060              6          1   
32559 -1.215643         4      -0.420060              4          1   
32560  0.983734         5      -0.420060              2          4   

      Relationship Race Sex  Capital Gain  Capital Loss  Hours per week  \
0                0    4   1      0.148453      -0.21666       -0.035429   
1        

In [8]:
X = pd.get_dummies(X)
print(X)

            Age  Education-Num  Capital Gain  Capital Loss  Hours per week  \
0      0.030671       1.134739      0.148453      -0.21666       -0.035429   
1      0.837109       1.134739     -0.145920      -0.21666       -2.222153   
2     -0.042642      -0.420060     -0.145920      -0.21666       -0.035429   
3      1.057047      -1.197459     -0.145920      -0.21666       -0.035429   
4     -0.775768       1.134739     -0.145920      -0.21666       -0.035429   
...         ...            ...           ...           ...             ...   
32556 -0.849080       0.746039     -0.145920      -0.21666       -0.197409   
32557  0.103983      -0.420060     -0.145920      -0.21666       -0.035429   
32558  1.423610      -0.420060     -0.145920      -0.21666       -0.035429   
32559 -1.215643      -0.420060     -0.145920      -0.21666       -1.655225   
32560  0.983734      -0.420060      1.888424      -0.21666       -0.035429   

       Workclass_0  Workclass_1  Workclass_2  Workclass_3  Work

In [9]:
# One-hot encoding of categorical data
X = pd.get_dummies(X) 

# Conversion of data frame to numpy
X = X.values

# Converision: {False, True} --> {0., 1.}
y = y.astype(float) 

In [10]:
print(X.shape)
print(y.shape)
#print(X)
#print(y)
print(max(X[:,42]))

(32561, 91)
(32561,)
1.0


## train-val-test split

In [11]:
from sklearn.model_selection import train_test_split

X_,X_test,y_,y_test = train_test_split(X,y,test_size=1/10,stratify=y)
#X_,X_test,y_,y_test = train_test_split(X,y,test_size=1/10)

X_train,X_val,y_train,y_val = train_test_split(X_,y_,test_size=1/9,stratify=y_)
#X_train,X_val,y_train,y_val = train_test_split(X_,y_,test_size=1/9)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(sum(y_train)/y_train.shape)
print(sum(y_val)/y_val.shape)
print(sum(y_test)/y_test.shape)

(26048, 91)
(3256, 91)
(3257, 91)
[0.24082463]
[0.24078624]
[0.24071231]


## Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
model_LR = LogisticRegression()

# training
model_LR.fit(X_train, y_train)

# evaulation
val_acc = model_LR.score(X_val, y_val) 

print(val_acc)

0.8507371007371007


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
from joblib import dump

dump(model_LR, 'LR_sample.joblib')

['LR_sample.joblib']

In [15]:
from joblib import load

loaded_model_LR = load('LR_sample.joblib')