In [1]:
import pandas as pd

In [2]:
# Load the UCI Adult data with explicit column names, then discard the
# `fnlwgt` column, which is not used as a feature here.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    index_col=False,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
).drop(columns='fnlwgt')

# One-hot encode every categorical column; drop_first=True omits the first
# level of each so the dummies are not redundant.
df = df.pipe(pd.get_dummies, drop_first=True)

# Split off the binary target. The column name keeps the leading space that
# the raw values carry (' >50K').
y = df.pop('income_ >50K')

In [3]:
# Preview the first rows of the one-hot encoded feature frame (target column already removed).
df.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,13,2174,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class balance of
# the target the same in both partitions; the fixed `random_state` makes the
# split (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=42
)

In [4]:
from sklearn.decomposition import PCA

In [5]:
# PCA model configured to keep five components (it is fitted in a later cell).
pca = PCA(n_components=5)

In [8]:
# Fit PCA on the raw (unscaled) training features. PCA is unsupervised, so the
# labels are not passed: the estimator ignores any `y` argument.
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [10]:
# Variance captured by each fitted component, in absolute terms and as a fraction of the total.
pca.explained_variance_, pca.explained_variance_ratio_

(array([5.45082797e+07, 1.66090600e+05, 1.85817656e+02, 1.47982092e+02,
        6.50991809e+00]),
 array([9.96955917e-01, 3.03779549e-03, 3.39860316e-06, 2.70659105e-06,
        1.19066340e-07]))

In [11]:
# Component loadings: one row per principal component, one column per input feature.
pca.components_

array([[ 1.40262091e-04,  4.37383583e-05,  9.99998410e-01,
        -1.77223447e-03,  1.35740265e-04, -6.78559090e-08,
        -4.56204021e-07, -5.33103354e-09, -2.47586011e-06,
         2.36312749e-06,  1.30205248e-06, -2.39471495e-07,
        -3.56670220e-09, -5.78227144e-07, -2.20773412e-07,
        -8.86894794e-08, -1.75447957e-07, -3.05145307e-07,
        -1.96627244e-07, -2.92217495e-07, -2.34172115e-07,
         2.00120095e-06,  8.18279179e-07, -3.11052755e-06,
         1.58603889e-06, -2.00381811e-09,  3.07543578e-06,
        -1.81064746e-06, -8.56808820e-09,  5.92017537e-06,
        -1.45405385e-07, -4.14127058e-06, -3.27602235e-07,
        -3.01748623e-07, -1.37676249e-06, -4.57180213e-09,
        -9.36358759e-07,  2.69470018e-06, -2.05702112e-07,
        -6.71552594e-07, -8.49847641e-07, -1.71802693e-06,
        -7.82658892e-08,  3.81837101e-06, -1.40412397e-07,
         7.15330161e-07, -2.83668836e-07, -5.41010177e-07,
        -1.66577668e-06, -4.43164617e-07, -2.70016444e-0

In [14]:
# Pipeline plumbing and feature scalers used by the modelling cells below.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [15]:
from sklearn.linear_model import LogisticRegression

In [23]:
# Three-stage model: standardise the features, project them onto the first
# ten principal components, then classify with logistic regression (C=0.1).
pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('pca', PCA(n_components=10)),
        ('predict', LogisticRegression(C=0.1)),
    ]
)


In [24]:
# Fit the scaler, PCA and classifier in sequence on the training split.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaling',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=10,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('predict',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [25]:
# Predict labels for the held-out test rows with the fitted pipeline.
prediction = pipeline.predict(X_test)

In [26]:
# Display the predicted labels.
prediction

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [27]:
from sklearn.metrics import precision_score, recall_score

In [28]:
# Precision and recall on the test split, displayed as a (precision, recall) tuple.
# NOTE(review): the positive class is presumably label 1 (`income_ >50K`) via the
# scorers' default `pos_label` — confirm.
precision_score(y_test, prediction), recall_score(y_test, prediction)

(0.7, 0.5446428571428571)

In [30]:
# Logistic-regression coefficients: one per retained principal component,
# not one per original feature, because the classifier sees the PCA output.
pipeline.named_steps['predict'].coef_

array([[ 1.1265917 ,  0.02922512,  0.11448624,  0.00441967, -0.25455235,
         0.42395   , -0.08924177,  0.14490544, -0.17775192,  0.02828345]])

In [31]:
# PCA loadings inside the fitted pipeline; they relate each component back to
# the standardised input features.
pipeline.named_steps['pca'].components_

array([[ 2.62107292e-01,  2.87602538e-01,  1.13458928e-01,
         8.98845588e-02,  2.14023078e-01,  4.24575004e-02,
         8.18927933e-02, -1.14047720e-02, -1.98438806e-01,
         1.34223594e-01,  1.38803134e-01,  5.03270664e-02,
        -2.40320982e-03, -1.04108934e-01, -6.20557526e-02,
        -4.74056420e-02, -5.53016100e-02, -2.54152180e-02,
        -4.86210509e-02,  1.65123401e-02,  2.21796794e-02,
         1.56999177e-01,  1.05989895e-01, -7.83844421e-02,
         1.46710388e-01, -3.22980693e-02,  1.22515326e-01,
        -8.43864634e-02, -1.15288629e-03,  3.66920788e-01,
        -3.93755915e-02, -3.20702983e-01, -6.75305783e-02,
        -3.14355765e-02, -1.04503311e-01,  5.25948965e-04,
         2.74478750e-02,  1.57946586e-01,  2.75002545e-02,
        -9.16049913e-02, -6.50028023e-02, -1.79716153e-01,
        -5.25291541e-02,  1.84690068e-01,  3.56017964e-02,
        -9.44069771e-03,  6.24995603e-04,  1.06011365e-02,
        -9.36812965e-02, -9.47050938e-02, -2.59263770e-0

In [52]:
from sklearn.model_selection import GridSearchCV

In [33]:
# Same three stages as the earlier pipeline, but with default hyper-parameters:
# the PCA size and the regularisation strength are left for the grid search.
# Note that this rebinds the name `pipeline`.
pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('pca', PCA()),
        ('predict', LogisticRegression()),
    ]
)


In [38]:
# Hyper-parameter grid for the pipeline; keys follow scikit-learn's
# "<step name>__<parameter>" convention.
params = dict(
    pca__n_components=[1, 5, 10, 15],  # number of principal components to keep
    predict__C=[0.001, 0.1, 1.0],      # LogisticRegression C values to try
)



In [39]:
# Exhaustive search over `params` for the pipeline; cross-validation and
# scoring are left at their defaults.
grid = GridSearchCV(estimator=pipeline, param_grid=params)

In [40]:
# Run the grid search on the training split: every combination in `params` is cross-validated.
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaling',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('predict',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
       

In [41]:
# Pipeline refitted with the best-scoring parameter combination.
grid.best_estimator_

Pipeline(memory=None,
         steps=[('scaling',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=15,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('predict',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [42]:
# Raw cross-validation results: a dict of arrays with per-candidate timings,
# parameters and scores.
# NOTE(review): wrapping this in pd.DataFrame(...) would give a far more readable table.
grid.cv_results_

{'mean_fit_time': array([0.16540847, 0.16421504, 0.16651616, 0.18266945, 0.18099928,
        0.18035107, 0.17957826, 0.17860627, 0.17929177, 0.20733781,
        0.20603466, 0.20892544]),
 'std_fit_time': array([0.00651546, 0.00576768, 0.00654416, 0.00418211, 0.00758072,
        0.00490597, 0.00503326, 0.00932681, 0.00566715, 0.00777039,
        0.00432345, 0.00115418]),
 'mean_score_time': array([0.01054215, 0.01078153, 0.01087642, 0.00697021, 0.00643778,
        0.00663371, 0.00843668, 0.00812769, 0.0083343 , 0.00834885,
        0.00838199, 0.00846152]),
 'std_score_time': array([0.00034572, 0.00037155, 0.00026495, 0.00114023, 0.00047629,
        0.00060474, 0.00025645, 0.00150715, 0.00030411, 0.00030207,
        0.00056222, 0.00020819]),
 'param_pca__n_components': masked_array(data=[1, 1, 1, 5, 5, 5, 10, 10, 10, 15, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
           

In [46]:
# Fraction of total variance explained by each PCA component of the best pipeline.
grid.best_estimator_.named_steps['pca'].explained_variance_ratio_

array([0.03510399, 0.02756753, 0.0257294 , 0.02104781, 0.01821084,
       0.01675893, 0.01534561, 0.01458388, 0.01346326, 0.01294463,
       0.01231902, 0.01202976, 0.01156397, 0.01120101, 0.01113405])

In [48]:
# Test-set predictions from the best pipeline found by the grid search;
# this rebinds `prediction` from the earlier cell.
prediction = grid.best_estimator_.predict(X_test)

In [49]:
# Test-set (precision, recall) pair for the grid-searched pipeline's predictions.
precision_score(y_test, prediction), recall_score(y_test, prediction)

(0.700414937759336, 0.5382653061224489)