# Pipeline Utils

## Load Data

In [14]:
import data_loader
import numpy as np
import pandas
import joblib
import warnings
warnings.filterwarnings('ignore')

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
import sklearn.neural_network.multilayer_perceptron as mlp
import xgboost as xgb

In [6]:
def target_binarizer(Y):
    """'<50' is set to 0 and the other one is set to 1"""
    lb = LabelBinarizer()
    return lb.fit_transform(Y)

### Train Data

In [7]:
# load data and assign names
trdf, valdf = data_loader.load_train_data("data/adult.data", is_df=True)
## adding columns labels https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/
trdf.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"
,"target"]
valdf.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"
,"target"]

In [8]:
Xtr = trdf.drop('target',axis=1)
Ytr = target_binarizer(trdf['target'])

In [9]:
Xva = valdf.drop('target', axis=1)
Yva = target_binarizer(valdf['target'])


### Test data

In [10]:
# load data and assign names
testdf = data_loader.load_test_data("data/adult.data", is_df=True)
## adding columns labels https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/
testdf.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"
,"target"]



In [11]:
Xts = testdf.drop('target',axis=1)
Yts = target_binarizer(testdf['target'])
print(Xts.shape)
print(Yts.shape)

(32560, 14)
(32560, 1)


## Pipeline functions

In [12]:
def cf_report(p, X, Y):
    Xva = X
    Yva = Y
    Yva_hat = p.predict(Xva)
    print(metrics.classification_report(Yva, Yva_hat))
    print(metrics.confusion_matrix(Yva, Yva_hat))

In [13]:
def auc(pipeline, X, Y):
    pred = pipeline.predict(X)
    print(pred)
    fpr, tpr, thresholds = metrics.roc_curve(Y, pred)
    print("F1 weighted {}".format(metrics.f1_score(Y,pred, average='weighted')))
    print("AUC : {}".format(metrics.auc(fpr, tpr)))
    print("Classification report : ")
    print(cf_report(pipeline, X,Y))

In [11]:
def select_object(X):
    return X.select_dtypes(include = [np.object])

In [12]:
def select_number(X):
    """selects all numerical features in the data set"""
    return X.select_dtypes(include= [np.int64, np.float64])

In [13]:
def strip_cols(X):
    """strips whitespaces from the columns"""
    return X.apply(lambda col: col.str.strip())

In [14]:
def target_binarizer(Y):
    """'<50' is set to 0 and the other one is set to 1"""
    lb = LabelBinarizer()
    return lb.fit_transform(Y)

In [15]:
# references https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
# references https://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html
# references https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

stringstrip = Pipeline([
                ('selector', FunctionTransformer(select_object,validate=False)),
                ('striper', FunctionTransformer(strip_cols, validate=False))
            ])
numerical_transform = Pipeline([
    ('selector', FunctionTransformer(select_number, validate = False)),
    ('scaler', StandardScaler()) # use get parameters and set parameters for validation set
    ])

cat_transformer = Pipeline([
    ('stringstriper' , stringstrip ),
#     ('missing-indicator', MissingIndicator(missing_values='?', features='all')),
    ('imputer', SimpleImputer(missing_values='?', strategy = 'constant', fill_value='MISSING')),
     ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'))
     ])

In [16]:
def selectcols(X, name = None):
    return X[name]

In [17]:
a = stringstrip.fit_transform(trdf)

In [18]:
a['workclass'][1]

'Self-emp-not-inc'

In [19]:
MissingIndicator()

MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                 sparse='auto')

In [20]:
b = numerical_transform.fit_transform(trdf)

In [21]:
# cat_transformer.set_params(stringstriper__selector__validate=False)
cat = cat_transformer.fit_transform(trdf)

In [22]:
preprocess = FeatureUnion([
    ('numerical_transform', numerical_transform),
    ('cat_transformer', cat_transformer)
])
