## Lending Club Pipeline

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib



In [2]:
# --- pipeline identity -------------------------------------------------
PIPELINE_NAME = 'lending_club'
# Base name (without extension) of the persisted pipeline artefact.
PIPELINE_SAVE_FILE = PIPELINE_NAME + '_output'

# --- data files and target column --------------------------------------
TESTING_DATA_FILE = 'lending_club_selected_features_test.csv'
TRAINING_DATA_FILE = 'lending_club_selected_features_train.csv'
TARGET = 'target'

# --- model input columns (order is the column order fed to the model) --
FEATURES = [
    'loan_amnt',
    'term',
    'installment',
    'grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'purpose',
    'title',
    'addr_state',
    'dti',
    'delinq_2yrs',
    'earliest_cr_line',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'last_credit_pull_d',
    'pub_rec_bankruptcies',
    'fico_average',
]

# Numerical variables with NA in the train set.
NUMERICAL_VARS_WITH_NA = [
    'pub_rec_bankruptcies',
]

# Categorical variables with NA in the train set.
CATEGORICAL_VARS_WITH_NA = [
    'emp_length',
    'title',
    'revol_util',
    'last_credit_pull_d',
]

# Variables to log transform.
# NOTE(review): no step of the pipeline visible in this notebook references
# this list -- confirm whether it is still needed.
NUMERICALS_LOG_VARS = [
    'loan_amnt',
    'installment',
    'annual_inc',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'total_acc',
    'fico_average',
]

# Categorical variables to encode.
CATEGORICAL_VARS = [
    'term',
    'grade',
    'home_ownership',
    'verification_status',
    'purpose',
    'addr_state',
    'earliest_cr_line',
    'emp_length',
    'title',
    'revol_util',
    'last_credit_pull_d',
]


In [3]:
def load_dataset(*, file_name: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read (keyword-only).

    Returns
    -------
    pd.DataFrame
        The parsed file contents, as returned by ``pd.read_csv``.
    """
    return pd.read_csv(f'{file_name}')

def save_pipeline(*, pipeline_to_persist) -> None:
    """Persist a pipeline object to disk with joblib.

    The object is written to ``<PIPELINE_SAVE_FILE>.pkl``; the path has no
    directory component, so it resolves against the current working
    directory.

    Parameters
    ----------
    pipeline_to_persist
        The object to serialise (keyword-only).
    """
    joblib.dump(pipeline_to_persist, f'{PIPELINE_SAVE_FILE}.pkl')

In [4]:
# Load the training CSV into a DataFrame and preview its first rows.
data = load_dataset(file_name=TRAINING_DATA_FILE)
data.head()

Unnamed: 0,loan_amnt,term,installment,grade,emp_length,home_ownership,annual_inc,verification_status,target,purpose,...,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,pub_rec_bankruptcies,fico_average
0,15000.0,36 months,483.16,B,9 years,MORTGAGE,67000.0,Not Verified,0,credit_card,...,Jun-1977,0.0,11.0,0.0,46525.0,64.1%,36.0,Sep-2016,0.0,757.0
1,6000.0,36 months,180.96,A,3 years,RENT,72000.0,Source Verified,1,wedding,...,Oct-2000,0.0,5.0,0.0,779.0,14.8%,15.0,Mar-2014,0.0,792.0
2,17500.0,60 months,384.79,B,7 years,MORTGAGE,90000.0,Source Verified,0,debt_consolidation,...,Sep-1997,0.0,9.0,0.0,16231.0,55.1%,36.0,Sep-2016,0.0,742.0
3,5400.0,36 months,188.01,D,1 year,OWN,18000.0,Not Verified,1,debt_consolidation,...,Mar-2005,0.0,4.0,0.0,3600.0,94.7%,4.0,Sep-2015,0.0,677.0
4,5000.0,36 months,176.58,D,7 years,RENT,40000.0,Source Verified,0,small_business,...,Jun-2006,0.0,3.0,0.0,1860.0,47.7%,7.0,Sep-2016,0.0,707.0


In [5]:
# Divide the loaded data into train and test partitions.
# Only the FEATURES columns are used as inputs; TARGET is the label column.
# test_size=0.1 holds out 10% of the rows for the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    data[FEATURES],
    data[TARGET],
    test_size=0.1,
    random_state=0)  # fixed seed so the split is reproducible across runs

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseError(Exception):
    """Base class for the custom exceptions defined in this notebook."""

class InvalidModelInputError(BaseError):
    """Raised by a transformer when the data it is given cannot be processed.

    In this notebook it is raised from ``LogTransformer.transform`` and
    ``CategoricalEncoder.transform``.
    """
    
class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply the natural logarithm to selected columns of a DataFrame.

    Parameters
    ----------
    variables : list or scalar, default=None
        Column name(s) to transform. A non-list value is wrapped in a
        single-element list.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn nothing; present to accommodate the sklearn pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns log-transformed.

        Raises
        ------
        InvalidModelInputError
            If any selected column holds a value that is not strictly
            positive.
        """
        X = X.copy()

        # The logarithm is only defined for strictly positive values.
        # Comparisons against NaN evaluate to False, so columns with missing
        # values are rejected by this check as well.
        is_positive = (X[self.variables] > 0).all()
        if not is_positive.all():
            # Select the offending column names from the boolean Series.
            # (Indexing the plain Python list ``self.variables`` with a
            # boolean Series raises TypeError and would mask this error.)
            vars_ = list(is_positive[~is_positive].index)
            raise InvalidModelInputError(
                f"Variables contain zero or negative values, "
                f"can't apply log for vars: {vars_}")

        for feature in self.variables:
            X[feature] = np.log(X[feature])

        return X

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Replace missing values in categorical columns with the label 'Missing'."""

    def __init__(self, variables=None) -> None:
        # Always keep a list so that transform can iterate over it.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X: pd.DataFrame, y: pd.Series = None
            ) -> 'CategoricalImputer':
        """No-op fit that exists to accommodate the sklearn pipeline."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with NaN in the selected columns filled."""
        imputed = X.copy()
        for column in self.variables:
            imputed[column] = imputed[column].fillna('Missing')
        return imputed

class NumericalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of numerical columns with the column mode.

    Parameters
    ----------
    variables : list or scalar, default=None
        Column name(s) to impute. A non-list value is wrapped in a
        single-element list.

    Attributes
    ----------
    imputer_dict_ : dict
        Maps each column name to the mode learned from the data passed to
        ``fit``.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn and persist the mode of every selected column."""
        self.imputer_dict_ = {}
        for feature in self.variables:
            # mode() can return several values; the first one is kept.
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaN replaced by the learned modes."""
        X = X.copy()
        for feature in self.variables:
            # Assign the filled column back rather than calling
            # fillna(inplace=True) on the column selection: the in-place
            # form mutates an intermediate object and leaves X unchanged
            # when pandas Copy-on-Write is active.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X

class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
    """Replace selected columns by their difference to a reference column.

    Parameters
    ----------
    variables : list or scalar, default=None
        Column name(s) to replace. A non-list value is wrapped in a
        single-element list.
    reference_variable : default=None
        Name of the column the selected columns are subtracted from.
    """

    def __init__(self, variables=None, reference_variable=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

        # Stored under the same name as the constructor parameter so that
        # sklearn's get_params()/clone()/repr can find it.
        self.reference_variable = reference_variable

    @property
    def reference_variables(self):
        """Backward-compatible alias of ``reference_variable``."""
        return self.reference_variable

    @reference_variables.setter
    def reference_variables(self, value):
        self.reference_variable = value

    def fit(self, X, y=None):
        """Learn nothing; this step is needed to fit the sklearn pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` where each selected column holds
        ``reference column - original column``."""
        X = X.copy()
        for feature in self.variables:
            X[feature] = X[self.reference_variable] - X[feature]

        return X

class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Group infrequent categories of selected columns under the label 'Rare'.

    Parameters
    ----------
    tol : float, default=0.05
        Minimum share of rows (count divided by the number of rows seen in
        ``fit``) a label needs in order to be kept as is.
    variables : list or scalar, default=None
        Column name(s) to encode. A non-list value is wrapped in a
        single-element list.

    Attributes
    ----------
    encoder_dict_ : dict
        Maps each column name to the list of labels considered frequent.
    """

    def __init__(self, tol=0.05, variables=None):
        self.tol = tol
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Learn and persist the frequent labels of every selected column."""
        self.encoder_dict_ = {}
        n_rows = len(X)

        for var in self.variables:
            # Share of rows per label. Plain true division replaces the
            # former np.float(...) cast; that alias no longer exists in
            # current NumPy releases.
            label_share = X[var].value_counts() / n_rows
            # frequent labels:
            self.encoder_dict_[var] = list(
                label_share[label_share >= self.tol].index)

        return self

    def transform(self, X):
        """Return a copy of ``X`` where labels not learned as frequent are
        replaced by 'Rare'."""
        X = X.copy()
        for feature in self.variables:
            X[feature] = np.where(X[feature].isin(
                self.encoder_dict_[feature]), X[feature], 'Rare')

        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode category labels as integers ordered by the mean of the target."""

    def __init__(self, variables=None):
        # Always keep a list so that fit/transform can iterate over it.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y):
        """Learn, per selected column, a label -> integer mapping.

        Labels are ranked by the mean of ``y`` within each label, in
        ascending order, and numbered from 0.
        """
        joined = pd.concat([X, y], axis=1)
        joined.columns = [*X.columns, 'target']

        # persist transforming dictionary
        self.encoder_dict_ = {}
        for var in self.variables:
            target_mean = joined.groupby([var])['target'].mean()
            ranked_labels = target_mean.sort_values(ascending=True).index
            self.encoder_dict_[var] = dict(
                zip(ranked_labels, range(len(ranked_labels))))

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns mapped to integers.

        Raises
        ------
        InvalidModelInputError
            If the mapping leaves NaN in any selected column.
        """
        encoded = X.copy()
        for feature in self.variables:
            encoded[feature] = encoded[feature].map(self.encoder_dict_[feature])

        # check if the mapping introduced NaN
        null_counts = encoded[self.variables].isnull().any()
        if null_counts.any():
            vars_ = {key: value for (key, value) in null_counts.items() if value is True}
            raise InvalidModelInputError(
                f'Categorical encoder has introduced NaN when '
                f'transforming categorical variables: {vars_.keys()}')

        return encoded

class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns from a DataFrame.

    Parameters
    ----------
    variables_to_drop : default=None
        Column label(s) passed to ``DataFrame.drop``.
    """

    def __init__(self, variables_to_drop=None):
        # Stored under the same name as the constructor parameter so that
        # sklearn's get_params()/clone()/repr can find it.
        self.variables_to_drop = variables_to_drop

    @property
    def variables(self):
        """Backward-compatible alias of ``variables_to_drop``."""
        return self.variables_to_drop

    @variables.setter
    def variables(self, value):
        self.variables_to_drop = value

    def fit(self, X, y=None):
        """Learn nothing; present so the transformer fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the configured columns."""
        # DataFrame.drop already returns a new object, so the input is left
        # untouched without an explicit copy.
        return X.drop(self.variables_to_drop, axis=1)

In [7]:
# Full modelling pipeline: impute -> group rare labels -> encode -> scale -> model.
club_pipe = Pipeline(
    [
        # Fill NaN in the listed categorical columns with the label 'Missing'.
        ('categorical_imputer',
            CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)),
        # Fill NaN in the listed numerical columns with the mode learned in fit.
        ('numerical_imputer',
            NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
        # Labels covering less than 1% of the rows are replaced by 'Rare'.
        ('rare_label_encoder',
            RareLabelCategoricalEncoder(
                tol=0.01,
                variables=CATEGORICAL_VARS)),
        # Map each label to an integer ranked by the mean target per label.
        ('categorical_encoder',
            CategoricalEncoder(variables=CATEGORICAL_VARS)),
        ('scaler', MinMaxScaler()),
        # Alternative final estimators that were tried (kept disabled):
#         ('Linear_model', LogisticRegression(solver='lbfgs', class_weight='balanced'))
#         ('Linear_model', LogisticRegression(penalty='l2', tol=0.01, solver='saga'))
#         ('Linear_model', LogisticRegression(solver='lbfgs'))
        # Final estimator; random_state=0 fixes the forest's randomness.
        ('model', RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0))
    ]
)

In [8]:
# Fit every pipeline step, ending with the model, on the training partition.
club_pipe.fit(X_train[FEATURES], y_train)

Pipeline(memory=None,
         steps=[('categorical_imputer',
                 CategoricalImputer(variables=['emp_length', 'title',
                                               'revol_util',
                                               'last_credit_pull_d'])),
                ('numerical_imputer',
                 NumericalImputer(variables=['pub_rec_bankruptcies'])),
                ('rare_label_encoder',
                 RareLabelCategoricalEncoder(tol=0.01,
                                             variables=['term', 'grade',
                                                        'home_ownership',
                                                        'verification_status',
                                                        'purpose', 'a...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                      

In [9]:
# Persist the fitted pipeline to '<PIPELINE_SAVE_FILE>.pkl' via joblib.
save_pipeline(pipeline_to_persist=club_pipe)