In [3]:
class InterpretableRegressor:
    """Grid-searched logistic regression that only sees ``PAY_1`` .. ``PAY_6``.

    Despite the name, the wrapped estimator is a scikit-learn
    ``LogisticRegression`` classifier.  ``fit`` holds out part of the data
    with ``train_test_split`` and tunes ``penalty`` and ``C`` with
    ``GridSearchCV`` (scored on accuracy); the remaining methods report
    accuracy / ROC metrics on the stored splits or on caller-supplied data.

    Attributes set by ``__init__``:
        lr: the un-fitted ``LogisticRegression`` (``liblinear`` solver).
        random_state: seed forwarded to ``train_test_split``.

    Attributes set by ``fit``:
        X_train, X_test, y_train, y_test: the hold-out split.
        grid: the fitted ``GridSearchCV`` object.
    """

    # The only columns of X that are ever passed to the model.
    FEATURE_COLUMNS = [f"PAY_{n}" for n in range(1, 7)]

    def __init__(self, random_state1=None, random_state2=None):
        """Create the estimator.

        Args:
            random_state1: ``random_state`` of the logistic regression.
            random_state2: ``random_state`` of the train/test split in ``fit``.
        """
        from sklearn.linear_model import LogisticRegression

        self.lr = LogisticRegression(solver="liblinear", random_state=random_state1)
        self.random_state = random_state2

    @classmethod
    def _select_features(cls, X):
        """Return a new frame holding only the ``FEATURE_COLUMNS`` of ``X``."""
        # Selecting with a list of labels yields a new DataFrame, so the
        # caller's frame is never modified and no explicit copy is needed.
        return X[list(cls.FEATURE_COLUMNS)]

    def fit(self, X, y):
        """Split the data, grid-search the model and return ``cv_results_``.

        Args:
            X: DataFrame containing at least the ``PAY_1`` .. ``PAY_6`` columns.
            y: target labels aligned with ``X``.

        Returns:
            The ``cv_results_`` dictionary of the fitted ``GridSearchCV``.
        """
        from sklearn.model_selection import GridSearchCV, train_test_split

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self._select_features(X), y, random_state=self.random_state
        )

        self.grid = GridSearchCV(
            estimator=self.lr,
            param_grid={
                "penalty": ["l1", "l2"],
                "C": [.001, .01, .1, 1, 10, 100, 1000],
            },
            scoring="accuracy",
        )
        self.grid.fit(self.X_train, self.y_train)

        return self.grid.cv_results_

    def get_scores(self):
        """Return accuracy and ROC AUC on the stored train and test splits."""
        from sklearn.metrics import roc_auc_score

        # ``grid.score`` uses the search's own scorer, i.e. accuracy here.
        train_acc = self.grid.score(self.X_train, self.y_train)
        train_roc = roc_auc_score(self.y_train, self.grid.predict_proba(self.X_train)[:, 1])
        test_acc = self.grid.score(self.X_test, self.y_test)
        test_roc = roc_auc_score(self.y_test, self.grid.predict_proba(self.X_test)[:, 1])

        return {"train roc auc": train_roc, "train accuracy": train_acc,
                "test roc auc": test_roc, "test accuracy": test_acc}

    def get_roc(self):
        """Return ``roc_curve`` output for the stored test split."""
        from sklearn.metrics import roc_curve

        probs = self.grid.predict_proba(self.X_test)[:, 1]
        return roc_curve(self.y_test, probs)

    def score(self, X, y):
        """Return accuracy and ROC AUC of the fitted model on ``X`` / ``y``.

        ``X`` may contain extra columns; only ``PAY_1`` .. ``PAY_6`` are used.
        """
        from sklearn.metrics import roc_auc_score

        features = self._select_features(X)

        return {"accuracy": self.grid.score(features, y),
                "roc_auc": roc_auc_score(y, self.grid.predict_proba(features)[:, 1])}

    def calculate_roc(self, X, y):
        """Return ``roc_curve`` output of the fitted model on ``X`` / ``y``.

        ``X`` may contain extra columns; only ``PAY_1`` .. ``PAY_6`` are used.
        """
        # Imported locally like in every other method; without it this method
        # depended on ``roc_curve`` happening to exist at module level.
        from sklearn.metrics import roc_curve

        probs = self.grid.predict_proba(self._select_features(X))[:, 1]

        return roc_curve(y, probs)

In [4]:
def data_processing(df):
    """Turn the raw credit-card frame into model features and a target.

    The input frame is not modified.  Steps, in order:

    * drop ``ID`` and rename ``LIMIT_BAL``/``SEX``/``AGE``/``PAY_0`` and the
      target column (``PAY_0`` becomes ``PAY_1`` so the months run 1..6);
    * shift ``sex`` down by one;
    * replace ``EDUCATION`` and ``MARRIAGE`` by one indicator column per code;
    * split the ``PAY_n`` codes -2 (no balance) and -1 (balance paid) into
      their own indicator columns and zero them in ``PAY_n``;
    * add ``gap_n`` (bill minus payment), ``proportion_gap_n`` (gap divided
      by the card limit) and ``total_proportions`` (sum of the proportions).

    Args:
        df: DataFrame with the columns ``ID``, ``LIMIT_BAL``, ``SEX``,
            ``EDUCATION``, ``MARRIAGE``, ``AGE``, ``PAY_0``, ``PAY_2`` ..
            ``PAY_6``, ``BILL_AMT1`` .. ``BILL_AMT6``, ``PAY_AMT1`` ..
            ``PAY_AMT6`` and ``default.payment.next.month``.

    Returns:
        ``(X, y)``: the feature frame (fixed column order, without the
        target) and the ``defaulted`` Series.
    """
    import pandas as pd

    months = range(1, 7)

    # drop/rename return new frames, so the caller's frame stays untouched.
    df1 = df.drop(columns="ID").rename(columns={
        "LIMIT_BAL": "card_limit", "SEX": "sex", "AGE": "age", "PAY_0": "PAY_1",
        "default.payment.next.month": "defaulted"})

    df1["sex"] = df1["sex"] - 1

    # Code -> indicator column name for the two categorical fields.
    categorical_codes = {
        "EDUCATION": {0: "edu_other_1", 1: "postgraduate", 2: "undergraduate",
                      3: "high_school", 4: "edu_other_2", 5: "edu_other_3",
                      6: "edu_other_4"},
        "MARRIAGE": {0: "other_marital", 1: "married", 2: "single", 3: "divorced"},
    }
    indicator_frames = []
    for source, names in categorical_codes.items():
        # A fixed category list makes get_dummies emit a column for every
        # known code even when the data at hand does not contain it; without
        # this the column reordering below fails on such data.  Codes outside
        # the list get no column (they were discarded by the reordering before).
        as_category = df1[source].astype(pd.CategoricalDtype(list(names)))
        dummies = pd.get_dummies(as_category)
        dummies.columns = list(names.values())
        indicator_frames.append(dummies)
    # Column-wise concat keeps one output row per input row.
    df1 = pd.concat([df1.drop(columns=list(categorical_codes))] + indicator_frames,
                    axis=1)

    # Separate out values of -1 (balance paid) and -2 (no balance) from
    # months late in the PAY_{n} values.
    for n in months:
        status = df1[f"PAY_{n}"]
        df1[f"no_balance_{n}"] = (status == -2).astype(int)
        df1[f"paid_balance_{n}"] = (status == -1).astype(int)
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the in-place form acts on a temporary object
        # and does not update df1 under pandas Copy-on-Write.
        df1[f"PAY_{n}"] = status.replace({-2: 0, -1: 0})
    pay_cols = ([f"PAY_{n}" for n in months]
                + [f"no_balance_{n}" for n in months]
                + [f"paid_balance_{n}" for n in months])

    # Gap between bill and payment for each month, and that gap as a
    # proportion of the credit limit.
    for n in months:
        df1[f"gap_{n}"] = df1[f"BILL_AMT{n}"] - df1[f"PAY_AMT{n}"]
        df1[f"proportion_gap_{n}"] = df1[f"gap_{n}"] / df1["card_limit"]
    proportion_cols = [f"proportion_gap_{n}" for n in months]
    gap_cols = [f"gap_{n}" for n in months] + proportion_cols

    df1["total_proportions"] = df1[proportion_cols].sum(axis=1)

    # Reorder the columns for legibility.
    ordered = (
        ["card_limit", "sex", "age", "edu_other_1", "edu_other_2", "edu_other_3",
         "edu_other_4", "postgraduate", "undergraduate", "high_school",
         "other_marital", "married", "single", "divorced"]
        + pay_cols
        + [f"BILL_AMT{n}" for n in months]
        + [f"PAY_AMT{n}" for n in months]
        + gap_cols
        + ["total_proportions", "defaulted"]
    )
    df1 = df1[ordered]

    X = df1.drop(columns="defaulted")
    y = df1["defaulted"]

    return (X, y)

In [5]:
class Heavily_Processed_Regressor:
    """Grid-searched logistic regression behind a transform/scale/PCA pipeline.

    Every frame shown to the model goes through the same three steps: a
    ``PowerTransformer`` on the columns that do not have exactly two distinct
    values, a ``StandardScaler`` on all columns, and a ``PCA`` created with
    ``n_components=.98``.  ``fit`` holds out part of the data, fits the
    pipeline on the training part and tunes ``penalty``, ``C`` and
    ``class_weight`` with ``GridSearchCV`` (scored on ROC AUC).

    Attributes set by ``__init__``:
        lr, pca, scaler, transformer: the un-fitted scikit-learn objects.
        random_state: seed forwarded to ``train_test_split``.

    Attributes set by ``fit``:
        numeric2: labels of the columns sent through the power transformer.
        X_train, X_test: the *preprocessed* splits (PCA output arrays).
        y_train, y_test: the matching targets.
        grid: the fitted ``GridSearchCV`` object.
    """

    def __init__(self, random_state1=None, random_state2=None):
        """Create the un-fitted pipeline pieces.

        Args:
            random_state1: ``random_state`` of the logistic regression.
            random_state2: ``random_state`` of the train/test split in ``fit``.
        """
        from sklearn.decomposition import PCA
        from sklearn.linear_model import LogisticRegression
        from sklearn.preprocessing import PowerTransformer, StandardScaler

        self.lr = LogisticRegression(solver="liblinear", random_state=random_state1)
        self.pca = PCA(.98)
        self.scaler = StandardScaler()
        self.transformer = PowerTransformer()
        self.random_state = random_state2

    def _preprocess(self, X, fit=False):
        """Run ``X`` through transformer -> scaler -> PCA and return the result.

        Args:
            X: DataFrame with the same columns that were given to ``fit``.
            fit: when true, fit each step on ``X`` before transforming
                (training data); otherwise reuse the already fitted steps.
        """
        # Work on a copy so neither the caller's frame nor a frame handed
        # out by train_test_split is written to.
        Xobj = X.copy()
        if fit:
            Xobj[self.numeric2] = self.transformer.fit_transform(Xobj[self.numeric2])
            return self.pca.fit_transform(self.scaler.fit_transform(Xobj))
        Xobj[self.numeric2] = self.transformer.transform(Xobj[self.numeric2])
        return self.pca.transform(self.scaler.transform(Xobj))

    def fit(self, X, y):
        """Split, preprocess, grid-search and return ``cv_results_``.

        Args:
            X: feature DataFrame.
            y: target labels aligned with ``X``.

        Returns:
            The ``cv_results_`` dictionary of the fitted ``GridSearchCV``.
        """
        from sklearn.model_selection import GridSearchCV, train_test_split

        X_train, X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=self.random_state
        )

        # Columns with exactly two distinct values (counted over all of X)
        # are left out of the power transform.
        distinct = X.nunique()
        self.numeric2 = distinct[distinct != 2].index

        self.X_train = self._preprocess(X_train, fit=True)
        self.X_test = self._preprocess(X_test)

        self.grid = GridSearchCV(
            estimator=self.lr,
            param_grid={
                "penalty": ["l1", "l2"],
                "C": [.001, .01, .1, 1, 10, 100, 1000],
                "class_weight": [None, "balanced"],
            },
            scoring="roc_auc",
        )
        self.grid.fit(self.X_train, self.y_train)

        return self.grid.cv_results_

    def get_scores(self):
        """Return ROC AUC and accuracy on the stored train and test splits."""
        import numpy as np

        # ``grid.score`` uses the search's own scorer, i.e. ROC AUC here;
        # accuracy is computed by hand from the predictions.
        train_score = self.grid.score(self.X_train, self.y_train)
        train_acc = np.mean(self.grid.predict(self.X_train) == self.y_train)
        test_score = self.grid.score(self.X_test, self.y_test)
        test_acc = np.mean(self.grid.predict(self.X_test) == self.y_test)

        return {"train roc auc": train_score, "train accuracy": train_acc,
                "test roc auc": test_score, "test accuracy": test_acc}

    def get_roc(self):
        """Return ``roc_curve`` output for the stored test split."""
        from sklearn.metrics import roc_curve

        probs = self.grid.predict_proba(self.X_test)[:, 1]
        return roc_curve(self.y_test, probs)

    def score(self, X, y):
        """Return ROC AUC and accuracy of the fitted model on raw ``X`` / ``y``.

        ``X`` is preprocessed with the already fitted pipeline and is not
        modified.
        """
        import numpy as np

        Xobj = self._preprocess(X)

        return {"roc auc": self.grid.score(Xobj, y),
                "accuracy": np.mean(self.grid.predict(Xobj) == y)}

    def calculate_roc(self, X, y):
        """Return ``roc_curve`` output of the fitted model on raw ``X`` / ``y``.

        ``X`` is preprocessed with the already fitted pipeline and is not
        modified.
        """
        from sklearn.metrics import roc_curve

        probs = self.grid.predict_proba(self._preprocess(X))[:, 1]

        return roc_curve(y, probs)
    