# Helper functions used in the Linear Classification project 


## Transform Data

In [None]:
def get_dummies(df,columns):
    """One-hot encode the listed ``columns`` of ``df``.

    Thin wrapper around ``pandas.get_dummies`` that always passes
    ``drop_first=True``, i.e. the first level of every encoded column is
    dropped from the result.

    Args:
        df: DataFrame to transform.
        columns: Names of the columns to replace by indicator columns.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    import pandas

    encoded = pandas.get_dummies(df, columns=columns, drop_first=True)
    return encoded

## Reweighting predicted results

In [None]:
def reweight(pi,q1,r1):
    """Apply the prior-shift correction factor to a class-1 probability.

    Each class probability is scaled by the ratio of its two priors
    (``q / r``) and the result is renormalised so both classes sum to one.

    Args:
        pi: Predicted probability of class 1 (scalar or array-like that
            supports arithmetic).
        q1: Class-1 share the prediction should be corrected towards
            (the binary evaluation helper in this file passes the share of
            positives in ``y``).
        r1: Class-1 share the model was calibrated on (the same helper
            passes 0.5).

    Returns:
        The corrected class-1 probability, same shape as ``pi``.
    """
    positive_term = pi * (q1 / r1)
    negative_term = (1 - pi) * ((1 - q1) / (1 - r1))
    return positive_term / (positive_term + negative_term)

In [5]:
def reweight_multi(pi,q,r=1/7):
    """Apply the prior-shift correction to multi-class probabilities.

    Every class probability is multiplied by ``q[k] / r`` and each row is
    then renormalised so that it sums to one.

    The computation is purely positional: row ``i`` / column ``k`` of ``pi``
    is combined with ``q[k]``.  The previous implementation mixed a
    positional counter with label-based ``.loc`` and label-aligned Series
    multiplication, which raised ``KeyError`` (or produced NaN) as soon as
    ``pi`` did not carry a default 0..n-1 index / column labels.

    Args:
        pi: 2-D array-like (DataFrame or ndarray) of predicted class
            probabilities, one row per sample and one column per class.
        q: Sequence with one target prior per class (same length as the
            number of columns of ``pi``).
        r: Prior the model was calibrated on; a single value shared by all
            classes (defaults to 1/7).

    Returns:
        ``numpy.ndarray`` with the same shape as ``pi`` holding the
        corrected, row-normalised probabilities.
    """
    import numpy as np

    probabilities = np.asarray(pi, dtype=float)
    prior_ratio = np.asarray(q, dtype=float) / r

    # Broadcast the per-class ratio over all rows, then renormalise per row.
    weighted = probabilities * prior_ratio
    return weighted / weighted.sum(axis=1, keepdims=True)

## Resampling and evaluating its effect on the classification results

In [2]:
def _make_sampler(sampling, sampling_strat, random_state):
    """Build the imbalanced-learn resampler selected by ``sampling``."""
    if sampling == "under":
        from imblearn.under_sampling import RandomUnderSampler
        return RandomUnderSampler(sampling_strategy=sampling_strat,
                                  random_state=random_state)
    if sampling == "tomek":
        from imblearn.under_sampling import TomekLinks
        # TomekLinks does not accept a random_state argument (passing one
        # raises TypeError), so it is deliberately left out here.
        return TomekLinks(sampling_strategy=sampling_strat)
    if sampling == "over":
        from imblearn.over_sampling import RandomOverSampler
        return RandomOverSampler(sampling_strategy=sampling_strat,
                                 random_state=random_state)
    if sampling == "smote":
        from imblearn.over_sampling import SMOTE
        return SMOTE(sampling_strategy=sampling_strat, random_state=random_state)
    if sampling == "smotetomek":
        from imblearn.combine import SMOTETomek
        return SMOTETomek(sampling_strategy=sampling_strat,
                          random_state=random_state)
    raise ValueError(
        "Unknown sampling method %r; expected one of "
        "'under', 'tomek', 'over', 'smote', 'smotetomek'." % (sampling,)
    )


def sampling_and_evaluate(X,y,sampling, sampling_strat):
    """Resample the data, fit a logistic regression and return its AUC.

    The data is resampled with the strategy named by ``sampling``, an
    L2-regularised logistic regression (``C=100``, liblinear) is fitted on
    the resampled data, and the ROC AUC of the class-1 probabilities is
    computed.

    NOTE: the AUC is measured on the same resampled data the model was
    fitted on, i.e. it is an in-sample score.

    Args:
        X: Feature matrix.
        y: Binary target.
        sampling: One of ``"under"``, ``"tomek"``, ``"over"``, ``"smote"``,
            ``"smotetomek"``.
        sampling_strat: Value forwarded as ``sampling_strategy`` to the
            chosen resampler.

    Returns:
        The ROC AUC score (float).

    Raises:
        ValueError: If ``sampling`` is not one of the supported names
            (previously the function silently returned ``None``).
    """
    RANDOM_STATE = 42

    # Validate the strategy name first so typos fail fast and clearly.
    sampler = _make_sampler(sampling, sampling_strat, RANDOM_STATE)

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X_res, y_res = sampler.fit_resample(X, y)

    lregr = LogisticRegression(penalty='l2', C=100.0,
                               fit_intercept=True,
                               intercept_scaling=1,
                               solver='liblinear', max_iter=500,
                               random_state=RANDOM_STATE)
    lregr.fit(X_res, y_res)

    # Column 1 of predict_proba is the probability of the positive class.
    y_hat = lregr.predict_proba(X_res)
    return roc_auc_score(y_true=y_res, y_score=y_hat[:, 1])



In [8]:
def test_evaluate_sampling_ratios(ratios,ratios_reversed, result_dictionary ):
    """Grid-search over/under-sampling ratios and return the best pair.

    For every combination of an over-sampling ratio (``ratios``) and an
    under-sampling ratio (``ratios_reversed``) the module-level data
    ``X_scaled`` / ``y_original`` is over-sampled, then under-sampled, a
    logistic regression (``C=1000``, liblinear) is fitted on the result and
    its in-sample ROC AUC is stored in ``result_dictionary``.

    Fixes over the previous version:
      * the ``return`` sat inside the inner loop, so only the first
        combination that did not fail was ever evaluated;
      * the winning ratios were recovered by regex-parsing the dictionary
        key with one decimal digit, which truncated e.g. 0.25 to 0.2 and
        could not represent 1.0 — the numeric values are now tracked
        directly.

    Args:
        ratios: Iterable of ``sampling_strategy`` values for
            ``RandomOverSampler``.
        ratios_reversed: Iterable of ``sampling_strategy`` values for
            ``RandomUnderSampler``.
        result_dictionary: Dict that is updated in place with one
            ``"OverSampler ratio: <r1>, UnderSampler ratio: <r2>"`` -> AUC
            entry per successfully evaluated combination.

    Returns:
        ``(overs_r, unders_r)`` as floats for the combination with the
        highest AUC (first one wins on ties), or ``None`` when every
        combination raised ``ValueError``.
    """
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    best_auc = None
    best_pair = None

    for r1 in ratios:
        for r2 in ratios_reversed:
            try:
                # over-sample first, then under-sample the result
                over = RandomOverSampler(sampling_strategy=r1)
                X_over, y_over = over.fit_resample(X_scaled, y_original)

                under = RandomUnderSampler(sampling_strategy=r2)
                X_over_under, y_over_under = under.fit_resample(X_over, y_over)

                # Logistic Regression setting based on previous finetuning
                # results (high C value)
                lregr = LogisticRegression(penalty='l2', C=1000.0,
                                           fit_intercept=True,
                                           intercept_scaling=1,
                                           solver='liblinear', max_iter=500)
                lregr.fit(X_over_under, y_over_under)

                y_hat_overunder = lregr.predict_proba(X_over_under)
                auc = roc_auc_score(y_true=y_over_under,
                                    y_score=y_hat_overunder[:, 1])
            except ValueError:
                # Combinations that cannot be evaluated are skipped on
                # purpose (presumably ratio pairs the samplers reject).
                continue

            dic_key = "OverSampler ratio: " + str(r1) + ", UnderSampler ratio: " + str(r2)
            result_dictionary[dic_key] = auc

            if best_auc is None or auc > best_auc:
                best_auc = auc
                best_pair = (float(r1), float(r2))

    return best_pair



In [6]:
def evaluate_best_sampling(overs_r, unders_r, model):
    """Evaluate a model on data resampled with the chosen ratios.

    The module-level data ``X_scaled`` / ``y_original`` is over-sampled with
    ratio ``overs_r`` and then under-sampled with ratio ``unders_r``.  The
    model is fitted on the resampled data; its ROC curve / AUC (via
    ``get_auc``) and classification report are shown for that same data.

    Previously the ``model`` argument was unconditionally overwritten by a
    fresh ``LogisticRegression``, so the caller's model was silently
    ignored.  It is now used when given; pass ``None`` to get the former
    default (``LogisticRegression(penalty='l2', C=1000.0,
    solver='liblinear', max_iter=500)``).

    Args:
        overs_r: ``sampling_strategy`` for ``RandomOverSampler`` (cast to
            float).
        unders_r: ``sampling_strategy`` for ``RandomUnderSampler`` (cast to
            float).
        model: Estimator exposing ``fit``, ``predict`` and
            ``predict_proba``, or ``None`` for the default model.

    Returns:
        None. Output is printed / plotted.
    """
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression

    # over-sample, then under-sample the over-sampled data
    over = RandomOverSampler(sampling_strategy=float(overs_r))
    X_over, y_over = over.fit_resample(X_scaled, y_original)

    under = RandomUnderSampler(sampling_strategy=float(unders_r))
    X_over_under, y_over_under = under.fit_resample(X_over, y_over)

    if model is None:
        model = LogisticRegression(penalty='l2', C=1000.0,
                                   solver='liblinear', max_iter=500)

    model.fit(X_over_under, y_over_under)
    y_hat = model.predict_proba(X_over_under)

    # ROC curve and AUC for the positive class (column 1 of predict_proba)
    get_auc(y_over_under, y_hat[:, 1], class_labels, column=1, plot=True)

    # Classification report
    print("Classification report of the model: \n",
          metrics.classification_report(y_over_under, model.predict(X_over_under)))

## Model finetuning (finding the best hyperparameters)

In [3]:
def finetune_model(X,y, model, class_weight, solvers, penalty, c_values):
    """Grid-search solver, penalty and C for ``model`` using ROC AUC.

    Runs ``GridSearchCV`` with a repeated stratified 10-fold split
    (3 repeats, ``random_state=1``) on ``(X, y)``, prints the best
    configuration followed by the mean/std test score of every candidate,
    and returns the best result.

    Args:
        X: Feature matrix.
        y: Target vector.
        model: Estimator to tune.
        class_weight: Accepted but not used by this function.
        solvers: Candidate values for the ``solver`` hyperparameter.
        penalty: Candidate values for the ``penalty`` hyperparameter.
        c_values: Candidate values for the ``C`` hyperparameter.

    Returns:
        Tuple ``(best_score, best_params)``.
    """
    param_grid = {"solver": solvers, "penalty": penalty, "C": c_values}
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        n_jobs=-1,
        cv=folds,
        scoring='roc_auc',
        error_score=0,  # failed parameter combinations score 0 instead of raising
    )
    fitted = search.fit(X, y)

    # report the winner first, then every candidate
    best = (fitted.best_score_, fitted.best_params_)
    print("Best: %f using %s" % best)

    outcome = fitted.cv_results_
    candidates = zip(outcome['mean_test_score'],
                     outcome['std_test_score'],
                     outcome['params'])
    for candidate in candidates:
        print("%f (%f) with: %r" % candidate)

    return best
    

## Predict and evaluate classification models

In [6]:
def predict_and_evaluate_binary(X,y, model, crossval="Yes", threshold=0.25, r1=0.5):
    """Fit a binary classifier and report its in-sample performance.

    Steps:
      1. fit ``model`` on ``(X, y)`` and predict class probabilities for
         the same data;
      2. correct the class-1 probabilities with ``reweight`` using
         ``q1 = share of positives in y`` and ``r1``;
      3. turn them into labels with ``threshold``;
      4. print / plot the confusion matrix, the ROC curve with AUC (via
         ``get_auc``) and the classification report;
      5. optionally plot the confusion matrix of a 100-fold
         cross-validated prediction.

    Relies on the module-level names ``class_labels``, ``sns``, ``plt``,
    ``reweight`` and ``get_auc``.

    Fix: the cross-validation heatmap used to plot the in-sample matrix
    (``df_cm``) again instead of the cross-validated one.

    Args:
        X: Feature matrix.
        y: Binary target; must support ``.sum()`` and ``len``.
        model: Estimator with ``fit`` / ``predict_proba``.
        crossval: ``"Yes"`` to add the cross-validated confusion matrix,
            ``"No"`` to skip it (any other value does nothing).
        threshold: Corrected probability at or above which class 1 is
            predicted (default 0.25, the previously hard-coded value).
        r1: Class-1 share passed to ``reweight`` as the training prior
            (default 0.5, the previously hard-coded value).

    Returns:
        None. Output is printed / plotted.
    """
    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    import pandas as pd

    def _show_confusion_heatmap(matrix):
        # Render one confusion matrix as an annotated heatmap.
        frame = pd.DataFrame(matrix, index=list(class_labels),
                             columns=list(class_labels))
        sns.set(font_scale=1)
        sns.heatmap(frame, annot=True, fmt='g', cmap='Blues')
        plt.xlabel("Predicted label")
        plt.ylabel("Real label")
        plt.show()

    model.fit(X,y)
    y_hat=model.predict_proba(X)

    # reweighting: q1 is the observed share of positives in y
    q1 = y.sum()/len(y)
    y_hat_corr=reweight(y_hat[:,1], q1,r1)

    ### Evaluate Model ###
    y_pred_new = [1 if pi >= threshold else 0 for pi in y_hat_corr]

    # Confusion Matrix
    print("Confusion Matrix \n")
    cm =  confusion_matrix(y_pred=y_pred_new, y_true=y, labels=[0,1])
    print (cm)
    _show_confusion_heatmap(cm)

    # ROC curve and AUC of the corrected probabilities
    get_auc(y, y_hat_corr, class_labels, column=1, plot=True)

    # Classification report
    print("Classification report of the model: \n",
      metrics.classification_report(y,y_pred_new))

    if crossval=="Yes":
        # cross validation
        print("Confusion matrix of in-sample cross-validation:")

        from sklearn.model_selection import cross_val_predict as cvp
        # NOTE: these are the model's own hard predictions; the reweighting
        # and custom threshold above are not applied here.
        y_hat_cv = cvp(model, X, y, cv=100)

        cm2 =  confusion_matrix(y_pred=y_hat_cv, y_true=y, labels=[0,1])
        _show_confusion_heatmap(cm2)

    elif crossval=="No":
        print("No cross-validation")


In [None]:
def predict_evaluate_multiclass(X,y,model):
    """Fit a multi-class model and print its accuracy and class report.

    The model is fitted on ``(X, y)``.  The mean accuracy over a repeated
    stratified 5-fold cross-validation (3 repeats, ``random_state=1``) is
    printed, followed by the classification report of the fitted model's
    predictions on ``X`` itself.

    Relies on the module-level names ``np`` and ``metrics``.

    Args:
        X: Feature matrix.
        y: Target vector.
        model: Estimator with ``fit`` / ``predict``.

    Returns:
        None. Output is printed.
    """
    model.fit(X, y)

    from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

    # cross-validated accuracy
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    fold_accuracies = cross_val_score(
        model, X, y, scoring='accuracy', cv=folds, n_jobs=-1
    )
    print("Mean accuracy result of in-sample cross-validation:",
          np.mean(fold_accuracies))

    # per-class precision / recall on the training data
    in_sample_predictions = model.predict(X)
    report = metrics.classification_report(y, in_sample_predictions)
    print("\n")
    print(report)

### Plots

In [None]:
def confusion_matrix_wo_crossval(X,y, y_corr, model, class_labels):
    """Plot the confusion matrix of ``y_corr`` against ``y`` as a heatmap.

    Relies on the module-level names ``sns`` and ``plt``.

    Args:
        X: Not used by this function.
        y: True labels.
        y_corr: Predicted labels to compare against ``y``.
        model: Not used by this function.
        class_labels: Labels that define the order of the matrix rows and
            columns and are shown on the heatmap axes.

    Returns:
        None. The heatmap is shown with ``plt.show()``.
    """
    import pandas as pd
    from sklearn.metrics import confusion_matrix

    axis_labels = list(class_labels)
    counts = confusion_matrix(y_true=y, y_pred=y_corr, labels=class_labels)
    heatmap_frame = pd.DataFrame(counts, index=axis_labels, columns=axis_labels)

    sns.set(font_scale=1)
    sns.heatmap(heatmap_frame, annot=True, fmt='g', cmap='Blues')
    plt.xlabel("Predicted label")
    plt.ylabel("Real label")
    plt.show()
    

In [4]:
def confusion_matrix_(X,y, y_corr, model, class_labels):
    """Plot the confusion matrix of ``y_corr`` and of a cross-validated run.

    Two heatmaps are shown:
      1. the confusion matrix of the supplied predictions ``y_corr``
         against ``y``;
      2. the confusion matrix of 100-fold cross-validated predictions of
         ``model`` on ``(X, y)``.

    Fix: the second heatmap used to plot the first matrix (``df_cm``)
    again instead of the cross-validated one.

    Relies on the module-level names ``sns`` and ``plt``.

    Args:
        X: Feature matrix used for the cross-validated predictions.
        y: True labels.
        y_corr: Predicted labels for the first heatmap.
        model: Estimator passed to ``cross_val_predict``.
        class_labels: Labels that define the order of the matrix rows and
            columns and are shown on the heatmap axes.

    Returns:
        None. The heatmaps are shown with ``plt.show()``.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import cross_val_predict as cvp
    import pandas as pd

    def _show_confusion_heatmap(matrix):
        # Render one confusion matrix as an annotated heatmap.
        frame = pd.DataFrame(matrix, index=list(class_labels),
                             columns=list(class_labels))
        sns.set(font_scale=1)
        sns.heatmap(frame, annot=True, fmt='g', cmap='Blues')
        plt.xlabel("Predicted label")
        plt.ylabel("Real label")
        plt.show()

    cm =  confusion_matrix(y_pred=y_corr, y_true=y, labels=class_labels)
    _show_confusion_heatmap(cm)

    # use in-sample cross-validation
    print("Confusion matrix of in-sample cross-validation:")

    y_hat_cv = cvp(model, X, y, cv=100)

    cm2 =  confusion_matrix(y_pred=y_hat_cv, y_true=y, labels=class_labels)
    _show_confusion_heatmap(cm2)
    

In [2]:
def feature_importance_plot(model):
    """Plot the 15 features with the largest absolute coefficients.

    The raw coefficients of the first row of ``model.coef_`` are used as
    importances and ranked by absolute value.  (No exponentiation is
    applied; ``exp(coefficient)`` would be needed to read them as odds
    ratios.)

    Fixes:
      * the ranking was sorted ascending before ``head(15)``, which
        selected the 15 *least* important features although the title
        announces the most important ones;
      * a separate empty ``plt.figure`` was created that the pandas plot
        never used; the intended figure size is now passed to the plot
        itself.

    Relies on the module-level names ``feature_names``, ``pd`` and ``plt``.

    Args:
        model: Fitted linear model exposing ``coef_``.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    feature_importance = pd.DataFrame(feature_names, columns = ["feature"])
    feature_importance["importance"] = model.coef_[0]
    feature_importance["importance_abs_value"] = feature_importance["importance"].abs()

    top_features = feature_importance.sort_values(
        by="importance_abs_value", ascending=False
    ).head(15)

    # barh draws the first row at the bottom, so reverse the order to get
    # the most important feature at the top of the chart.
    top_features.iloc[::-1].plot.barh(
        x='feature', y='importance', figsize=(20, 25),
        title="Top 15 most important variables according to their LogReg coefficients ")
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
def get_auc(y, y_pred_probabilities, class_labels, column =1, plot = True):
    """Print the ROC AUC of the predictions and optionally plot the curve.

    The class equal to ``column`` is treated as the positive class for both
    the AUC and the ROC curve.

    Fixes:
      * ``plot`` was ignored (the curve was always drawn);
      * the AUC was computed on the raw ``y`` while the curve used
        ``y == column``, so the two could disagree when ``column`` was not
        1 — both now use the same binarised target;
      * the AUC is now returned as well as printed.

    Relies on the module-level name ``plt``.

    Args:
        y: True labels; must support element-wise ``==`` (e.g. ndarray or
            Series).
        y_pred_probabilities: Scores / probabilities for the positive
            class.
        class_labels: Not used by this function.
        column: Label value regarded as the positive class (default 1).
        plot: When true (default) the ROC curve is drawn.

    Returns:
        The ROC AUC score (float).
    """
    is_positive = y == column

    roc_auc = roc_auc_score(y_true=is_positive, y_score=y_pred_probabilities)
    print ("AUC: ", roc_auc)

    if plot:
        fpr, tpr, _ = roc_curve(is_positive, y_pred_probabilities,
                                drop_intermediate = False)
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        # diagonal = performance of a random classifier
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()

    return roc_auc