In [1]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.model_selection import train_test_split

import math
import seaborn as sns
import numpy as np
import pandas as pd
import csv

import warnings
# Suppress every warning raised after this point.
# NOTE(review): this also hides deprecation warnings emitted by pandas,
# seaborn and statsmodels, so API breakage will only surface as hard errors.
warnings.filterwarnings('ignore')

# Truncate displayed DataFrames to 10 rows, but never truncate columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

In [2]:
# raw_data = pd.read_csv('data/data_cleaned.csv')
# raw_data.reset_index(drop=False, inplace=True)
# data_df = raw_data
# data_df['Salary_sqrt'] = data_df['Salary'] ** (1/2)
# data_df.head()

In [3]:
# train_df, test_df = train_test_split(
#      data_df, test_size=0.2, random_state=0, shuffle=True)

# Fit Model

In [10]:
def fit_ols(predictors, response, data):
    """Fit an ordinary least squares model with an intercept.

    Parameters
    ----------
    predictors : column label or list of column labels
        Used to select the regressors via ``data[predictors]``.
    response : column label
        Used to select the dependent variable via ``data[response]``.
    data : indexable container of columns
        Presumably a pandas DataFrame -- confirm against callers.

    Returns
    -------
    The results object produced by ``sm.OLS(...).fit()``.
    """
    # add_constant supplies the intercept column for the design matrix.
    design_matrix = sm.add_constant(data[predictors])
    return sm.OLS(data[response], design_matrix).fit()

# Normality Check

In [7]:
def normality_check(var, data):
    """Show a histogram and a Q-Q plot for one column, then print its shape statistics.

    Parameters
    ----------
    var : str
        Column to inspect; also used for the x-axis label and the plot title.
    data : indexable container of columns
        Source of the column, read as ``data[var]``. Presumably a pandas
        DataFrame -- confirm against callers.

    The left panel is a 12-bin histogram with a KDE overlay; the right panel is
    a probability plot with a standardized reference line. After the figure is
    shown, the column's ``skew()`` and ``kurt()`` values are printed.
    """
    series = data[var]

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(10,5))
    fig.tight_layout(pad=6)

    # Left panel: distribution of the raw values.
    sns.histplot(data=series, ax=hist_ax, bins=12, kde=True)
    hist_ax.set(xlabel=var, ylabel='Frequency', title=var+' Distribution')
    # Plain tick labels avoid scientific notation on either axis.
    hist_ax.ticklabel_format(style='plain', axis='both')
    hist_ax.tick_params(labelrotation=45)

    # Right panel: sample quantiles against theoretical quantiles.
    prob_plot = sm.ProbPlot(data=series)
    prob_plot.qqplot(line='s', ax=qq_ax)
    qq_ax.set_title('Probability Plot')

    plt.show()
    print(f"Skewness: {series.skew()}")
    print(f"Kurtosis: {series.kurt()}")

# Hybrid Stepwise

In [5]:
def stepwise(predictors, response, alpha, data):
    """Select OLS predictors with a hybrid (forward + backward) stepwise search.

    Each round fits one OLS model per remaining candidate (the currently
    selected variables plus that candidate) and records the candidate's
    p-value. If the smallest such p-value is ``<= alpha`` that candidate
    enters the model. The enlarged model is then refit, and every previously
    selected variable whose p-value is now ``> alpha`` is dropped and returned
    to the candidate pool. The search ends when no candidate reaches ``alpha``
    or when a selection repeats (which would otherwise loop forever).

    Parameters
    ----------
    predictors : list
        Candidate column labels in ``data``. The list is copied, so the
        caller's list is not modified.
    response : column label
        Dependent variable, read as ``data[response]``.
    alpha : float
        Threshold used both for entry (``p <= alpha``) and removal
        (``p > alpha``).
    data : indexable container of columns
        Presumably a pandas DataFrame -- confirm against callers.

    Returns
    -------
    list
        The selected variables, in the order they entered the model.

    Notes
    -----
    For backward compatibility the selection is also published through the
    module-level global ``var_selected``, which is reset on every call.
    Progress is printed every round.
    """
    global var_selected
    var_selected = []
    selected = var_selected          # alias: updates stay visible through the global
    candidates = list(predictors)    # work on a copy; never mutate the caller's list

    def fit_pvalues(columns):
        """Return the p-value Series of an OLS fit of ``response`` on ``columns``."""
        design = sm.add_constant(data[columns])
        return sm.OLS(data[response], design).fit().pvalues

    # A repeated selection means the deterministic search has entered a cycle.
    seen_selections = {frozenset(selected)}

    count = 1
    while True:
        print(f"Round: {count}")
        print(f"Predictors: {candidates}")

        # Forward step: p-value of each candidate when added to the current model.
        # Label-based lookup avoids relying on the position of the constant.
        pvalues_new = {
            name: fit_pvalues(selected + [name]).loc[name] for name in candidates
        }
        print(f"pvalues_new: {pvalues_new}")

        # NaN p-values fail the comparison and are therefore never eligible.
        eligible = {name: p for name, p in pvalues_new.items() if p <= alpha}
        if not eligible:
            break

        var_min = min(eligible, key=eligible.get)
        print(f"var min: {var_min}")

        # Backward step: refit with the new variable, then drop anything that
        # lost significance. The drop list is built first so that the selection
        # is not modified while it is being scanned.
        refit = fit_pvalues(selected + [var_min])
        var_removed = [name for name in selected if refit.loc[name] > alpha]
        for name in var_removed:
            selected.remove(name)
            candidates.append(name)

        print(f"Var kept: {selected}")
        selected.append(var_min)
        candidates.remove(var_min)
        print(f"var removed: {var_removed}")
        print(f"var_selected: {selected}")
        print("-------------------------------------------------------------------------------------")
        count += 1

        state = frozenset(selected)
        if state in seen_selections:
            print("Stepwise stopped: this selection was already visited, so further rounds would cycle.")
            break
        seen_selections.add(state)

    print(f"Stepwise completed. Vars selected: {selected}")
    return list(selected)

# Individual predictor diagnostics

In [6]:
def var_diagnose(var_name, y_name, data):
    """Fit a one-predictor OLS model and plot its diagnostics.

    Parameters
    ----------
    var_name : str
        Predictor column, read as ``data[var_name]``.
    y_name : str
        Response column, read as ``data[y_name]``.
    data : indexable container of columns
        Presumably a pandas DataFrame -- confirm against callers.

    Draws three panels side by side: a regression plot of the response
    against the predictor, a residuals-vs-fitted plot with a lowess trend
    line, and a histogram of the model residuals. After the figure is shown,
    the residuals' ``skew()`` and ``kurt()`` values are printed.
    """
    X = sm.add_constant(data[var_name])
    y = data[y_name]
    results = sm.OLS(y, X).fit()

    # Predictor (plus constant) and response side by side for the seaborn calls.
    dataframe = pd.concat([X, y], axis=1)

    model_fitted_y = results.fittedvalues
    model_residuals = results.resid

    fig2, ax2 = plt.subplots(1, 3, figsize=(18,6))
    fig2.tight_layout(pad=2)

    # Regression plot.
    sns.regplot(x=var_name, y=y_name, data=dataframe, ax=ax2[0],
                scatter_kws={'alpha': 0.5},
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    ax2[0].set(xlabel=var_name, ylabel=y_name, title='Regression Plot')

    # Residuals vs. fitted values. x and y must be passed by keyword: in
    # current seaborn the only positional parameter of residplot is `data`.
    sns.residplot(x=model_fitted_y, y=y_name, data=dataframe, ax=ax2[1],
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
    ax2[1].set(xlabel='Fitted Values', ylabel='Residuals', title='Residual vs. Fits')

    # Residual histogram.
    sns.histplot(data=model_residuals, ax=ax2[2], bins=10, kde=True)
    ax2[2].set(xlabel='Residuals', ylabel='Frequency', title='Residual Distribution')
    plt.show()

    print(f"Skewness: {model_residuals.skew()}")
    print(f"Kurtosis: {model_residuals.kurt()}")