<a href="https://colab.research.google.com/github/chunghv/TTS-VP/blob/master/Residual_Score_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
train_woe = pd.read_csv('/content/drive/MyDrive/chung.csv')
# train = pd.read_csv('/content/drive/MyDrive/REMISSION.csv')
# train = pd.read_csv('/content/drive/MyDrive/TT.csv')
train_woe.drop(['Unnamed: 0'], axis = 1, inplace = True)

**Các công thức tính Gradient và Hessian của Log Likelihood em tham khảo tại** http://gauss.stat.su.se/phd/oasi/OASII2021_gradients_Hessians.pdf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2
from sklearn.linear_model import LogisticRegression
def pxi(xi, beta):
    sigmoid = np.exp(xi@beta)/(1 + np.exp(xi@beta))
    return sigmoid

def log_likelihood(X, y, beta):
    loglikelihood = 0
    N = len(y)
    for i in range(N):
        lossi = np.log(1 + np.exp(X[i]@beta)) - (X[i]@beta)*y[i]
        loglikelihood = loglikelihood + lossi
    return loglikelihood

def gradient_log_likelihood(X, y, beta):
    N = len(y)
    gradient = np.zeros(X.shape[1])
    for i in range(N):
        g_hat_i = (1/ (1 + np.exp(-X[i]@beta))  - y[i]) * X[i]
        gradient = gradient + g_hat_i
    return -gradient

def hessian_log_likelihood(X, y, beta):
    N = len(y)
    hessian = np.zeros((X.shape[1],X.shape[1]))
    for i in range(N):
        temp = np.outer(X[i], X[i])
        temp2 = pxi(X[i], beta)
        temp2 = temp2 * (1 - temp2)
        hessian = hessian + temp2*temp
    return -hessian

def convert(X, target):
    X_train = np.array(X.drop([target], axis = 1))
    y = np.array(X[target])
    X_train_extend = []
    for i in X_train:
        X_train_extend.append( np.insert(i, 0, 1))
    X_train_extend = np.array(X_train_extend)
    return X_train_extend, y

def forward(X, selected_features, remaining_features, target):
    y_train = X[target]
    score_dict = {}
    for feature in remaining_features:
        X_train = X[selected_features + [feature]]
        model = LogisticRegression(random_state=42, max_iter=1000, penalty = None, solver = 'newton-cholesky')
        model.fit(X_train,y_train)
        beta = np.append(model.intercept_, model.coef_)

        X_n, y_n = convert(X[selected_features + [feature] + [target]], target)
        loglike = log_likelihood(X_n, y_n, beta)
        beta[-1] = 0
        gradient_log = gradient_log_likelihood(X_n, y_n, beta)
        hessian_log = hessian_log_likelihood(X_n, y_n, beta)
        score_chi_square = gradient_log@np.linalg.inv(-hessian_log)@(gradient_log.T)
        p_value = 1 - chi2.cdf(score_chi_square, 1)
        score_dict[feature] = [score_chi_square, p_value]
        Score = pd.DataFrame(score_dict)
        Score.index = ['Score Chi-Square', 'p-value']
        Score = Score.transpose()
    return Score


## Bước 1: Chọn biến đầu tiên vào mô hình

In [None]:
remaining_list = ['grade', 'total_rec_late_fee', 'tot_hi_cred_lim', 'num_actv_rev_tl', 'loan_amnt', 'verification_status', 'total_rev_hi_lim', 'term', 'mort_acc', 'inq_last_6mths', 'annual_inc', 'revol_util', 'open_rv_24m', 'C_num_tot', 'C_tot_dti']

In [None]:
a = forward(train_woe, [], remaining_list, 'GOOD')
a

Unnamed: 0,Score Chi-Square,p-value
grade,18985.976359,0.0
total_rec_late_fee,17083.88185,0.0
tot_hi_cred_lim,3572.193272,0.0
num_actv_rev_tl,2096.3947,0.0
loan_amnt,1743.57215,0.0
verification_status,3837.460005,0.0
total_rev_hi_lim,1691.307151,0.0
term,4355.843628,0.0
mort_acc,3370.218331,0.0
inq_last_6mths,1546.615977,0.0


## Bước 2: Chọn biến tiếp theo

In [None]:
b = forward(train_woe, remaining_list[0:1], remaining_list[1:], 'GOOD')
b

Unnamed: 0,Score Chi-Square,p-value
total_rec_late_fee,13239.349452,0.0
tot_hi_cred_lim,2007.621604,0.0
num_actv_rev_tl,1039.29303,0.0
loan_amnt,484.134304,0.0
verification_status,785.923316,0.0
total_rev_hi_lim,300.660482,0.0
term,230.394106,0.0
mort_acc,1991.14808,0.0
inq_last_6mths,212.526562,0.0
annual_inc,733.331719,0.0


## Bước 3: Chọn biến tiếp theo

In [None]:
c = forward(train_woe, remaining_list[0:2], remaining_list[2:], 'GOOD')
c

Unnamed: 0,Score Chi-Square,p-value
tot_hi_cred_lim,2074.25149,0.0
num_actv_rev_tl,1154.926473,0.0
loan_amnt,447.902174,0.0
verification_status,641.676514,0.0
total_rev_hi_lim,227.831652,0.0
term,298.619043,0.0
mort_acc,1987.227617,0.0
inq_last_6mths,207.18475,0.0
annual_inc,809.655791,0.0
revol_util,367.085641,0.0
