## Load Data

In [9]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import inspect
import lime
import lime.lime_tabular


In [10]:
df = pd.read_csv('dataset/heloc.csv')

In [11]:
# 특징과 라벨 분리 (헬로크 데이터에서 y값을 'RiskPerformance'로 가정)
X = df.drop(columns=['RiskPerformance'])  # 입력 변수
y = df['RiskPerformance'].apply(lambda x: 1 if x == 'Bad' else 0)  # 'Bad'를 1로, 'Good'을 0으로 변환

# 학습/테스트 데이터 분리
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost용 데이터 형식 변환
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

# # 기본 하이퍼파라미터 설정
# param = {'silent':True, 'objective':'binary:logistic', "eta":0.05, 'eval_metric': 'rmse',
#          'monotone_constraints':"(1,1,1,1,-1,-1,1,0,0,-1,-1,-1,0,-1,0,1,1)"}

# # Cross-validation으로 적절한 boosting round 찾기
# bst_cv = xgb.cv(param, dtrain, 500, nfold=10, early_stopping_rounds=10)

# # 모델 훈련
# evals_result = {}
# evallist  = [(dtrain, 'train'), (dtest, 'eval')]
# bst = xgb.train(param, dtrain, num_boost_round=bst_cv.shape[0], evals_result=evals_result, evals=evallist, verbose_eval=False)

## Load Model

In [12]:
bst = xgb.Booster()
bst.load_model('model/xgb.model')

## Predict

In [13]:
# 모델 평가
y_train_pred = bst.predict(dtrain)
y_test_pred = bst.predict(dtest)

# predictions1 = [round(value) for value in y_train_pred]
# predictions2 = [round(value) for value in y_test_pred]

# print('Train accuracy:', accuracy_score(y_train, predictions1))
# print('Test accuracy:', accuracy_score(y_test, predictions2))

# AUC Score 계산
# auc_train = roc_auc_score(y_train, y_train_pred)  # 확률 기반으로 AUC 계산
# auc_test = roc_auc_score(y_test, y_test_pred)
# print('AUC_Train:', auc_train.round(4))
# print('AUC_Test:', auc_test.round(4))

# print(y_test_pred[0])

## SHAP, LIME

In [14]:
lime_explainer = lime.lime_tabular.LimeTabularExplainer(x_train.values,
                                                        feature_names=list(x_train.columns),
                                                        class_names=['Good', 'Bad'],
                                                        discretize_continuous=True)
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(x_test)
# XGBoost의 예측을 확률로 반환하도록 하는 래핑된 함수
def predict_fn_for_lime(input_data):
    dmatrix_data = xgb.DMatrix(input_data, feature_names=list(x_train.columns))
    # 확률 값으로 반환하여 LIME에서 사용 가능하게 함
    probas = bst.predict(dmatrix_data)  # 확률 값 반환
    return np.column_stack([1 - probas, probas])  # LIME은 클래스별 확률을 받으므로 [1-p, p] 형식으로 반환

## Multiple inferences

In [15]:
prompt = inspect.cleandoc('''
Question:
The following is the result of binary classification using the HELOC (Home Equity Line of Credit) Dataset and XGBClassifier to classify RiskPerformance into “Good” and “Bad.”
The value “Bad” indicates that a consumer was 90 days past due or worse at least once over a period of 24 months from when the credit account was opened. The value “Good” indicates that they have made their payments without ever being more than 90 days overdue.

Before answering, please think steyp by step coincisely in these steps to explain the prediction.
1. SHAP Analysis: Analyze the key features from the SHAP analysis, explaining how each feature contributes to the prediction.
This step should be inside <SHAP>$$INSERT TEXT HERE$$</SHAP> tag.
2. LIME Analysis: Analyze the key features from the LIME analysis, explaining the contribution of each feature in terms of how it influences the prediction.
This step should be inside <LIME>$$INSERT TEXT HERE$$</LIME> tag.
3. Insight Synthesis: Based on the individual feature analyses from SHAP and LIME, synthesize the insights to provide a comprehensive conclusion. The conclusion should focus on how these features work together to influence the final prediction.
This step should be inside <Insight>$$INSERT TEXT HERE$$</Insight> tag.
4. Final Explanation for Non-Experts: Provide the prediction result and explain the comprehensive reasoning behind the result, considering multiple factors that contributed to this outcome. Ensure the explanation is clear, detailed, and avoids using technical terms or direct references to probabilities or numbers, so that the final explanation is understandable to non-experts in machine learning or finance.
This step should be inside <Conclusion>$$INSERT TEXT HERE$$</Conclusion> tag.
In this part,
- Ensure to be thorough and specific as possible, with enough length to fully explain the reasoning behind the prediction and offer clear, actionable advice to the user.
- Please respond as if you were a human, using natural conversational tone. Be engaging, empathetic, and use phrases and expressions that sound like they’re coming from a real person, keeping the tone friendly and conversational. Avoid sounding overly formal or robotic.
- Please provide a sentences without explicitly using terms like 'model,' 'probability,' or directly mentioning numbers. Instead, explain the concepts in simple, intuitive language that avoids technical jargon.
- At the end of the part, provide a personalized piece of advice for the user on how they can improve or maintain their risk performance in the future.

Context:
1. Prediction Probability
- Good: {predict_proba_good}
- Bad: {predict_proba_bad}
- Predicted to {predicted_class}

2. SHAP analysis (Feature, SHAP Importance)
{shap_analysis}

3. LIME analysis (Feature, LIME Importance)
{lime_analysis}

Answer:
''')

In [23]:
def importance_to_str(df, analysis_type):
  if analysis_type == "SHAP":
    value = "SHAP Importance"
  elif analysis_type == "LIME":
    value = "LIME Importance"
  return '\n'.join([f"- ({row['Feature']}, {row[value]})" for _, row in df.iterrows()])

In [None]:
def proba_to_class(proba):
  return "Good" if proba < 0.5 else "Bad"

In [24]:
for i in range(10, 20):
  lime_exp = lime_explainer.explain_instance(x_test.iloc[i].values, predict_fn_for_lime, num_features=10)
  lime_importance = pd.DataFrame(lime_exp.as_list(), columns=['Feature', 'LIME Importance'])
  lime_importance_string = importance_to_str(lime_importance, "LIME")
  
  shap_importance = pd.DataFrame({'Feature': x_test.columns, 'SHAP Importance': np.abs(shap_values[1])})
  shap_importance = shap_importance.sort_values(by='SHAP Importance', ascending=False).head(10)
  shap_importance_string = importance_to_str(shap_importance, "SHAP")
  # Should convert to plain text

  prediction = y_test_pred[i]
  prediction_class = proba_to_class(y_test_pred[i])

  break

                         Feature  SHAP Importance
17    NetFractionRevolvingBurden         0.358624
7         PercentTradesNeverDelq         0.337536
14  MSinceMostRecentInqexcl7days         0.288519
9       MaxDelq2PublicRecLast12M         0.235673
4          NumSatisfactoryTrades         0.167861
18      NetFractionInstallBurden         0.124556
15                  NumInqLast6M         0.120804
19    NumRevolvingTradesWBalance         0.096290
22         PercentTradesWBalance         0.079032
12        NumTradesOpeninLast12M         0.076880
<class 'pandas.core.frame.DataFrame'>
                                     Feature  LIME Importance
0      MSinceMostRecentInqexcl7days <= -7.00        -0.186296
1         NetFractionRevolvingBurden > 54.00         0.169774
2             NumSatisfactoryTrades <= 12.00         0.150673
3    6.00 < MaxDelq2PublicRecLast12M <= 7.00        -0.135522
4   96.00 < PercentTradesNeverDelq <= 100.00        -0.116345
5                    NumTotalTrades <= 1

## Select I

In [None]:
from huggingface_hub import login
import inspect
import dotenv
import os

dotenv.load_dotenv()
hf_token = os.getenv("HUGGINGFACE_TOKEN")

login(hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    quantization_config=quantization_config,
)
device = torch.device("cuda:0")


In [None]:
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(inputs["input_ids"], max_length=3000, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [None]:
pro