# Packages and Functions

In [None]:
# Import packages and project functions
%matplotlib inline
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline

# load model and data
from model_ingestion.data_model import load_data_model

# Local Explanation
from local_exp.local_exp import dalex_exp

# ExplainerDashboard
from explainerdashboard import *
from eda.dashboard import *
from fairness.dashboard import *
from local_exp.dashboard import *
from global_exp.dashboard import *
from stability.dashboard import *
from robustness.dashboard import *
from uncertainty.dashboard import *

from raiwidgets.responsibleai_dashboard import ResponsibleAIDashboard
from fairness.XRAI_features import *
from raiwidgets import ErrorAnalysisDashboard

# AutoEDA
import dtale.app as dtale_app
from contextlib import redirect_stdout

# Dash
import dash_bootstrap_components as dbc

# Load Data

In [None]:
# Regression example: load the required datasets and models.
# Comment out this cell if using the classification use case.

train_csv = 'data/property_valuation/train_property_valuation.csv'  ## INPUT HERE
test_csv = 'data/property_valuation/test_property_valuation.csv'  ## INPUT HERE
model_path = {  ## INPUT HERE
    "LGBM": 'models/property_valuation/property_valuation_lgbm.sav',
    "DT": 'models/property_valuation/property_valuation_decision_tree.sav',
}
target_feature = 'price_sqm'  ## INPUT HERE

# The paths above are only inputs; train_data / test_data below are the objects
# returned by load_data_model, and `model` is indexed by the keys of model_path
# later in the notebook.
X_train, y_train, X_test, y_test, train_data, test_data, model = load_data_model(
    train_csv,
    test_csv,
    model_path,
    target_feature,
)

# Numeric feature columns are treated as continuous, all others as categorical.
cont = X_train.select_dtypes(include=np.number).columns.tolist()
cat = X_train.select_dtypes(exclude=np.number).columns.tolist()

# Flag read by the later cells to switch between the regression and classification paths.
reg = True

In [None]:
# # load required datasets and model -- classification example. Comment out if using regression use case

# import sklearn
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer, KNNImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

# X_train = pd.read_csv(f'data/loans/loans_X_train.csv', index_col = 0)
# X_test = pd.read_csv(f'data/loans/loans_X_test.csv', index_col = 0)
# y_train = pd.read_csv(f'data/loans/loans_y_train.csv', index_col = 0).squeeze()
# y_test = pd.read_csv(f'data/loans/loans_y_test.csv', index_col = 0).squeeze()

# train_data = pd.concat([X_train, y_train], axis = 1)
# test_data = pd.concat([X_test, y_test], axis = 1)

# target = ['loan_status']

# drop_cols = ['id', 'member_id', 'issue_d', 'title', 'zip_code', 'addr_state', 'last_pymnt_d',
#              'next_pymnt_d', 'last_credit_pull_d', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']

# # ord_cols = [#'grade', 'sub_grade', 
# #             'earliest_cr_line', 'emp_length']

# num_cols = list(set(train_data.select_dtypes(include = ['int64', 'float64']).columns.tolist()) - set(drop_cols) - set(target))
# cat_cols = list(set(train_data.select_dtypes(exclude = ['int64', 'float64']).columns.tolist()) - set(drop_cols) - set(target))

# rem_cols = list(set(train_data.columns.tolist()) - set(num_cols) - set(cat_cols) - set(drop_cols) - set(target))

# print('Numerical Columns:\n', num_cols)
# print('\nCategorical Columns:\n', cat_cols)
# print('\nRemaining Columns:\n', rem_cols)

# cont = X_train.select_dtypes(include = np.number).columns.tolist()
# cat = X_train.select_dtypes(exclude = np.number).columns.tolist()

# seed = 42

# numerical = Pipeline(
#     steps = [
#         ('num_imputer', KNNImputer()),
#         ('scaler', StandardScaler())
#     ]
# )

# categorical = Pipeline(
#     steps = [
#         ('cat_imputer', SimpleImputer(strategy = 'constant', fill_value = 'No Data')),
#         ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
#     ]
# )

# preproc = ColumnTransformer([
#     ('num', numerical, num_cols),
#     ('cat', categorical, cat_cols),
#     ('drop', 'drop', drop_cols),
# ])

# # Decision Tree
# pipe = Pipeline(
#     steps = [
#         ('preproc', preproc),
#         ('clr', DecisionTreeClassifier(random_state = seed, criterion = 'gini',
#                                       max_depth = 22, max_features = .75))
#     ]
# )
# pipe.fit(X_train, y_train)
# model_dt = pipe

# # Random Forest
# pipe = Pipeline(
#     steps = [
#         ('preproc', preproc),
#         ('clf', RandomForestClassifier(random_state = seed, criterion = 'gini',
#                                       max_depth = None, max_features = 'log2',
#                                       n_estimators = 250))
#     ]
# )
# pipe.fit(X_train, y_train)
# model_rf = pipe

# # XGBoost
# pipe = Pipeline(
#     steps = [
#         ('preproc', preproc),
#         ('clf', XGBClassifier(max_depth = 25, n_estimators = 250, random_state = seed))
#     ]
# )
# pipe.fit(X_train, y_train)
# model_xgb = pipe

# model = {'DT': model_dt,
#          'RF': model_rf,
#          'XGB': model_xgb}

# preprocessor = model['DT'][0]
# preprocessor2 = model['RF'][0]
# preprocessor3 = model['XGB'][0]

# reg = False

# Dashboard

In [None]:
# Create an explainer for the dashboard.
# The explainer class is chosen from the `reg` flag set in the data-loading cell,
# the same switch every later cell uses. Previously ClassifierExplainer was tried
# first and *any* exception (including KeyboardInterrupt) silently fell back to
# RegressionExplainer, which could hide a genuine classifier set-up error.
if reg:
    explainer = RegressionExplainer(model['DT'], X_test, y_test) # Input the test set and regression model here
else:
    explainer = ClassifierExplainer(model['DT'], X_test, y_test) # Input the test set and classifier model here

In [None]:
# Configure your sklearn pipeline here
# The pipeline is assembled from the first two steps of the selected model.
# NOTE(review): these are the very same step objects held by model['DT'], so the
# fit below presumably refits them in place -- confirm that this is intended.
pipe = Pipeline([
    ('step1', model['DT'][0]),
    ('step2', model['DT'][1]),
])

pipe.fit(X_train, y_train)

In [None]:
# Configure the groupings for the Grouped Variable Importances Component in the Dashboard. Set to None for no groupings
if reg:
    # Feature-name stems shared by the three LOI groups; every stem occurs once
    # per numeric suffix (_1000, _3000, _5000), in this order.
    loi_stems = [
        'cafe', 'fast_food', 'pub', 'restaurant', 'college', 'kindergarten',
        'school', 'university', 'fuel', 'parking', 'atm', 'bank', 'clinic',
        'hospital', 'pharmacy', 'police', 'townhall', 'marketplace', 'hotel',
        'residential', 'commercial', 'industrial', 'retail', 'supermarket',
        'fire_station', 'government',
    ]
    variable_groups = {
        'cmci': ['infrastructure', 'resiliency', 'productivity', 'security', 'transparency', 'utilities'],
        'house_amenities': [
            'ac_unit', 'balcony', 'deck', 'fence', 'fireplace', 'fitness_center', 'garage',
            'grass', 'library_books', 'local_airport', 'local_parking', 'meeting_room', 'park',
            'pool', 'security.1', 'smoke_free', 'sports_basketball', 'sports_tennis',
            'sports_volleyball', 'warehouse', 'yard',
        ],
        'house_characteristics': ['price_conditions', 'car_spaces', 'bedrooms', 'bathrooms', 'floor_area', 'land_size'],
        # Expands to the 'LOI_1000', 'LOI_3000' and 'LOI_5000' groups, in that order.
        **{
            f'LOI_{suffix}': [f'{stem}_{suffix}' for stem in loi_stems]
            for suffix in (1000, 3000, 5000)
        },
        'socio-economic': [
            'LGU', 'poverty_inc', 'subs_inc', 'lgu_type', 'income_class', 'anreg_income_2021',
            'capex_2021', 'socex_2021', 'pop_2022', 'growth_5years', 'growth_10years',
        ],
    }
else:
    # Classification use case: every column that is not int64/float64 goes into
    # the 'categorical' group.
    non_numeric_cols = X_train.select_dtypes(exclude = ['int64', 'float64']).columns.tolist()
    variable_groups = {
        'total': ['total_acc', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
                  'total_rec_int', 'total_rec_late_fee', 'total_rev_hi_lim'],
        'amount': ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'last_pymnt_amnt'],
        'categorical': non_numeric_cols,
    }

In [None]:
# Separate continuous and categorical variables:
# int64/float64 columns count as continuous, every other dtype as categorical.
numeric_dtypes = ['int64', 'float64']
cont = X_train.select_dtypes(include=numeric_dtypes).columns.tolist()
cat = X_train.select_dtypes(exclude=numeric_dtypes).columns.tolist()

In [None]:
# Only one model should be selected, and it should be in a dictionary form
model_selected = dict(DT=model['DT'])

# Model family label passed to the dashboard tabs, derived from the `reg` flag.
model_type = 'regressor' if reg else 'classifier'

is_sklearn_pipe = True # Did it use sklearn pipeline?

In [None]:
# Create a Dalex explainer for the global explanation components,
# using the single model held in model_selected.
# NOTE(review): the trailing 0 is passed through to dalex_exp unchanged; its
# meaning is not visible here -- check local_exp.local_exp.dalex_exp.
exp, obs = dalex_exp(
    next(iter(model_selected.values())),
    X_train,
    y_train,
    X_test,
    0,
)

In [None]:
# Stack the train and test frames row-wise into one dataset, then print its summary.
dataset = pd.concat((train_data, test_data), axis='index')
dataset.info()

In [None]:
# Build the preprocessed train/test feature frames used by the dashboard tabs.
# The regression use case transforms with `pipe`, the classification use case
# with `preprocessor`; everything else is identical, so choose once.
proc_pipeline = pipe if reg else preprocessor

# Same order of calls as before: read the output feature names, refit on the
# training features, then transform both splits.
features = proc_pipeline.get_feature_names_out()
proc_pipeline.fit(X_train)

# Keep the original row index on the processed frames. They used to get a fresh
# 0..n-1 index, so the column-wise concat with y_train / y_test below (which
# aligns on index) paired rows with the wrong targets or produced NaN rows
# whenever the source index was not already 0..n-1.
X_train_proc = pd.DataFrame(proc_pipeline.transform(X_train), columns=features, index=X_train.index)
X_test_proc = pd.DataFrame(proc_pipeline.transform(X_test), columns=features, index=X_test.index)

# Processed features plus target are only built for the classification use case.
if model_type == "classifier":
    train_data_proc = pd.concat([X_train_proc, y_train], axis=1)
    test_data_proc = pd.concat([X_test_proc, y_test], axis=1)
else:
    train_data_proc = None
    test_data_proc = None

In [None]:
# Open the combined dataset with the project's dtale_eda helper.
df = dtale_eda(dataset)

# Whatever main_url() writes to stdout (presumably the D-Tale URL -- confirm) is
# redirected into temp.log and read back, so it ends up in `href` as a list of lines.
with open("temp.log", "w") as f, redirect_stdout(f):
    df.main_url()

with open("temp.log") as f:
    href = list(f)

href

In [None]:
# Run the AutoViz EDA helper on a random sample of at most 100 rows.
# The size is capped at len(dataset) because DataFrame.sample raises when asked
# for more rows than exist (without replacement), and random_state is fixed so
# that a Restart & Run All produces the same sample.
autoviz_eda2(dataset.sample(n=min(100, len(dataset)), random_state=42))

In [None]:
# ydata_profiling_eda2(dataset.sample(100))

In [None]:
# Responsible-AI view: ResponsibleAIDashboard for classification,
# ErrorAnalysisDashboard for regression.
if model_type == 'classifier':
    target_feature = target[0]
    drop_cols = ['id', 'member_id', 'issue_d', 'title', 'zip_code', 'addr_state', 'last_pymnt_d',
             'next_pymnt_d', 'last_credit_pull_d', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']

    # Columns that must not be treated as features. target_feature is a string,
    # so it has to be wrapped as {target_feature}: the previous set(target_feature)
    # produced a set of its individual characters and therefore never removed the
    # target column from num_cols / cat_cols.
    excluded_cols = set(drop_cols) | {target_feature}

    # Comprehensions keep the dataset's column order (set arithmetic did not).
    num_cols = [col for col in dataset.select_dtypes(include = ['int64', 'float64']).columns
                if col not in excluded_cols]
    cat_cols = [col for col in dataset.select_dtypes(exclude = ['int64', 'float64']).columns
                if col not in excluded_cols]

    rai_insights, cohort_list = xrai_features(
        next(iter(model_selected.values())),
        train_data.drop(drop_cols, axis = 1),
        test_data.drop(drop_cols, axis = 1),
        target_feature,
        categorical_features = cat_cols,
    )
    ResponsibleAIDashboard(rai_insights, cohort_list=cohort_list)
else:
    # Every step of the model except the last one acts as the transformer;
    # the last step produces the predictions.
    # NOTE(review): this rebinds the notebook-level `pipe` that the dashboard
    # cell passes to its tabs -- confirm it matches the pipeline configured earlier.
    pipe = model['DT'][:-1]
    features = pipe.get_feature_names_out()
    pipe.fit(X_train)
    X_test_proc = pd.DataFrame(pipe.transform(X_test), columns=features)
    predictions = model['DT'][-1].predict(X_test_proc)
    ErrorAnalysisDashboard(dataset=X_test_proc, true_y=y_test, features=features, pred_y=predictions, model_task='regression')

In [None]:
# Assemble all tabs and launch the ExplainerDashboard.
# The regression and classification set-ups differ only in the transformer
# handed to the Local/Global/Stability tabs (`pipe` vs `preprocessor`), so it is
# chosen once here instead of duplicating the whole dashboard definition.
tab_transformer = pipe if reg else preprocessor

try:
    ExplainerDashboard(
        explainer,
        [
            EDATab(explainer, href),
            FairnessTab(explainer, model_selected, X_test, y_test, X_train, y_train, test_data, train_data, test_data_proc, train_data_proc, target_feature, model_type),
            LocalExpTab(explainer, model_selected, X_train, y_train, X_test, cont, cat, model_type, target_feature, tab_transformer),
            GlobalExpTab(explainer, exp, model_selected, X_train, tab_transformer, cat, model_type, variable_groups, features),
            StabilityTab(explainer, X_train, y_train, X_test, y_test, cont, tab_transformer, model_selected, train_data, test_data, target_feature, model_type),
            StabilityTestTab(explainer, model_selected, train_data, test_data, target_feature, model_type),
            RobustnessTab(explainer, model_selected, X_train_proc, y_train, X_test_proc, y_test, model_type),
            # NOTE(review): this tab receives the processed X_train_proc together with
            # the raw X_test (RobustnessTab gets X_test_proc) -- confirm this is intended.
            UncertaintyTab(explainer, model_selected, X_train_proc, y_train, X_test, y_test, model_type),
        ],
        bootstrap = dbc.themes.FLATLY,
        hide_header = True,
    ).run()
except (KeyboardInterrupt, SystemExit):
    # Stopping the server is the normal way out of run(); do not treat it as a failure.
    print('Dashboard stopped.')
except Exception as err:
    # Still best-effort (the cell does not raise), but the real cause is reported
    # instead of the previous uninformative print('Hello!').
    print(f'Dashboard failed: {type(err).__name__}: {err}')

2023-10-05 11:20:19,363 - INFO     - Executing shutdown due to inactivity...
2023-10-05 11:20:35,870 - INFO     - Executing shutdown...
2023-10-05 11:20:35,876 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer
