In [2]:
from flask import Flask, jsonify, render_template,request
import pandas as pd
import numpy as np
import json

from tabular_data_framework.vcnet_tabular_data_v0.join_training_network import CVAE_join
from tabular_data_framework.vcnet_tabular_data_v0.train_network import Train_CVAE 
from tabular_data_framework.load_data import Load_dataset_base,load_data_dict
from tabular_data_framework.vcnet_tabular_data_v0.load_config import Load_config,load_config_dict
from tabular_data_framework.import_essentials import *
from tabular_data_framework.utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def result_to_d3(result:Dict):
    """Convert a counterfactual result dict into plain Python lists.

    Reads the two DataFrames stored under ``"X_original_space"`` and
    ``"cf_original_space"`` and the four tensors stored under ``"y_x"``,
    ``"y_c"``, ``"proba_x"`` and ``"proba_c"``.

    Returns a dict with the keys ``col``, ``X``, ``cf``, ``y_x``, ``y_c``,
    ``proba_x`` and ``proba_c`` (in that order). ``X`` and ``cf`` are
    transposed, i.e. they hold one inner list per column.
    """
    examples = result["X_original_space"]
    payload = {
        "col": examples.columns.tolist(),
        # Transposed so that each inner list is one column of the frame.
        "X": examples.values.transpose().tolist(),
        "cf": result["cf_original_space"].values.transpose().tolist(),
    }
    # Tensors are moved to the CPU before being turned into lists.
    for tensor_key in ("y_x", "y_c", "proba_x", "proba_c"):
        payload[tensor_key] = result[tensor_key].cpu().tolist()
    return payload

def prepare_new_data(dataset:Load_dataset_base,newdata:pd.DataFrame):
    """Preprocess raw rows with the dataset's normalizer and encoder.

    The last column of ``newdata`` is taken as the target and every other
    column as a feature. Columns listed in ``dataset.continous_cols`` go
    through ``dataset.normalizer.transform`` and columns listed in
    ``dataset.discret_cols`` through ``dataset.encoder.transform``. The two
    results are concatenated column-wise (continuous block first) and
    wrapped, together with the target values, in a ``NumpyDataset``.

    NOTE(review): only ``transform`` is called here, so both transformers are
    presumably already fitted by the dataset object -- confirm.
    """
    # Everything but the last column is a feature; the last column is the target.
    features = newdata[newdata.columns[:-1]]
    target = newdata[newdata.columns[-1]]

    scaler = dataset.normalizer
    cat_encoder = dataset.encoder
    n_rows = len(features)

    if dataset.continous_cols:
        cont_part = scaler.transform(features[dataset.continous_cols])
    else:
        # One empty row per sample, so the concatenation below still lines up.
        cont_part = np.array([[] for _ in range(n_rows)])

    if dataset.discret_cols:
        cat_part = cat_encoder.transform(features[dataset.discret_cols])
    else:
        cat_part = np.array([[] for _ in range(n_rows)])

    stacked = np.concatenate((cont_part, cat_part), axis=1)
    return NumpyDataset(stacked, target.to_numpy())

def compute_counterfactuals(X, name="churn", dico_round=None):
    """Compute counterfactuals for ``X`` over a grid of target probabilities.

    The dataset and model configuration for ``name`` are loaded, a
    ``Train_CVAE`` is built on the CPU and its saved weights are loaded (no
    training happens here). ``X`` is preprocessed with ``prepare_new_data``
    and, for every probability ``p`` in ``0.0, 0.1, ..., 1.0``, counterfactuals
    are computed, rounded, and converted with ``result_to_d3``.

    Parameters
    ----------
    X : pd.DataFrame
        Raw rows to explain; the last column is treated as the target by
        ``prepare_new_data``.
    name : str, optional
        Dataset/model name passed to ``load_config_dict`` and
        ``load_data_dict``. Defaults to ``"churn"``, the value that used to be
        hard-coded.
    dico_round : dict, optional
        Column-name -> rounding spec passed to ``round_counterfactuals``.
        Defaults to the churn columns ``tenure`` / ``MonthlyCharges`` /
        ``TotalCharges``; pass a matching dict when ``name`` is not
        ``"churn"``. NOTE(review): values look like a number of decimals, with
        ``None`` meaning "round to int" -- confirm against
        ``round_counterfactuals``.

    Returns
    -------
    dict
        Maps ``str(round(p, 1))`` (``"0.0"`` ... ``"1.0"``) to the
        ``result_to_d3`` payload obtained for that probability.
    """
    # Imported once per call instead of once per loop iteration; only the name
    # actually used below is pulled in.
    from tabular_data_framework.utils import round_counterfactuals

    if dico_round is None:
        dico_round = {"tenure" : None, "MonthlyCharges" : 2, "TotalCharges" : 2 }
    # Loop-invariant argument of training.round_counterfactuals.
    # NOTE(review): exact meaning of eps is defined by Train_CVAE -- confirm.
    eps = 0.01

    # Load the model parameters in a dict
    model_config_dict = load_config_dict(name)
    # NOTE(review): model_config is not used below; the call is kept in case
    # constructing Load_config validates the configuration -- confirm and drop.
    model_config = Load_config(model_config_dict)

    # Load the dataset parameters in a dict and build the dataset object
    dataset_config_dict = load_data_dict(name)
    dataset = Load_dataset_base(dataset_config_dict,model_config_dict,subsample=False)

    # Prepare dataset and return dataloaders + ohe index
    loaders,cat_arrays,cont_shape = dataset.prepare_data()
    debug_enc = False
    training = Train_CVAE(
        dataset_config_dict,
        model_config_dict,
        cat_arrays,
        cont_shape,
        loaders,
        dataset,
        ablation=None,
        condition="change_dec_only",
        cuda_name="cpu",
        shared_layers=True,
        debug_enc=debug_enc,
    )
    # Weights are loaded rather than trained.
    training.load_weights(dataset.name)

    X_np,y_np = prepare_new_data(dataset,X)[:]
    n_samples = X_np.shape[0]
    # Moving the inputs to the device does not depend on the probability.
    X_device = X_np.to(training.cuda_device)
    y_device = y_np.to(training.cuda_device)

    data_prob = {}
    for p in [x * 0.1 for x in range(0, 11)]:
        # Column vector holding the same target probability for every sample.
        proba = torch.tensor([[p] * n_samples]).T
        # Compute counterfactuals
        results = training.compute_counterfactuals_custom_proba(X_device, y_device,proba=proba,laugel_metric=False)
        results = training.round_counterfactuals(results,eps,X_np)[0]
        # Round numerical values and check if counterfactuals are still valid
        results_rounded = round_counterfactuals(X_np,results,dataset,training,dico_round)
        # round() hides float noise such as 0.30000000000000004 in the key.
        data_prob[f"{round(p,1)}"] = result_to_d3(results_rounded)
    return data_prob


In [50]:
def numpy_to_dataframe(X,counterfactuals,dataset) :
    """Map encoded examples and counterfactuals back to labelled DataFrames.

    Both inputs are passed through ``dataset.inverse_transform`` (with
    ``return_tensor=False``) and wrapped in DataFrames whose columns are
    ``dataset.continous_cols + dataset.discret_cols``. Every discrete column
    is then converted to a ``pd.Categorical`` whose categories are that
    column's own unique values in order of first appearance.

    NOTE(review): categories are derived independently for each frame, so the
    example frame and the counterfactual frame may end up with different
    category orders -- confirm this is intended.

    Returns the tuple ``(df_example, df_counterfactuals)``.
    """
    column_names = dataset.continous_cols + dataset.discret_cols
    frames = []
    for encoded in (X, counterfactuals):
        # Back to the original space
        decoded = dataset.inverse_transform(encoded,return_tensor=False)
        frame = pd.DataFrame(data=decoded,columns=column_names)
        for col in dataset.discret_cols:
            frame[col] = pd.Categorical(frame[col],pd.unique(frame[col]))
        frames.append(frame)
    return tuple(frames)

In [12]:

# One hand-written customer record. Every raw value is a string; the numeric
# columns are cast to their proper dtypes when the frame is built below.
example = {
    'gender': ['Male'],
    'SeniorCitizen': ['0'],
    'Partner': ['Yes'],
    'Dependents': ['Yes'],
    'tenure': ['1'],
    'MonthlyCharges': ['20'],
    'TotalCharges': ['250'],
    'PhoneService': ['Yes'],
    'MultipleLines': ['Yes'],
    'InternetService': ['No'],
    'OnlineSecurity': ['Yes'],
    'DeviceProtection': ['Yes'],
    'OnlineBackup': ['Yes'],
    'TechSupport': ['Yes'],
    'StreamingTV': ['Yes'],
    'StreamingMovies': ['Yes'],
    'Contract': ['Month-to-month'],
    'PaperlessBilling': ['Yes'],
    'PaymentMethod': ['Bank transfer (automatic)'],
}
X = (
    pd.DataFrame(example)
    # Label appended as the last column, which prepare_new_data reads as the target.
    .assign(Churn=1)
    .astype({'SeniorCitizen': 'int64','tenure':'int64','MonthlyCharges':"float64",'TotalCharges':'float64'})
)
counterfactuals = compute_counterfactuals(X)
# Show which fields are available for the target probability 0.0.
print(counterfactuals["0.0"].keys())

Percentage of valid counterfactuals : 0.0
Percentage of valid counterfactuals : 0.0
Percentage of valid counterfactuals : 0.0
Percentage of valid counterfactuals : 0.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
Percentage of valid counterfactuals : 1.0
dict_keys(['col', 'X', 'cf', 'y_x', 'y_c', 'proba_x', 'proba_c'])


In [51]:
# Rebuild the churn pipeline step by step to look at the counterfactual
# produced for a single target probability (0.1). `X` is the one-row
# DataFrame built in the example cell.
name = "churn"

# Model parameters, as a dict and as a Load_config object.
model_config_dict = load_config_dict(name)
model_config = Load_config(model_config_dict)

# Dataset parameters and the dataset object built from them.
dataset_config_dict = load_data_dict(name)
dataset = Load_dataset_base(
    dataset_config_dict,
    model_config_dict,
    subsample=False,
)

# prepare_data() returns the dataloaders plus the ohe index / continuous shape
# that Train_CVAE takes as cat_arrays and cont_shape.
loaders, cat_arrays, cont_shape = dataset.prepare_data()
debug_enc = False
training = Train_CVAE(
    dataset_config_dict,
    model_config_dict,
    cat_arrays,
    cont_shape,
    loaders,
    dataset,
    ablation=None,
    condition="change_dec_only",
    cuda_name="cpu",
    shared_layers=True,
    debug_enc=debug_enc,
)
# No training in this cell (train_and_valid_cvae(tensorboard=True) stays
# disabled); saved weights are loaded instead.
training.load_weights(dataset.name)

X_np, y_np = prepare_new_data(dataset, X)[:]
# Column vector with the same target probability for every row of X.
proba = torch.tensor([[0.1] * X.shape[0]]).T
results = training.compute_counterfactuals_custom_proba(
    X_np.to(training.cuda_device),
    y_np.to(training.cuda_device),
    proba=proba,
    laugel_metric=False,
)
eps = 0.01
results = training.round_counterfactuals(results, eps, X_np)[0]
# np_data is (examples, counterfactuals); print the counterfactual frame.
np_data = numpy_to_dataframe(X_np, results["cf"], dataset)
print(np_data[1])


Percentage of valid counterfactuals : 0.0
      tenure MonthlyCharges TotalCharges  gender SeniorCitizen Partner  \
0  38.905725      59.422038  2678.851969  Female             0     Yes   

  Dependents PhoneService MultipleLines InternetService OnlineSecurity  \
0         No          Yes            No     Fiber optic             No   

  OnlineBackup DeviceProtection TechSupport          StreamingTV  \
0          Yes              Yes         Yes  No internet service   

       StreamingMovies  Contract PaperlessBilling PaymentMethod  
0  No internet service  One year               No  Mailed check  
