In [2]:
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.preprocessing import StandardScaler
# define a XGBoost classifier
import xgboost as xgb
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix


warnings.filterwarnings("ignore")  # Ignore runtime warnings
# Temporarily adjust pandas display settings for large DataFrames
pd.set_option('display.max_rows', 100)  # Ensure 100 rows can be displayed
pd.set_option('display.max_columns', None)  # Ensure all columns can be displayed
pd.set_option('display.width', None)  # Automatically adjust display width to terminal size
pd.set_option('display.max_colwidth', None)  # Ensure full width of column content is shown
pd.set_option('display.float_format', '{:.4f}'.format)  # Format the float numbers for better readability

In [3]:
# Load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

In [4]:
# Let's load the model
new_session = rt.InferenceSession("model_2.onnx")

y_pred_onnx2 =  new_session.run(None, {'X': X.values.astype(np.float32)})


accuracy_onnx_model = accuracy_score(y, y_pred_onnx2[0])
# Calculate F1 score, precision, and recall
f1_model = f1_score(y, y_pred_onnx2[0]) # You can change average method based on your need
precision_model = precision_score(y, y_pred_onnx2[0])
recall_model = recall_score(y, y_pred_onnx2[0])

print('Accuracy of the ONNX modeL: ', accuracy_onnx_model)
print('Precision of the ONNX model:', precision_model)
print('Recall of the ONNX model:', recall_model)
print('F1 score of the ONNX model:', f1_model)

Accuracy of the ONNX modeL:  0.7870304468169237
Precision of the ONNX model: 0.29943820224719103
Recall of the ONNX model: 0.8426877470355731
F1 score of the ONNX model: 0.4418652849740933


In [5]:
# Compute confusion matrix
confusion_matrix(y, y_pred_onnx2[0])

array([[8886, 2494],
       [ 199, 1066]], dtype=int64)

In [6]:
# Check how imbalanced the datasets is

y.value_counts(normalize=True)

checked
0   0.9000
1   0.1000
Name: proportion, dtype: float64

In [7]:
# Metamorphic testing: (Other than fairness testing)
# If a value changes then the prediction likelihood should change too in line with the purpose of the model 
# pla_historie_ontwikkeling 0 or 25 // number of developments in PLA history

# Load the model
testing_session = rt.InferenceSession("model_2.onnx")


# Initialize variables to store likelihoods
likelihoods_0 = []
likelihoods_25 = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for 0 developments in PLA history
    X_sample_0 = X_sample.copy()
    X_sample_0['pla_historie_ontwikkeling'] = 0
    y_proba_0 = testing_session.run(None, {'X': X_sample_0.values.astype(np.float32)})[1][0]

    # Make predictions for 25 developments in PLA history
    X_sample_25 = X_sample.copy()
    X_sample_25['pla_historie_ontwikkeling'] = 25
    y_proba_25 = testing_session.run(None, {'X': X_sample_25.values.astype(np.float32)})[1][0]

    # Append the likelihoods for both age groups
    likelihoods_0.append([y_proba_0[1]])  # Probability of class 1 (fraud) for 0 developments in PLA history
    likelihoods_25.append([y_proba_25[1]])  # Probability of class 1 (fraud) for 25 developments in PLA history

# Convert likelihoods lists to NumPy arrays
likelihoods_0 = np.array(likelihoods_0)
likelihoods_25 = np.array(likelihoods_25)

# Calculate the mean likelihoods for each group
mean_likelihood_0 = np.mean(likelihoods_0)
mean_likelihood_25 = np.mean(likelihoods_25)

print("Mean likelihood for 0 developments in PLA history:", mean_likelihood_0)
print("Mean likelihood for 25 developments in PLA history:", mean_likelihood_25)

Mean likelihood for 0 developments in PLA history: 0.3404891435974622
Mean likelihood for 25 developments in PLA history: 0.2581012699999042


In [8]:
# contacten_onderwerp_no_show // Contact subject client has not shown up for meeting
likelihoods_show = []
likelihoods_noshow = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for a client that has shown up for meetings
    X_sample_show = X_sample.copy()
    X_sample_show['contacten_onderwerp_no_show'] = 0.0
    y_proba_show = testing_session.run(None, {'X': X_sample_show.values.astype(np.float32)})[1][0]

    # Make predictions for no show client
    X_sample_noshow = X_sample.copy()
    X_sample_noshow['contacten_onderwerp_no_show'] = 1.0
    y_proba_noshow = testing_session.run(None, {'X': X_sample_noshow.values.astype(np.float32)})[1][0]

    # Append the likelihoods for both age groups
    likelihoods_show.append([y_proba_show[1]])  # Probability of class 1 (fraud) for a client that has shown up for meetings
    likelihoods_noshow.append([y_proba_noshow[1]])  # Probability of class 1 (fraud) for no show client

# Convert likelihoods lists to NumPy arrays
likelihoods_show = np.array(likelihoods_show)
likelihoods_noshow = np.array(likelihoods_noshow)

# Calculate the mean likelihoods for each group
mean_likelihood_show = np.mean(likelihoods_show)
mean_likelihood_noshow = np.mean(likelihoods_noshow)

print("Mean likelihood for a client that has shown up for meetings:", mean_likelihood_show)
print("Mean likelihood for no show client:", mean_likelihood_noshow)

Mean likelihood for a client that has shown up for meetings: 0.2187287717008836
Mean likelihood for no show client: 0.27761307624358345


In [9]:
# instrument_ladder_huidig_activering // instrument ladder is currently activated
likelihoods_notactivated = []
likelihoods_activated = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for a client without an activated instrument ladder
    X_sample_notactivated = X_sample.copy()
    X_sample_notactivated['instrument_ladder_huidig_activering'] = 0.0
    y_proba_notactivated = testing_session.run(None, {'X': X_sample_notactivated.values.astype(np.float32)})[1][0]

    # Make predictions for a client with an activated instrument ladder
    X_sample_activated = X_sample.copy()
    X_sample_activated['instrument_ladder_huidig_activering'] = 1.0
    y_proba_activated = testing_session.run(None, {'X': X_sample_activated.values.astype(np.float32)})[1][0]

    # Append the likelihoods for both age groups
    likelihoods_notactivated.append([y_proba_notactivated[1]])  # Probability of class 1 (fraud) for a client without an activated instrument ladder
    likelihoods_activated.append([y_proba_activated[1]])  # Probability of class 1 (fraud) for a client with an activated instrument ladder

# Convert likelihoods lists to NumPy arrays
likelihoods_notactivated = np.array(likelihoods_notactivated)
likelihoods_activated = np.array(likelihoods_activated)

# Calculate the mean likelihoods for each group
mean_likelihood_notactivated = np.mean(likelihoods_notactivated)
mean_likelihood_activated = np.mean(likelihoods_activated)

print("Mean likelihood for a client without an activated instrument ladder:", mean_likelihood_notactivated)
print("Mean likelihood for a client with an activated instrument ladder:", mean_likelihood_activated)

Mean likelihood for a client without an activated instrument ladder: 0.32519890665300943
Mean likelihood for a client with an activated instrument ladder: 0.2574511158650173


In [10]:
# instrument_reden_beeindiging_historie_succesvol // successful instrumentation history
likelihoods_not = []
likelihoods_successful = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for a client without a successful instrumentation history
    X_sample_not = X_sample.copy()
    X_sample_not['instrument_reden_beeindiging_historie_succesvol'] = 0.0
    y_proba_not = testing_session.run(None, {'X': X_sample_not.values.astype(np.float32)})[1][0]

    # Make predictions for a client with a successful instrumentation history
    X_sample_successful = X_sample.copy()
    X_sample_successful['instrument_reden_beeindiging_historie_succesvol'] = 1.0
    y_proba_successful = testing_session.run(None, {'X': X_sample_successful.values.astype(np.float32)})[1][0]

    # Append the likelihoods for both age groups
    likelihoods_not.append([y_proba_not[1]])  # Probability of class 1 (fraud) for a client without a successful instrumentation history
    likelihoods_successful.append([y_proba_successful[1]])  # Probability of class 1 (fraud) for a client with a successful instrumentation history

# Convert likelihoods lists to NumPy arrays
likelihoods_not = np.array(likelihoods_not)
likelihoods_successful = np.array(likelihoods_successful)

# Calculate the mean likelihoods for each group
mean_likelihood_not = np.mean(likelihoods_not)
mean_likelihood_successful = np.mean(likelihoods_successful)

print("Mean likelihood for a client without a successful instrumentation history:", mean_likelihood_not)
print("Mean likelihood for a client with a successful instrumentation history:", mean_likelihood_successful)

Mean likelihood for a client without a successful instrumentation history: 0.35310968828559536
Mean likelihood for a client with a successful instrumentation history: 0.29133165888561946


In [11]:
# Age test
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_25 = []
likelihoods_35 = []
likelihoods_45 = []
likelihoods_55 = []
likelihoods_65 = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for age 25
    X_sample_25 = X_sample.copy()
    X_sample_25['persoon_leeftijd_bij_onderzoek'] = 25
    y_pred_25 = testing_session.run(None, {'X': X_sample_25.values.astype(np.float32)})

    # Make predictions for age 35
    X_sample_35 = X_sample.copy()
    X_sample_35['persoon_leeftijd_bij_onderzoek'] = 35
    y_pred_35 = testing_session.run(None, {'X': X_sample_35.values.astype(np.float32)})
    
    # Make predictions for age 45
    X_sample_45 = X_sample.copy()
    X_sample_45['persoon_leeftijd_bij_onderzoek'] = 45
    y_pred_45 = testing_session.run(None, {'X': X_sample_45.values.astype(np.float32)})
    
    # Make predictions for age 55
    X_sample_55 = X_sample.copy()
    X_sample_55['persoon_leeftijd_bij_onderzoek'] = 55
    y_pred_55 = testing_session.run(None, {'X': X_sample_55.values.astype(np.float32)})
    
    # Make predictions for age 65
    X_sample_65 = X_sample.copy()
    X_sample_65['persoon_leeftijd_bij_onderzoek'] = 65
    y_pred_65 = testing_session.run(None, {'X': X_sample_65.values.astype(np.float32)})
    
    # Append the likelihoods for both groups
    likelihoods_25.append([y_pred_25[1][0][1]])
    likelihoods_35.append([y_pred_35[1][0][1]])
    likelihoods_45.append([y_pred_45[1][0][1]])
    likelihoods_55.append([y_pred_55[1][0][1]])
    likelihoods_65.append([y_pred_65[1][0][1]])

    # Check if predictions are the same
    if y_pred_25 == y_pred_65:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples


# Convert likelihoods lists to NumPy arrays
likelihoods_25 = np.array(likelihoods_25)
likelihoods_35 = np.array(likelihoods_35)
likelihoods_45 = np.array(likelihoods_45)
likelihoods_55 = np.array(likelihoods_55)
likelihoods_65 = np.array(likelihoods_65)

# Calculate the mean likelihoods for each group
mean_likelihood_25 = np.mean(likelihoods_25)
mean_likelihood_35 = np.mean(likelihoods_35)
mean_likelihood_45 = np.mean(likelihoods_45)
mean_likelihood_55 = np.mean(likelihoods_55)
mean_likelihood_65 = np.mean(likelihoods_65)

print("Fraction of cases where predictions are the same for age 25 and 65:", fraction_same_predictions)
print("Mean likelihood for a 25 year old:", mean_likelihood_25)
print("Mean likelihood for a 35 year old:", mean_likelihood_35)
print("Mean likelihood for a 45 year old:", mean_likelihood_45)
print("Mean likelihood for a 55 year old:", mean_likelihood_55)
print("Mean likelihood for a 65 year old:", mean_likelihood_65)

Fraction of cases where predictions are the same for age 25 and 65: 7.908264136022143e-05
Mean likelihood for a 25 year old: 0.4007646592769193
Mean likelihood for a 35 year old: 0.37043097029094313
Mean likelihood for a 45 year old: 0.24840444514005727
Mean likelihood for a 55 year old: 0.22172107294926582
Mean likelihood for a 65 year old: 0.6351792189857148


In [12]:
# Gender test
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_men = []
likelihoods_women = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for men
    X_sample_men = X_sample.copy()
    X_sample_men['persoon_geslacht_vrouw'] = 0.0
    y_pred_men = testing_session.run(None, {'X': X_sample_men.values.astype(np.float32)})

    # Make predictions for women
    X_sample_women = X_sample.copy()
    X_sample_women['persoon_geslacht_vrouw'] = 1.0
    y_pred_women = testing_session.run(None, {'X': X_sample_women.values.astype(np.float32)})
    
    
    # Append the likelihoods for both groups
    likelihoods_men.append([y_pred_men[1][0][1]])
    likelihoods_women.append([y_pred_women[1][0][1]])

    # Check if predictions are the same
    if y_pred_men == y_pred_women:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

# Convert likelihoods lists to NumPy arrays
likelihoods_men = np.array(likelihoods_men)
likelihoods_women = np.array(likelihoods_women)

# Calculate the mean likelihoods for each group
mean_likelihood_men = np.mean(likelihoods_men)
mean_likelihood_women = np.mean(likelihoods_women)

print("Fraction of cases where predictions are the same for men and women:", fraction_same_predictions)
print("Mean likelihood for a man:", mean_likelihood_men)
print("Mean likelihood for a woman:", mean_likelihood_women)


Fraction of cases where predictions are the same for men and women: 0.0
Mean likelihood for a man: 0.09018988074781208
Mean likelihood for a woman: 0.5208941890393005


In [13]:
# Language test 
# persoonlijke_eigenschappen_spreektaal_anders
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)


# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for not
    X_sample_not = X_sample.copy()
    X_sample_not['persoonlijke_eigenschappen_spreektaal_anders'] = 0.0
    y_pred_not = testing_session.run(None, {'X': X_sample_not.values.astype(np.float32)})

    # Make predictions for other language
    X_sample_other = X_sample.copy()
    X_sample_other['persoonlijke_eigenschappen_spreektaal_anders'] = 1.0
    y_pred_other = testing_session.run(None, {'X': X_sample_other.values.astype(np.float32)})

    # Check if predictions are the same
    if y_pred_not == y_pred_other:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

print("Fraction of cases where predictions are the same for dutch speakers and non-dutch speakers:", fraction_same_predictions)

Fraction of cases where predictions are the same for dutch speakers and non-dutch speakers: 1.0


In [14]:
# Language test
# persoonlijke_eigenschappen_spreektaal [57  0 96 73 99  2 95 70  3  1 19 59  4 25 61 14  9  5]
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_language_0 = []
likelihoods_language_4 = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()
    
    # Make predictions for language 0
    X_sample_language_0 = X_sample.copy()
    X_sample_language_0['persoonlijke_eigenschappen_spreektaal'] = 0
    y_pred_language_0 = testing_session.run(None, {'X': X_sample_language_0.values.astype(np.float32)})

    # Make predictions for language 4
    X_sample_language_4 = X_sample.copy()
    X_sample_language_4['persoonlijke_eigenschappen_spreektaal'] = 4
    y_pred_language_4 = testing_session.run(None, {'X': X_sample_language_4.values.astype(np.float32)})
    
    # Append the likelihoods for both groups
    likelihoods_language_0.append([y_pred_language_0[1][0][1]])
    likelihoods_language_4.append([y_pred_language_4[1][0][1]])

    # Check if predictions are the same
    if y_pred_language_0 == y_pred_language_4:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

# Convert likelihoods lists to NumPy arrays
likelihoods_language_0 = np.array(likelihoods_language_0)
likelihoods_language_4 = np.array(likelihoods_language_4)

# Calculate the mean likelihoods for each group
mean_likelihoods_language_0 = np.mean(likelihoods_language_0)
mean_likelihood_language_4 = np.mean(likelihoods_language_4)

print("Fraction of cases where predictions are the same for unknown language speakers and known language speakers:", fraction_same_predictions)
print("Mean likelihood for a client speaking unknown language:", mean_likelihoods_language_0)
print("Mean likelihood for a client speaking language 4:", mean_likelihood_language_4)

Fraction of cases where predictions are the same for unknown language speakers and known language speakers: 0.0007908264136022143
Mean likelihood for a client speaking unknown language: 0.3105974809932068
Mean likelihood for a client speaking language 4: 0.27691621925105875


In [15]:
# Children test
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for no children
    X_sample_not = X_sample.copy()
    X_sample_not['relatie_kind_heeft_kinderen'] = 0.0
    y_pred_not = testing_session.run(None, {'X': X_sample_not.values.astype(np.float32)})

    # Make predictions for client having children
    X_sample_children = X_sample.copy()
    X_sample_children['relatie_kind_heeft_kinderen'] = 1.0
    y_pred_children = testing_session.run(None, {'X': X_sample_children.values.astype(np.float32)})

    # Check if predictions are the same
    if y_pred_not == y_pred_children:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

print("Fraction of cases where predictions are the same for people with or without children:", fraction_same_predictions)

Fraction of cases where predictions are the same for people with or without children: 1.0


In [20]:
# Children test
# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_0_children = []
likelihoods_3_children = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for 0 children
    X_sample_0_children = X_sample.copy()
    X_sample_0_children['relatie_kind_huidige_aantal'] = 0.0
    y_pred_0_children = testing_session.run(None, {'X': X_sample_0_children.values.astype(np.float32)})

    # Make predictions for 3 children
    X_sample_3_children = X_sample.copy()
    X_sample_3_children['relatie_kind_huidige_aantal'] = 3.0
    y_pred_3_children = testing_session.run(None, {'X': X_sample_3_children.values.astype(np.float32)})
    
    # Append the likelihoods for both groups
    likelihoods_0_children.append([y_pred_0_children[1][0][1]])
    likelihoods_3_children.append([y_pred_3_children[1][0][1]])

    # Check if predictions are the same
    if y_pred_0_children == y_pred_3_children:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

# Convert likelihoods lists to NumPy arrays
likelihoods_0_children = np.array(likelihoods_0_children)
likelihoods_3_children = np.array(likelihoods_3_children)

# Calculate the mean likelihoods for each group
mean_likelihood_0_children = np.mean(likelihoods_0_children)
mean_likelihood_3_children = np.mean(likelihoods_3_children)

print("Fraction of cases where predictions are the same for people with 3 children or without children:", fraction_same_predictions)
print("Mean likelihood for a client with 0 children:", mean_likelihood_0_children)
print("Mean likelihood for a client with 3 children:", mean_likelihood_3_children)

Fraction of cases where predictions are the same for people with 3 children or without children: 0.00039541320680110717
Mean likelihood for a client with 0 children: 0.2504962330705425
Mean likelihood for a client with 3 children: 0.30818252696874066


In [17]:
"adres", "woonadres", "verzendadres", "buurt", "wijk", "plaats", "persoon_geslacht_vrouw", "taal", "kind", "ontheffing", "leeftijd"

('adres',
 'woonadres',
 'verzendadres',
 'buurt',
 'wijk',
 'plaats',
 'persoon_geslacht_vrouw',
 'taal',
 'kind',
 'ontheffing',
 'leeftijd')

In [18]:
# Neighbourhood test - Find neighbourhoods with low immigrant prcentage vs high and check results
# Charlois - higher non-western population - 51% https://opendata.cbs.nl/#/CBS/nl/dataset/85618NED/table?dl=699B7


# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_neighbourhood_Charlois = []
likelihoods_neighbourhood_not = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for clients most recently living in Charlois
    X_sample_neighbourhood_Charlois = X_sample.copy()
    X_sample_neighbourhood_Charlois['adres_recentste_wijk_charlois'] = 1.0
    y_pred_neighbourhood_Charlois = testing_session.run(None, {'X': X_sample_neighbourhood_Charlois.values.astype(np.float32)})

    # Make predictions for clients most recently living in somewhere else
    X_sample_neighbourhood_not = X_sample.copy()
    X_sample_neighbourhood_not['adres_recentste_wijk_charlois'] = 0.0
    y_pred_neighbourhood_not = testing_session.run(None, {'X': X_sample_neighbourhood_not.values.astype(np.float32)})
    
    
    # Append the likelihoods for both groups
    likelihoods_neighbourhood_Charlois.append([y_pred_neighbourhood_Charlois[1][0][1]])
    likelihoods_neighbourhood_not.append([y_pred_neighbourhood_not[1][0][1]])

    # Check if predictions are the same
    if y_pred_neighbourhood_Charlois == y_pred_neighbourhood_not:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

# Convert likelihoods lists to NumPy arrays
likelihoods_neighbourhood_Charlois = np.array(likelihoods_neighbourhood_Charlois)
likelihoods_neighbourhood_not = np.array(likelihoods_neighbourhood_not)

# Calculate the mean likelihoods for each group
mean_likelihood_neighbourhood_Charlois = np.mean(likelihoods_neighbourhood_Charlois)
mean_likelihood_neighbourhood_not = np.mean(likelihoods_neighbourhood_not)

print("Fraction of cases where predictions are the same for Charlois residents or non-residents:", fraction_same_predictions)
print("Mean likelihood for a client living most recently in Charlois:", mean_likelihood_neighbourhood_Charlois)
print("Mean likelihood for a client living most recently somewhere else:", mean_likelihood_neighbourhood_not)


Fraction of cases where predictions are the same for Charlois residents or non-residents: 0.0007908264136022143
Mean likelihood for a client living most recently in Charlois: 0.2597466821772348
Mean likelihood for a client living most recently somewhere else: 0.29509847586029553


In [19]:
# Neighbourhood test
# Prins Alexander - lower non-western population - 33% https://opendata.cbs.nl/#/CBS/nl/dataset/85618NED/table?dl=699B7

# Initialize variables to store counts
same_predictions_count = 0
total_samples = len(X)

likelihoods_neighbourhood_PA = []
likelihoods_neighbourhood_notPA = []

# Iterate through each sample in the test set
for index, row in X.iterrows():
    # Convert the row to a DataFrame to ensure it's a DataFrame object
    X_sample = pd.DataFrame(row).transpose()

    # Make predictions for clients most recently living in Prins Alexander
    X_sample_neighbourhood_PA = X_sample.copy()
    X_sample_neighbourhood_PA['adres_recentste_wijk_prins_alexa'] = 1.0
    y_pred_neighbourhood_PA = testing_session.run(None, {'X': X_sample_neighbourhood_PA.values.astype(np.float32)})

    # Make predictions for clients most recently living somewhere else
    X_sample_neighbourhood_notPA = X_sample.copy()
    X_sample_neighbourhood_notPA['adres_recentste_wijk_prins_alexa'] = 0.0
    y_pred_neighbourhood_notPA = testing_session.run(None, {'X': X_sample_neighbourhood_notPA.values.astype(np.float32)})
    
    
    # Append the likelihoods for both groups
    likelihoods_neighbourhood_PA.append([y_pred_neighbourhood_PA[1][0][1]])
    likelihoods_neighbourhood_notPA.append([y_pred_neighbourhood_notPA[1][0][1]])

    # Check if predictions are the same
    if y_pred_neighbourhood_PA == y_pred_neighbourhood_notPA:
        same_predictions_count += 1

# Calculate the fraction of cases where the predictions are the same
fraction_same_predictions = same_predictions_count / total_samples

# Convert likelihoods lists to NumPy arrays
likelihoods_neighbourhood_PA = np.array(likelihoods_neighbourhood_PA)
likelihoods_neighbourhood_notPA = np.array(likelihoods_neighbourhood_notPA)

# Calculate the mean likelihoods for each group
mean_likelihood_neighbourhood_PA = np.mean(likelihoods_neighbourhood_PA)
mean_likelihood_neighbourhood_notPA = np.mean(likelihoods_neighbourhood_notPA)

print("Fraction of cases where predictions are the same for Prins Alexander residents or non-residents:", fraction_same_predictions)
print("Mean likelihood for a client living most recently in Prins Alexander:", mean_likelihood_neighbourhood_PA)
print("Mean likelihood for a client living most recently somewhere else:", mean_likelihood_neighbourhood_notPA)


Fraction of cases where predictions are the same for Prins Alexander residents or non-residents: 0.00023724792408066428
Mean likelihood for a client living most recently in Prins Alexader: 0.3587440191497441
Mean likelihood for a client living most recently somewhere else: 0.2887596961913631
