In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

## Bad model specification

1. Take the top 20 features of the dataset. This should be an sklearn pipeline step.
2. Check the distribution of men vs. women in this dataset of 20 features, and whether they were checked or not.
3. Train a model on this dataset with the balanced men vs. women categories across checked and unchecked and run [REDACTED TESTS]
4. Take a biased sample of the data where there is a high proportion of checked women, and the proportion of men remains the same. This can be done by removing a large share of the "unchecked" women entries, therefore leaving a disproportionate amount of checked women.
5. Test [REDACTED]
6. Put all of this into an sklearn pipeline and convert to onnx.

### 1. Top 20 features of the dataset

In [15]:
# Load the synthetic training data.
df = pd.read_csv('data/synth_data_for_training.csv')

# Names of the highest-ranked features, taken from the first 50 rows of the
# feature-importance file.
# NOTE(review): the notebook text talks about the "top 20" features, but 50
# rows are taken here -- confirm which number is intended.
selected_features = (
    pd.read_csv('data/feature_importance.csv', sep='.')
    .head(50)['feature']
)

# 'checked' is the target; every remaining column is a model input.
y = df['checked']
X = df.drop(columns=['checked'])

print(X.shape)
print(X.shape)

# Cast all inputs to float32.
X = X.astype(np.float32)

# Hold out 25% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

(12645, 315)
(12645, 315)


#### Sklearn pipeline step to filter columns

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnFilterTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a fixed set of columns.

    Parameters
    ----------
    columns : list-like
        Labels of the columns to keep. For DataFrame input they are used as
        column labels. For numpy input they are resolved to positions (see
        ``transform``).

    Attributes
    ----------
    input_columns_ : list or None
        Column labels of the DataFrame passed to ``fit``, in order. ``None``
        when ``fit`` received something other than a DataFrame.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record the input column order (if available) and return ``self``.

        Nothing is learned from the data. The column order is only remembered
        so that a later numpy array can be filtered by column *name*.
        """
        self.input_columns_ = list(X.columns) if isinstance(X, pd.DataFrame) else None
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to ``self.columns``.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            DataFrames are filtered by label. Arrays are filtered by position:
            a label seen during ``fit`` maps to its position in the fitted
            DataFrame, anything else is converted with ``int()`` and used as a
            positional index.

        Raises
        ------
        TypeError
            If ``X`` is neither a DataFrame nor a numpy array.
        ValueError
            If an array is passed and a column is neither a label seen during
            ``fit`` nor convertible to ``int``.
        """
        if isinstance(X, pd.DataFrame):
            return X[self.columns]
        if isinstance(X, np.ndarray):
            return X[:, self._column_positions()]
        raise TypeError("Input must be a pandas DataFrame or a numpy array")

    def _column_positions(self):
        """Translate ``self.columns`` into positional indices for array input."""
        fitted_labels = getattr(self, "input_columns_", None)
        position_of = {}
        for position, label in enumerate(fitted_labels or []):
            # Keep the first occurrence if a label is duplicated.
            position_of.setdefault(label, position)

        positions = []
        for col in self.columns:
            if col in position_of:
                positions.append(position_of[col])
            else:
                # No fitted label to go by: treat the entry as a position.
                positions.append(int(col))
        return positions

    def get_feature_names_out(self, input_features=None):
        """Return the kept column labels as a numpy array.

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return np.array(self.columns)

### 2. Check the distribution (checked vs. unchecked for each gender)

In [17]:
# Share (in %) of unchecked (0) vs. checked (1) rows within each value of
# 'persoon_geslacht_vrouw'; one row per gender value, one column per outcome.
checked_counts = (
    df.groupby('persoon_geslacht_vrouw')['checked']
    .value_counts(normalize=True)
    .unstack()
    * 100
)
checked_counts

checked,0,1
persoon_geslacht_vrouw,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89.865485,10.134515
1,90.135999,9.864001


### 3. Train a balanced classifier with regard to gender and run metamorphic tests

In [18]:
# Gradient boosting over 100 depth-1 trees; the seed is fixed for reproducibility.
classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)

In [19]:
# Pipeline step that restricts the input to the selected feature columns.
column_filter_transformer = ColumnFilterTransformer(columns=selected_features)

In [20]:
# Column filtering first, then the gradient-boosting classifier.
pipeline = Pipeline(
    steps=[
        ('feature selection', column_filter_transformer),
        ('classification', classifier),
    ]
)

In [21]:
# Fit the pipeline on the training split.
pipeline.fit(X_train, y=y_train)

In [22]:
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9430740037950665


#### Testing the model: REDACTED

REDACTED

REDACTED

REDACTED

### 4. Train a biased and discriminatory classifier by using a biased subsample of the data

In [23]:
# Reload the data and redo the split so this section starts from the same
# state as step 1 (same files, same split parameters).
df = pd.read_csv('data/synth_data_for_training.csv')

# NOTE(review): the notebook text talks about the "top 20" features, but 50
# rows are taken here -- confirm which number is intended.
selected_features = (
    pd.read_csv('data/feature_importance.csv', sep='.')
    .head(50)['feature']
)

# Split the frame into target ('checked') and model inputs (everything else).
y = df['checked']
X = df.drop(columns=['checked'])

print(X.shape)
print(X.shape)

# Cast all inputs to float32.
X = X.astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

(12645, 315)
(12645, 315)


In [24]:
# Keep every male row so the male checked/unchecked ratio is unchanged.
males = X_train[X_train['persoon_geslacht_vrouw'] == 0]

# Keep every female row whose target is 1 (checked).
female_target_1 = X_train[(X_train['persoon_geslacht_vrouw'] == 1) & (y_train == 1)]

# Female rows whose target is 0 (unchecked) ...
female_target_0 = X_train[(X_train['persoon_geslacht_vrouw'] == 1) & (y_train == 0)]

# ... of which only 10% are kept; the seed is fixed for reproducibility.
female_target_0_sampled = female_target_0.sample(frac=0.1, random_state=42)

# Build the biased training set. The down-sampled unchecked women must be
# part of it: previously the sample was computed but never concatenated, so
# *all* unchecked women were dropped instead of most of them.
X_train_filtered = pd.concat([males, female_target_1, female_target_0_sampled])

# Select the labels belonging to the rows that were kept.
y_train_filtered = y_train.loc[X_train_filtered.index]

In [25]:
# Use a fresh, unfitted estimator with the same hyper-parameters as
# `classifier`. Reusing the `classifier` object itself would refit the very
# instance that is also the last step of `pipeline`, silently replacing the
# model trained in step 3.
pipeline_defect = Pipeline(
    steps=[
        ('classification', GradientBoostingClassifier(**classifier.get_params(deep=False))),
    ]
)

In [26]:
# Fit the classifier-only pipeline on the gender-biased training subset.
pipeline_defect.fit(X_train_filtered, y=y_train_filtered)

### 5. Make sure the REDACTED tests fail

REDACTED

REDACTED

REDACTED

### 6. Convert and export onnx

In [27]:
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Convert the fitted pipeline to ONNX. The model takes a single float tensor
# named 'X' with an unconstrained first dimension and one column per column
# of X.
onnx_model = convert_sklearn(
    pipeline_defect,
    initial_types=[
        ('X', FloatTensorType((None, X.shape[1]))),
    ],
    target_opset=12,
)

# Write the converted model to disk.
onnx.save(onnx_model, "model/defect_2.onnx")