In [1]:
import os
import random
from collections import defaultdict

import numpy as np
import onnx
import onnxruntime as rt
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from skl2onnx import convert_sklearn
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.utils import resample

In [3]:
# Read the full training table, plus the 'feature' column from the first 20
# rows of the feature-importance file (presumably ordered by importance --
# TODO confirm).
# NOTE(review): sep='.' is an unusual delimiter; confirm it matches the file.
data = pd.read_csv('data/synth_data_for_training.csv')
selected_features = (
    pd.read_csv('data/feature_importance.csv', sep='.')
    .head(20)['feature']
)

# Seed the stdlib RNG that drives the random.sample() calls in this notebook.
random.seed(42)

# Shuffle every row reproducibly, then cut at 75%: the leading part becomes
# the training split, the remainder the test split.
data = data.sample(frac=1, random_state=42)
split_index = int(len(data) * 0.75)
train, test = data.iloc[:split_index], data.iloc[split_index:]

# Rebalance the training set with regard to language proficiency.
# Rows are grouped into buckets of `bucket_size` days based on
# 'persoonlijke_eigenschappen_dagen_sinds_taaleis'. Within each visited bucket
# all unchecked rows (checked == 0) are kept, and the checked rows are capped
# at `ratio` times the number of unchecked rows (sampled with the stdlib RNG).
x_0 = defaultdict(list)  # bucket index -> unchecked rows
x_1 = defaultdict(list)  # bucket index -> checked rows
x_new = []
bucket_size = 170 # days
ratio = 0.05 # ratio of checked to unchecked for each bucket

for i in range(len(train)):
    # Fetch the row once; every .iloc[i] call builds a fresh Series.
    row = train.iloc[i]
    index = int(row['persoonlijke_eigenschappen_dagen_sinds_taaleis'] / bucket_size)
    if row['checked'] == 0:
        x_0[index].append(row)
    else:
        x_1[index].append(row)

# NOTE(review): only bucket indices 0 .. min(number of unchecked buckets,
# number of checked buckets) - 1 are visited, so rows in any other bucket
# (higher or negative index) are dropped from the training set. This is kept
# as-is; confirm that it is intended.
for i in range(min(len(x_0), len(x_1))):
    # .get() avoids inserting empty buckets into the defaultdicts.
    unchecked = x_0.get(i, [])
    checked = x_1.get(i, [])
    x_new += unchecked
    # Guard against buckets without unchecked rows: dividing by
    # len(unchecked) there would raise ZeroDivisionError.
    if unchecked and len(checked) / len(unchecked) < ratio:
        # Already below the target ratio: keep every checked row.
        x_new += checked
        continue
    # Otherwise cap the checked rows; an empty unchecked bucket yields n == 0,
    # so no checked rows are kept for it.
    n = int(len(unchecked) * ratio)
    x_new += random.sample(checked, n)

# overwrite train set with new set
train = pd.DataFrame(x_new)

# TODO: @V changes here for other stages of the goodmodel pipeline



FileNotFoundError: [Errno 2] No such file or directory: 'data/feature_importance.csv'

In [None]:
# Names of all columns in the loaded data except the label column 'checked'.
column_list = data.columns.tolist()
column_list.remove("checked")

In [None]:
# Separate inputs from the 'checked' label. The training inputs are limited
# to the selected features, while the test inputs keep every column except
# the label.
X_train = train[selected_features]
y_train = train['checked']

X_test = test.drop(columns="checked")
y_test = test['checked']

In [None]:
threshold_age = 37  # age boundary used to split the training rows into two groups

In [None]:

# Split the training rows by age at investigation. Rows with
# age > threshold_age are kept unchanged; rows with age <= threshold_age are
# resampled with SMOTE below. Boolean masks are applied to X and y alike so
# both stay aligned.
age = X_train['persoon_leeftijd_bij_onderzoek']
over_mask = age > threshold_age
under_mask = age <= threshold_age

# Training data for individuals older than threshold_age
X_train_over_37 = X_train[over_mask]
y_train_over_37 = y_train[over_mask]
# Training data for individuals aged threshold_age or younger
X_train_under_37 = X_train[under_mask]
y_train_under_37 = y_train[under_mask]

# NOTE(review): desired_ratio is not used anywhere below; the effective target
# ratio of checked to unchecked rows is the 0.03 literal. Confirm which value
# is intended.
desired_ratio=0.04
minority_len = int(y_train_under_37.sum())  # number of checked (class 1) rows
majority_len = minority_len / 0.03           # class 0 size that makes class 1 ~3% of it

# Over-samplers can only add rows: requesting fewer rows than a class already
# has raises ValueError. Never ask for fewer class 0 rows than are present.
current_majority_len = int((y_train_under_37 == 0).sum())
sampling_strategy = {
    0: max(int(majority_len), current_majority_len),
    1: int(minority_len),
}

# Apply SMOTE (Synthetic Minority Over-sampling Technique). With the strategy
# above, class 1 keeps its size and synthetic rows are generated for class 0
# until it reaches the requested count. The fixed random_state makes the
# synthetic rows reproducible across runs.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_augmented_under_37, y_train_augmented_under_37 = smote.fit_resample(X_train_under_37, y_train_under_37)

# Concatenate the augmented <= threshold_age data with the untouched
# > threshold_age data; ignore_index=True gives a fresh 0..n-1 index.
X_train_augmented = pd.concat([X_train_augmented_under_37, X_train_over_37], ignore_index=True)
y_train_augmented = pd.concat([y_train_augmented_under_37, y_train_over_37], ignore_index=True)

#### Reshape augmented dataframe to match the original dataset

In [None]:
# Build an all-zero frame for every X_test column that is not a selected
# feature, aligned to the rows of the augmented training data.
empty_train_df = pd.DataFrame(
    0,
    index=X_train_augmented.index,
    columns=X_test.columns.difference(selected_features),
)
# Put the selected features and the zero-filled columns side by side, then
# order the columns exactly like X_test.
X_train_new = pd.concat([X_train_augmented, empty_train_df], axis=1)[X_test.columns]

In [None]:
# Two-step model: a VarianceThreshold selector with default settings followed
# by a gradient boosting classifier built from 100 depth-1 trees.
classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1,
    max_depth=1,
    random_state=0,
)
selector = VarianceThreshold()
pipeline = Pipeline([
    ('feature selection', selector),
    ('classification', classifier),
])
pipeline.fit(X_train_new, y_train_augmented)

# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)

In [None]:
# Convert the fitted pipeline to ONNX. The single input 'X' is a float tensor
# with a free batch dimension and one column per X_test feature.
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X_test.shape[1])))],
    target_opset=12)

model_name = 'goodmodel'
model_path = f'model/{model_name}.onnx'
# onnx.save does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
onnx.save(onnx_model, model_path)

# Reload the saved file with onnxruntime and run it on the test inputs.
# Passing None as the output list requests all model outputs.
new_session = rt.InferenceSession(model_path)
y_pred_onnx = new_session.run(None, {'X': X_test.values.astype(np.float32)})

# The first output is used as the predicted labels.
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)