# Preprocessing Notebook

This notebook cleans the raw data, imputes missing values, and derives the labels (sustained response and treatment response) used downstream.

**Note:** The preprocessing steps can be modified to suit the specific data and labeling criteria.

**Author:** Fatemeh Salehi (fatemeh.salehihafshejani@fau.de)

# Libraries

In [1]:
# Load libraries
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import csv
import datetime
import datetime as dt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve, CalibrationDisplay
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from utils import *
import torch
import wget
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import shap
import matplotlib.pyplot as plt
import shap
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc

#matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms

# Show up to 100 columns when displaying wide DataFrames.
# (A preceding call that set this option to None was overridden immediately by
# this one, so only the effective setting is kept.)
pd.set_option('display.max_columns', 100)



  from .autonotebook import tqdm as notebook_tqdm


# Data Cleaning

In [None]:
# --- Load the raw export ------------------------------------------------------
df = pd.read_csv('bDMARD.csv', delimiter=';')
# Replace every comma inside text cells by a dot (presumably decimal commas —
# confirm against the raw export) so numeric columns can be cast to float later.
df = df.replace(to_replace=',', value='.', regex=True)
# Cells that are exactly 'every x days' / 'every x weeks' are relabelled.
df = df.replace({'every x days': 'x times a day', 'every x weeks': 'x times a week'})

# --- Visit dates --------------------------------------------------------------
# Unparseable dates become NaT and those rows are dropped.
# NOTE(review): no explicit format/dayfirst is given — confirm the date layout of the CSV.
df['Visit_date'] = pd.to_datetime(df['Visit_date'], errors='coerce')

# Keep dated visits only, in chronological order per patient. sort_values returns
# a new frame, so the column assignments below do not write into a filtered view.
df = df[df['Visit_date'].notna()].sort_values(by=['tptID', 'Visit_date'])

# --- Categorical encodings ----------------------------------------------------
# Gender: f -> 1, m -> 0 (any other value becomes NaN).
df['Gender'] = df['Gender'].map(dict(f=1, m=0))

# Every column that contains a 'yes' or 'no' is mapped yes -> 1, no -> 0
# (any other value in such a column becomes NaN).
df_columns_yn = df.columns[df.isin(['yes', 'no']).any()]
for col_d in df_columns_yn:
    df[col_d] = df[col_d].map(dict(yes=1, no=0))

# List of diseases
list_disease = ['Osteoarthritis', 'Asthma', 'Uveitis', 'Hypertension', 'Chronic_renal_insufficiency', 'COPD', 'Depression', 'Diabetes', 'Inflammatory_bowel_disease', 'Fat_metabolism_disorder', 'Gout', 'Heart_attack', 'Coronary_heart_disease', 'Osteoporosis', 'Periodontitis', 'Thyroid_disease', 'Thrombosis']


# --- Drop visits before the first bDMARD record --------------------------------
def _from_first_bdmard(visits):
    """Return one patient's visits from the first row with a recorded bDMARD onwards.

    If the patient has no bDMARD record at all, idxmax() falls back to the first
    row, so the complete history is kept.
    """
    return visits.loc[visits['bDMARD'].notna().idxmax():]


# The groups are iterated explicitly (instead of DataFrameGroupBy.apply) so the
# 'tptID' column stays part of every group regardless of the pandas version.
df = pd.concat(
    [_from_first_bdmard(visits) for _, visits in df.groupby('tptID')]
).reset_index(drop=True)

# --- Time between visits ------------------------------------------------------
DAYS_PER_MONTH = 30.44  # divisor used to express day counts in months

# Days since the patient's previous visit (NaN on the first visit).
df['DateDiff'] = df.groupby('tptID')['Visit_date'].diff().dt.days

# Time since the patient's first remaining visit, as a timedelta ...
df['DateDiff_N'] = df['Visit_date'] - df.groupby('tptID')['Visit_date'].transform('min')

# ... and in months, rounded to one decimal place.
df['DateDiff_NM'] = (df['DateDiff_N'].dt.days / DAYS_PER_MONTH).round(1)

# Setting 0 for before and 1 for after the first announcement of disease
def set_disease(df, disease):
    """Turn per-visit disease reports into a "diagnosed so far" flag.

    For every column in ``disease`` the rows before the first value equal to 1
    become 0, and the row of that first report together with every later row
    becomes 1. A column that never contains 1 becomes all 0. Missing values are
    overwritten by the same rule.

    The rows of ``df`` are expected to be in chronological order for a single
    patient; the position of the first report is determined by row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Visits of one patient.
    disease : iterable of str
        Names of the disease columns to rewrite.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the disease columns rewritten; the input frame is
        left untouched, as required for functions used with groupby.
    """
    out = df.copy()
    for col in disease:
        # cummax() keeps the flag raised from the first report onwards. Working
        # on row position (not on an inclusive .loc[:idx] label slice) keeps the
        # first-report row itself at 1 instead of resetting it to 0.
        out[col] = out[col].eq(1).cummax().astype(int)
    return out

# Apply set_disease to every patient separately. The groups are iterated
# explicitly (instead of DataFrameGroupBy.apply) so the 'tptID' column stays part
# of every group regardless of the pandas version.
df = pd.concat([set_disease(visits, list_disease) for _, visits in df.groupby('tptID')])

# Height / weight: carry the last known value forward within a patient, then
# back-fill the visits that precede the first measurement.
body_cols = ['Height_cm', 'Weight_kg']
df[body_cols] = df.groupby('tptID')[body_cols].ffill()
df[body_cols] = df.groupby('tptID')[body_cols].bfill()

# Patients without any height / weight measurement get fixed per-gender defaults
# (Gender 0 = m, 1 = f).
default_body_measures = {
    0: {'Height_cm': 177, 'Weight_kg': 86},
    1: {'Height_cm': 165, 'Weight_kg': 76},
}
for gender, defaults in default_body_measures.items():
    for col, value in defaults.items():
        df.loc[df[col].isna() & (df['Gender'] == gender), col] = value

# Medication columns become "taken" flags: 1 if anything is recorded, else 0.
for drug in ['bDMARD', 'tsDMARD', 'csDMARD']:
    df[drug] = df[drug].notna().astype(int)

# Cast the listed columns to float.
dtype_object = ['Weight_kg', 'HAQ_Score', 'DAS28BSG_Score', 'DAS28CRP_Score', 'CDAI_Score', 'SDAI_Score', 'CRP_mg_l', 'RF', 'CCP', 'bDMARD', 'bDMARD_dose_mg', 'tsDMARD', 'tsDMARD_dose_mg', 'tsDMARD_interval', 'csDMARD', 'csDMARD_dose_mg', 'csDMARD_interval', 'Prednisolone_dose_current_mg', 'Prednisolone_dose_long-term_mg']
# A single astype call is used; a loop variable named `dt` would overwrite the
# `datetime as dt` alias imported at the top of the notebook.
df = df.astype({col: float for col in dtype_object})


def _binarize(values, threshold):
    """Return 1.0 where ``values >= threshold``, 0.0 below it and NaN where missing.

    Missing measurements stay missing (instead of being counted as positive) so
    the imputation step can still see and fill them.
    """
    return values.ge(threshold).astype(float).where(values.notna())


# Binarise the lab values: CRP in place, RF and CCP into new *_b columns.
df['CRP_mg_l'] = _binarize(df['CRP_mg_l'], 5.2)
df['RF_b'] = _binarize(df['RF'], 20)
df['CCP_b'] = _binarize(df['CCP'], 14)

# CCP should be considered 1 after the first time becoming 1 (positive)
def set_CCP(df, column):
    """Keep a flag at 1 from its first occurrence onwards.

    For every column named in ``column``, the row where the value first equals 1
    and all later rows are set to 1; rows before it are left as they are. A
    column that never contains 1 is returned unchanged.

    The rows of ``df`` are expected to be in chronological order for a single
    patient; "later" is determined by row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Visits of one patient.
    column : iterable of str
        Names of the flag columns to carry forward.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the flags carried forward; the input frame is left
        untouched, as required for functions used with groupby.
    """
    out = df.copy()
    for col in column:
        # True from the first row equal to 1 until the end of the frame.
        seen_positive = out[col].eq(1).cummax().astype(bool)
        out.loc[seen_positive, col] = 1
    return out

# Carry CCP positivity forward per patient. The binarised flag 'CCP_b' is used:
# the raw 'CCP' column holds the measured value, so testing it for == 1 would
# only match a titre of exactly 1 and then overwrite later measurements.
# The groups are iterated explicitly (instead of DataFrameGroupBy.apply) so the
# 'tptID' column stays part of every group regardless of the pandas version.
df = pd.concat([set_CCP(visits, ['CCP_b']) for _, visits in df.groupby('tptID')])

# Where csDMARD / tsDMARD was not taken, missing dose, interval and interval
# unit are filled with 0; rows where the drug was taken are left unchanged.
medicine_cs_ts = ['csDMARD', 'tsDMARD']
for drug in medicine_cs_ts:
    not_taken = df[drug] == 0
    for suffix in ('dose_mg', 'interval', 'interval_unit'):
        col = f'{drug}_{suffix}'
        df[col] = np.where(not_taken, df[col].fillna(0), df[col])

# Weekly dose per drug:
#   unit mentions 'day'  -> dose * interval * 7
#   unit mentions 'week' -> dose * interval
#   unit == 0 (not taken) or anything else -> 0 (np.select default)
# NOTE(review): these *_dose_mg_week columns are removed again by the drop below —
# confirm whether they are meant to be kept as features.
medicine = ['bDMARD', 'csDMARD', 'tsDMARD']
for drug in medicine:
    unit = df[f'{drug}_interval_unit']
    dose_per_interval = df[f'{drug}_dose_mg'] * df[f'{drug}_interval']
    conditions = [
        unit.str.contains('day', na=False),
        unit.str.contains('week', na=False),
        unit == 0,
    ]
    choices = [dose_per_interval * 7, dose_per_interval, 0]
    df[f'{drug}_dose_mg_week'] = np.select(conditions, choices)

# Body-mass index from weight (kg) and height (cm).
df['BMI'] = df['Weight_kg'] / ((df['Height_cm'] / 100) ** 2)
df = df.rename(columns={'DAS28BSG_Score': 'DAS28ESR_Score'})

# Remove helper columns and raw fields that are not passed on to imputation.
columns_to_drop = [
    'Weight_kg', 'Height_cm', 'DateDiff', 'DateDiff_N', 'Prednisolone_dose_current_mg',
    'bDMARD_dose_mg_week', 'csDMARD_dose_mg_week', 'tsDMARD_dose_mg_week', 'bDMARD_interval',
    'bDMARD_interval_unit', 'csDMARD_interval', 'csDMARD_interval_unit', 'bDMARD_dose_mg',
    'Visit_date', 'csDMARD_dose_mg', 'RF_unit', 'CCP_unit', 'tsDMARD_interval_unit',
    'Prednisolone_dose_long-term_mg',
]
df = df.drop(columns=columns_to_drop)


# Imputation

In [None]:
# Iterative imputer: at most 1000 rounds, stopping tolerance 0.001
# (both values can be tuned).
iter_imputer = IterativeImputer(tol=0.001, max_iter=1000)

# Fit on the cleaned frame and fill its missing values in one step. The result
# is a plain array, so it is wrapped in a DataFrame with the original column
# labels (the new frame gets a fresh default index).
# NOTE(review): every column of df, including the 'tptID' identifier, is passed
# to the imputer — confirm that this is intended.
df_imp = iter_imputer.fit_transform(df)
df_imputed = pd.DataFrame(data=df_imp, columns=df.columns)

# Sustained Label

In [None]:
import pandas as pd
import numpy as np

def _is_sustained(visits, das_threshold=3.2, start_month=6, max_gap_months=6, min_visits=3):
    """Decide whether one patient shows a sustained DAS28-ESR below ``das_threshold``.

    Consecutive visit pairs (i, i+1) are evaluated when
      * visit i lies more than ``start_month`` months after the first visit,
      * both DAS28-ESR values are present, and
      * the two visits are at most ``max_gap_months`` months apart.
    The patient counts as sustained when at least one pair was evaluated and
    every evaluated pair has both scores below ``das_threshold``.

    NOTE(review): two choices here should be confirmed against the labeling
    protocol: (1) every consecutive pair is checked, including the pairs formed
    by the last three visits; (2) a patient with no evaluable pair is labelled
    0 rather than 1.

    Parameters
    ----------
    visits : pandas.DataFrame
        Visits of one patient in chronological order, with the columns
        'DateDiff_NM' and 'DAS28ESR_Score'.

    Returns
    -------
    bool
    """
    if len(visits) < min_visits:
        return False

    months = visits['DateDiff_NM'].to_numpy()
    das = visits['DAS28ESR_Score'].to_numpy()

    evaluated = 0  # pairs that satisfy the timing / availability rules
    below = 0      # evaluated pairs with both scores under the threshold
    for i in range(len(visits) - 1):
        if not months[i] > start_month:
            continue
        if pd.isna(das[i]) or pd.isna(das[i + 1]):
            continue
        if months[i + 1] - months[i] > max_gap_months:
            continue
        evaluated += 1
        if das[i] < das_threshold and das[i + 1] < das_threshold:
            below += 1

    return evaluated > 0 and below == evaluated


data_f = df_imputed.groupby('tptID')

l = []                # tptIDs labelled as sustained
labelled_groups = []  # per-patient frames, concatenated once after the loop

for key_first, visits in data_f:
    # reset_index() (without drop) keeps the previous row labels in an 'index'
    # column, so the produced columns match what later cells receive.
    data_gr = visits.reset_index()

    # The counters live inside _is_sustained, so every patient starts from zero
    # and nothing carries over from the previous patient.
    sustained = int(_is_sustained(data_gr))
    data_gr['sustained'] = sustained
    if sustained:
        l.append(key_first)

    labelled_groups.append(data_gr)

data_new = pd.concat(labelled_groups, ignore_index=True) if labelled_groups else pd.DataFrame()

# Preview the result instead of printing the whole frame.
data_new.head()

# Response Label

In [None]:
# Cumulative follow-up windows: each frame keeps every visit whose DateDiff_NM is
# at most the given number of months, so the windows are nested, not disjoint.
# NOTE(review): `dataset` is not assigned in any cell shown above — confirm where
# it is defined before relying on Restart & Run All.
def _visits_within(frame, months):
    """Return the rows of ``frame`` with DateDiff_NM <= ``months``."""
    return frame[frame.DateDiff_NM <= months]


dataset_three_months = _visits_within(dataset, 3)
dataset_six_months = _visits_within(dataset, 6)
dataset_nine_months = _visits_within(dataset, 9)
dataset_twelve_months = _visits_within(dataset, 12)
dataset_fifteen_months = _visits_within(dataset, 15)
dataset_eighteen_months = _visits_within(dataset, 18)
dataset_twentyone_months = _visits_within(dataset, 21)
dataset_twentyfour_months = _visits_within(dataset, 24)

In [None]:
#Find out remission and effectivenss using EULAR criteria
def labeling(dataset):
    """Build one labelled row per patient from the first and last visit in ``dataset``.

    For every ``tptID`` with more than one visit, the first visit row is kept and
    extended with:

    * ``del_das``       - DAS28-ESR at the first visit minus DAS28-ESR at the last visit
    * ``effect_n``      - 0 if ``del_das`` < 1.2, else 1
    * ``remission``     - 1 if the last DAS28-ESR is < 2.6, else 0
    * ``effectiveness`` - 1 if the last DAS28-ESR is <= 3.2 (remission or low
      disease activity), else 0

    Patients with a single visit are left out. A missing score makes every
    comparison false, i.e. it yields effect_n = 1, remission = 0 and
    effectiveness = 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Visits in chronological order per patient, with at least the columns
        'tptID' and 'DAS28ESR_Score'.

    Returns
    -------
    pandas.DataFrame
        The columns of ``dataset``, followed by the former row labels (added by
        ``reset_index``) and the four label columns. If no patient has more than
        one visit, an empty frame with the columns of ``dataset`` is returned.
    """
    first_rows = []
    for _, visits in dataset.groupby('tptID'):
        if len(visits) <= 1:
            continue

        visits = visits.reset_index()
        das_current = visits['DAS28ESR_Score'].iloc[-1]
        del_das = visits['DAS28ESR_Score'].iloc[0] - das_current

        row = visits.iloc[[0]].copy()
        row['del_das'] = del_das
        row['effect_n'] = 0 if del_das < 1.2 else 1
        row['remission'] = 1 if das_current < 2.6 else 0
        row['effectiveness'] = 1 if das_current <= 3.2 else 0
        first_rows.append(row)

    if not first_rows:
        return pd.DataFrame(columns=dataset.columns)

    # Concatenate once: appending to a growing (initially empty, object-dtype)
    # frame inside the loop is quadratic and can distort the column dtypes.
    labelled = pd.concat(first_rows, ignore_index=True)

    # Input columns first, then the columns added here, in order of creation.
    added = [col for col in labelled.columns if col not in dataset.columns]
    return labelled[list(dataset.columns) + added]

In [None]:
# The endpoint is 6 months: only visits up to month 6 are used for labeling.
data_new_six = data_new[data_new.DateDiff_NM <= 6]

dataset_new_six = labeling(data_new_six).reset_index(drop=True)

# A patient without an effectiveness label cannot count as sustained: force
# 'sustained' to 0 wherever 'effectiveness' is 0. The mask covers every row,
# including the last one. The guard skips the step when labeling() returned no
# rows (and therefore no label columns).
if not dataset_new_six.empty:
    no_effect = dataset_new_six['effectiveness'] == 0
    dataset_new_six.loc[no_effect, 'sustained'] = 0

data_clean = dataset_new_six.copy()