<h3>Feature importance by gender</h3>

<h5>Load the dataset</h5>

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, RFE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import ADASYN

# load adhd dataset
df = pd.read_csv('questionnaire_dataset.csv', sep=';')
feature_cols = desired_columns = ['gender', 'group'] + [f'tr{i}' for i in range(1, 44)] + [f'dass{i}' for i in range(1, 22)]
df = df[feature_cols].copy()

<h5>Screening questionnaire based on previous analysis</h5>

In [17]:
# load the screening questionnaire 
screening_questions = pd.read_csv('screening_questions.csv', sep=';')
screening_questions

Unnamed: 0,Critère DSM-5,English Item (Author's suggestion),question_code,dsm5_criteria,symptom_category
0,B,I have trouble maintaining my attention at work.,tr4,Often has difficulty sustaining attention in t...,inattention
1,B,I have trouble staying focused during conversa...,tr6,Often has difficulty sustaining attention in t...,inattention
2,C,"My mind is often elsewhere, even when there is...",tr8,Often does not seem to listen when spoken to d...,inattention
3,D,"I have difficulty completing my tasks (work, h...",tr10,Often does not follow through on instructions ...,inattention
4,D,I have difficulty staying focused during my ac...,tr11,Often does not follow through on instructions ...,inattention
5,E,It is difficult for me to organize tasks that ...,tr14,Often has difficulty organizing tasks and acti...,inattention
6,F,I tend to avoid tasks that require sustained m...,tr15,"Often avoids, dislikes, or is reluctant to eng...",inattention
7,G,I often lose things I need for my work,tr19,Often loses things necessary for tasks or acti...,inattention
8,H,I am easily distracted by my environment,tr21,Is often easily distracted by extraneous stimu...,inattention
9,b,I often leave my seat unnecessarily during a m...,tr28,Often leaves seat in situations when remaining...,hyperactivity/impulsivity


<h5>Balance the dataset using ADASYN</h5>

In [11]:
adasyn = ADASYN(random_state=42)
X = df.copy().drop(columns=['gender'])
y = df['gender'].copy() # men = 1 / women = 0
# apply ADASYN oversampling to balance the dataset
X_balanced, y_balanced = adasyn.fit_resample(X, y)
balanced_df = pd.concat([y_balanced, X_balanced], axis=1)
balanced_df.describe()

Unnamed: 0,gender,group,tr1,tr2,tr3,tr4,tr5,tr6,tr7,tr8,...,dass12,dass13,dass14,dass15,dass16,dass17,dass18,dass19,dass20,dass21
count,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,...,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0,356.0
mean,0.502809,0.483146,3.553371,3.730337,4.797753,4.446629,3.904494,3.75,3.33427,4.280899,...,1.561798,1.435393,1.311798,1.019663,0.772472,1.429775,1.629213,1.095506,1.039326,1.205056
std,0.500696,0.500419,1.58513,1.523628,1.421602,1.404205,1.529754,1.670582,1.696078,1.618476,...,1.006515,0.87776,0.952965,0.971223,0.826617,1.017099,0.962766,1.049148,1.047397,1.09033
min,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,2.0,4.0,4.0,3.0,2.0,2.0,3.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,1.0,0.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,5.0,...,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
75%,1.0,1.0,5.0,5.0,6.0,5.0,5.0,5.0,5.0,6.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0
max,1.0,1.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


<h5>Split the balanced dataset by gender for train, test sets</h5>

In [14]:
# split the dataset by gender 
balanced_df_women = balanced_df[balanced_df['gender'] == 0]
balanced_df_men = balanced_df[balanced_df['gender'] == 1]

In [16]:
# split it further into features and target variable
X_women = balanced_df_women.copy().drop(columns=['group', 'gender']).filter(regex='tr')
y_women = balanced_df_women['group'].copy() # adhd = 1 / no adhd = -1

X_men = balanced_df_men.copy().drop(columns=['group', 'gender']).filter(regex='tr')
y_men = balanced_df_men['group'].copy() # adhd = 1 / no adhd = -1

In [20]:
# further filter the features to extract only the ones in the narrowed down version of the questionnaire
screening_qs_codes = screening_questions['question_code'].unique()
X_women = X_women.filter(items=screening_qs_codes, axis=1)
X_men = X_men.filter(items=screening_qs_codes, axis=1)

<h5>Split the data further into Train, Test and Validation sets</h5>
<p>
Further division of the dataset into training, test and validation set is a common technique used to improve models without data leakage. 

The training set is used to train the model.
The validation set is used to evaluate the performance of the model during training and fine-tune it's hyperparameters.

The test set is used to evaluate the final performance of the trained model. It represents unseen data that the model hasn't encountered during training or validation. Using a test set allows for unbiased estimate of the model's performance in real-world scenario.
</p>

In [26]:
# stratified splitting to preserve a balanced class split according to diagnosis
# women
X_train_val_women, X_test_women, y_train_val_women, y_test_women = train_test_split(X_women, y_women, test_size=0.1, stratify=y_women, random_state=42) 
X_train_women, X_val_women, y_train_women, y_val_women = train_test_split(X_train_val_women, y_train_val_women, test_size=0.25, stratify=y_train_val_women, random_state=42) 

# men
X_train_val_men, X_test_men, y_train_val_men, y_test_men = train_test_split(X_men, y_men, test_size=0.1, stratify=y_men, random_state=42) 
X_train_men, X_val_men, y_train_men, y_val_men = train_test_split(X_train_val_men, y_train_val_men, test_size=0.25, stratify=y_train_val_men, random_state=42) 

<h5>Scaling the data using StandardScaler</h5>

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled_women = scaler.fit_transform(X_train_women)
X_val_scaled_women = scaler.transform(X_val_women)
X_test_scaled_women = scaler.transform(X_test_women)

X_train_scaled_men = scaler.fit_transform(X_train_men)
X_val_scaled_men = scaler.transform(X_val_men)
X_test_scaled_men = scaler.transform(X_test_men)

<h3>Feature Importance for women</h3>

In [31]:
# 1. feature extraction chi2 and information gain
# chi2
select_chi2 = SelectKBest(score_func=chi2, k=all)
X_train_women_chi2 = select_chi2.fit_transform(X_train_scaled_women, y_train_women)

# information gain
mutual_info_scores = mutual_info_classif(X_train_scaled_women, y_train_women)
selected_features_indices = (-mutual_info_scores).argsort()
X_train_women_info_gain = X_train_women[:, selected_features_indices]


# 2. train classifiers with GridSearchCV (Logistic Regression, AdaBoost, Support Vector Machine, Random Forest, K-Nearest Neighbours)




# 3. analyse feature importance from the best perfoming classifier

ValueError: Input X must be non-negative.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Select relevant features (e.g., symptoms demonstrating gender-specific patterns)
feature_cols = ['gender'] + [f'tr{i}' for i in range(1, 44)] + [f'dass{i}' for i in range(1, 22)]
y = X_resampled_adasyn['group']  # adhd = 1 / no-adhd = 0
X = X_resampled_adasyn[feature_cols]

# Engineer new features capturing distinct symptom presentation between women and men (if needed)

X_women = X_resampled_adasyn[X_resampled_adasyn['gender'] == 0]
y_women = X[X['gender'] == 0]
X_men = X[X['gender'] == 1]
y_men = X[X['gender'] == 1]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using SelectKBest with chi-square test
k_best = SelectKBest(score_func=chi2, k=15)  # Select top 10 features
X_train_kbest = k_best.fit_transform(X_train, y_train)
X_test_kbest = k_best.transform(X_test)
selected_features_indices = k_best.get_support(indices=True)
selected_features = X.columns[selected_features_indices]

print(selected_features)

# Model development and evaluation
models = {
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'k-Nearest Neighbors': KNeighborsClassifier()
}

for model_name, model in models.items():
    # Train model
    model.fit(X_train_kbest, y_train)
    
    # Evaluate model
    y_pred = model.predict(X_test_kbest)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"{model_name}:")
    print(f"  Accuracy: {accuracy}")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1-score: {f1}")
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_kbest, y_train, cv=5)
    print(f"  Cross-validation scores: {cv_scores}")
    print(f"  Mean CV accuracy: {cv_scores.mean()}")
    print()


<h3>Feature Importance for men</h3>

In [None]:
# 1. feature extraction chi2 and information gain



# 2. train classifiers with GridSearchCV (Logistic Regression, AdaBoost, Support Vector Machine, Random Forest, K-Nearest Neighbours)




# 3. analyse feature importance from the best perfoming classifier