In [None]:
import pandas as pd

# Location of the YBT CSV export.
# NOTE(review): this is an absolute path on one machine; consider a configurable
# data directory so the notebook runs elsewhere.
ybt_csv_path = '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/YBT.csv'

# Read the CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(ybt_csv_path)

# Preview the first rows as a quick sanity check on the load.
first_rows = df.head()
print(first_rows)

In [None]:
# Derive the binary target only when the loaded data does not already carry it.
target_already_present = 'autism_diagnosis' in df.columns
if not target_already_present:
    # Missing diagnoses become '' so they can never match; the text is
    # lower-cased first, which makes the substring match case-insensitive.
    diagnosis_text = df['diagnosis'].fillna('').str.lower()
    mentions_autism = diagnosis_text.str.contains('autism')
    # Store the boolean match as 1 (mentions autism) / 0 (does not).
    df['autism_diagnosis'] = mentions_autism.astype(int)

In [None]:
# Count missing values in the columns the analysis relies on.

# Questionnaire item columns: items 1..10 for each of the three prefixes.
item_numbers = range(1, 11)
aq_cols = ['aq_' + str(i) for i in item_numbers]
sq_cols = ['sq10_' + str(i) for i in item_numbers]
eq_cols = ['eq10_' + str(i) for i in item_numbers]

# Demographic columns.
demo_cols = ['age', 'sex', 'gender', 'ethn', 'hand', 'country']

# Target column.
target_col = 'autism_diagnosis'

# All columns needed: questionnaire items, demographics, then the target.
all_cols = [*aq_cols, *sq_cols, *eq_cols, *demo_cols, target_col]

# Number of missing entries per needed column.
missing_counts = df.loc[:, all_cols].isna().sum()

print(missing_counts)

In [None]:
# Overview of the loaded data: dtypes / non-null counts, per-column missing
# counts, and the full list of column names.
#
# A notebook cell only renders the value of its final expression, so the
# missing-value counts are printed explicitly; as a bare intermediate
# expression they were computed and then silently discarded.

df.info()  # prints its own summary
print(df.isnull().sum())  # missing entries per column
df.columns  # last expression: shown as the cell output

In [None]:
# Inspect the suspicious free-text column 'diagnosis_69_TEXT'.
#
# A notebook cell only renders the value of its final expression, so each
# intermediate inspection is printed explicitly; as bare expressions the
# first four results were computed and then silently discarded.

# Non-missing entries of the column, reused by several checks below.
text_values = df['diagnosis_69_TEXT'].dropna()

print(text_values.head())                        # first few non-missing entries
print(df['diagnosis_69_TEXT'].notnull().sum())   # how many rows have an entry
print(text_values.unique())                      # distinct entries
print(text_values.value_counts())                # frequency of each entry

# Rows where all three diagnosis-related columns are present, side by side.
# Left as the last expression so it is shown as the cell output.
df[['diagnosis_69_TEXT', 'diagnosis', 'diagnosis_yes_no']].dropna().head(10)


In [None]:
# Print every column as "<value in row 0>: <column name>", in column order,
# to see what precedes diagnosis_69.
# NOTE(review): presumably row 0 holds the survey question text for each
# column — confirm against the raw export.
for column_name in df.columns:
    first_row_value = df.loc[0, column_name]
    print(f"{first_row_value}: {column_name}")
In [None]:
# Research question: can EQ, SQ and AQ be used to predict diagnosis?

# Build the binary target: 1 when the diagnosis text mentions "autism"
# (case-insensitively, via lower-casing), 0 otherwise.
diagnosis_lower = df['diagnosis'].fillna('').str.lower()  # missing -> '' so it never matches
df['autism_diagnosis'] = diagnosis_lower.str.contains('autism').astype(int)

# Class balance of the target.
target_counts = df['autism_diagnosis'].value_counts()
print(target_counts)

In [None]:
# Missing values are not dropped at this stage; they are imputed later,
# after the train/test split and before scaling.

# One-hot encode the categorical demographic columns. drop_first=True omits
# the first level of each variable so the dummy columns are not redundant.
categorical_cols = ['sex', 'gender', 'ethn', 'hand', 'country']
df_clean = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# The former `x = pd.get_dummies(x, drop_first=True)` line was removed: `x` is
# first defined in the train/test split cell, so on a fresh kernel this cell
# raised NameError, and the value was overwritten by that later cell anyway.

In [None]:
#EDA
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(8, 6))
class_counts = df_clean['autism_diagnosis'].value_counts()
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Autism Diagnosis Distribution')
plt.show()

In [None]:
# Columns excluded from the ML feature set. Names that are absent from the
# frame are skipped rather than raising (errors='ignore').
drop_cols = [
    'Progress',
    'Duration (in seconds)',
    'Finished',
    'RecordedDate',
    'ResponseId',
    'diagnosis',
    'diagnosis_69_TEXT',
    'diagnosis_yes_no',
    'English',
    'Q573_30_TEXT',
    'Q314_30_TEXT',
    'Q315_1_TEXT',
    'Q318_1_TEXT',
    'Q317_1_TEXT',
    'Q319_1_TEXT',
    'Q311_51_TEXT',
]
df_clean = df_clean.drop(columns=drop_cols, errors='ignore')

# Cast every boolean column to integers in one pass.
bool_cols = df_clean.select_dtypes(include=['bool']).columns
df_clean = df_clean.astype({name: int for name in bool_cols})

In [None]:
#train test split
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target; the target is the
# binary autism_diagnosis column.
x = df_clean.drop('autism_diagnosis', axis=1)
y = df_clean['autism_diagnosis']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The split is stratified on y so the train and test sets keep the same class
# proportions; the notebook treats the target as imbalanced, and an
# unstratified split can leave very few positives in the test set.
# NOTE(review): stratify requires at least two rows in every class.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Preprocess the features: bool -> int, mean-impute numeric columns, drop
# low-variance columns, then standardise. Every transformer is fitted on the
# training set only and merely applied to the test set.

# Work on private copies so the column assignments below do not write into
# the frames returned by train_test_split (avoids SettingWithCopyWarning).
x_train = x_train.copy()
x_test = x_test.copy()

# 1. convert boolean columns to integers
for col in x_train.select_dtypes(include=['bool']).columns:
    x_train[col] = x_train[col].astype(int)
    # Fixed typo: this was `.astypes(int)`, which raises AttributeError.
    x_test[col] = x_test[col].astype(int)

# 2. select numeric columns (non-numeric columns are left out from here on)
numeric_cols = x_train.select_dtypes(include=['number']).columns

# 3. impute missing values in numeric columns with the training-set mean
imputer = SimpleImputer(strategy='mean')
x_train_imputed = pd.DataFrame(imputer.fit_transform(x_train[numeric_cols]), columns=numeric_cols, index=x_train.index)
x_test_imputed = pd.DataFrame(imputer.transform(x_test[numeric_cols]), columns=numeric_cols, index=x_test.index)

# 4. (optional) remove low-variance columns; with threshold=0.01 this drops
#    near-constant columns as well as strictly zero-variance ones
selector = VarianceThreshold(threshold=0.01)
x_train_var_filtered = selector.fit_transform(x_train_imputed)
x_test_var_filtered = selector.transform(x_test_imputed)
# Names of the columns that survived the filter, used to rebuild DataFrames.
selected_columns = x_train_imputed.columns[selector.get_support()]

# 5. scale the data
scaler = StandardScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train_var_filtered), columns=selected_columns, index=x_train.index)
x_test_scaled = pd.DataFrame(scaler.transform(x_test_var_filtered), columns=selected_columns, index=x_test.index)

# 6. check for any remaining missing/infinite values
print("missing values in x_trained_scaled:", x_train_scaled.isnull().sum().sum())
print("missing values in x_test_scaled:", x_test_scaled.isnull().sum().sum())
print("infinite values in x_train_scaled:", np.isinf(x_train_scaled.values).sum())
print("infinite values in x_test_scaled:", np.isinf(x_test_scaled.values).sum())

In [None]:
# Fit a simple logistic-regression classifier on the cleaned, scaled training data.
from sklearn.linear_model import LogisticRegression

# Model settings; class_weight='balanced' was added to handle class imbalance.
lr_settings = {
    'max_iter': 1000,
    'random_state': 42,
    'class_weight': 'balanced',
}

# Create the model and train it on the scaled training features.
clf = LogisticRegression(**lr_settings)
clf.fit(x_train_scaled, y_train)

print("model trained successfully")

In [None]:
# Evaluate the fitted classifier on the held-out test set.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

# Hard class predictions, plus the predicted probability of the positive
# class (second column of predict_proba).
y_pred = clf.predict(x_test_scaled)
class_probabilities = clf.predict_proba(x_test_scaled)
y_pred_proba = class_probabilities[:, 1]

# Accuracy of the hard predictions.
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy:.4f}")

# AUC-ROC, computed from the positive-class probabilities.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"auc-roc: {roc_auc:.4f}")

# Confusion matrix of true vs predicted labels.
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(test_confusion)

# Per-class precision / recall / F1 summary.
test_report = classification_report(y_test, y_pred)
print("\nclassification report:")
print(test_report)

In [None]:
# Class counts in the training labels, then in the test labels.
for labels in (y_train, y_test):
    print(labels.value_counts())

In [None]:
# Re-evaluate with a custom decision threshold: a test row is labelled
# positive when its predicted positive-class probability exceeds the threshold.
# NOTE(review): the threshold is applied to, and judged on, the test set —
# consider choosing it on a validation split instead.
threshold = 0.2
exceeds_threshold = y_pred_proba > threshold
y_pred_adjusted = exceeds_threshold.astype(int)

# Confusion matrix for the thresholded predictions.
adjusted_confusion = confusion_matrix(y_test, y_pred_adjusted)
print(adjusted_confusion)

# Accuracy of the thresholded predictions.
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"accuracy: {accuracy:.4f}")

# ROC-AUC is computed from the probabilities, not from the thresholded labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"roc-auc: {roc_auc:.4f}")

# Per-class summary for the thresholded predictions.
adjusted_report = classification_report(y_test, y_pred_adjusted)
print("classification report:")
print(adjusted_report)
