In [None]:
import os
from pathlib import Path

import pandas as pd

#location of the YBT export. The default is the original machine-specific path;
#set the YBT_DATA_PATH environment variable to point at the file on another
#machine without editing the notebook.
DATA_PATH = Path(os.environ.get(
    'YBT_DATA_PATH',
    '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/YBT.csv',
))

#load data
df = pd.read_csv(DATA_PATH)

#inspect data
print(df.head())

In [None]:
#derive the binary target only when the dataset does not already carry it
if 'autism_diagnosis' not in df.columns:
    #missing diagnoses become '' so the string test yields False rather than NaN
    diagnosis_lower = df['diagnosis'].fillna('').str.lower()
    mentions_autism = diagnosis_lower.str.contains('autism')
    #1 = diagnosis text mentions autism, 0 = it does not
    df['autism_diagnosis'] = mentions_autism.astype(int)

In [None]:
#audit missing values in the columns the analysis relies on

#each questionnaire contributes items numbered 1..10
item_numbers = range(1, 11)
aq_cols = ['aq_' + str(n) for n in item_numbers]
sq_cols = ['sq10_' + str(n) for n in item_numbers]
eq_cols = ['eq10_' + str(n) for n in item_numbers]
demo_cols = ['age', 'sex', 'gender', 'ethn', 'hand', 'country']

#target column
target_col = 'autism_diagnosis'

#every column needed downstream: questionnaire items, demographics, target
all_cols = [*aq_cols, *sq_cols, *eq_cols, *demo_cols, target_col]

#number of missing entries per required column
missing_counts = df[all_cols].isna().sum()

print(missing_counts)

In [None]:
#check for missing values
#a notebook cell only renders its LAST bare expression, so intermediate
#results must be printed explicitly or they are silently discarded

df.info()  #prints its summary itself
print(df.isnull().sum())  #missing-value count per column
df.columns

In [None]:
#inspecting the suspicious free-text column
#a notebook cell only renders its LAST bare expression, so every intermediate
#summary is printed explicitly; the final frame is left bare for rich display

diagnosis_text = df['diagnosis_69_TEXT'].dropna()

print(diagnosis_text.head())  #first few non-missing entries
print(df['diagnosis_69_TEXT'].notnull().sum())  #how many rows have an entry
print(diagnosis_text.unique())  #distinct values
print(diagnosis_text.value_counts())  #frequency of each value

#rows where all three diagnosis-related columns are populated
df[['diagnosis_69_TEXT', 'diagnosis','diagnosis_yes_no']].dropna().head(10)


In [None]:
#print "<value in row 0>: <column name>" for every column, to see what came before diagnosis_69
#NOTE(review): assumes the row labelled 0 holds the question text rather than a participant - confirm
for column_name in df.columns:
    question_text = df.loc[0, column_name]
    print(f"{question_text}: {column_name}")

In [None]:
#RQ: can eq, sq, aq be used to predict diagnosis?

#build the binary target column (overwrites any existing 'autism_diagnosis')
#missing diagnoses are treated as '' so they map to 0 instead of NaN
diagnosis_lower = df['diagnosis'].fillna('').str.lower()
df['autism_diagnosis'] = diagnosis_lower.str.contains('autism').astype(int)

#class balance of the target
print(df['autism_diagnosis'].value_counts())

In [None]:
#keep only rows with a value in every required column, then one-hot encode
#the categorical demographics (first level of each dropped as the reference)
categorical_cols = ['sex', 'gender', 'ethn', 'hand', 'country']
df_clean = pd.get_dummies(
    df.dropna(subset=all_cols),
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
#EDA
import matplotlib.pyplot as plt

#bar chart of how many rows fall into each target class
fig, ax = plt.subplots(figsize=(8, 6))
class_counts = df_clean['autism_diagnosis'].value_counts()
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Autism Diagnosis Distribution')
plt.show()

In [None]:
#train test split
from sklearn.model_selection import train_test_split

#Restrict the predictors to the intended features (AQ/SQ/EQ items + demographics).
#Dropping only the target kept every other CSV column in x, including
#'diagnosis' (the column the target is derived from), 'diagnosis_yes_no' and
#'diagnosis_69_TEXT' - i.e. target leakage.
dummy_cols = [c for c in df_clean.columns if c not in df.columns]  #columns created by get_dummies
plain_demo_cols = [c for c in demo_cols if c in df_clean.columns]  #demographics that were not one-hot encoded
feature_cols = aq_cols + sq_cols + eq_cols + plain_demo_cols + dummy_cols

x = df_clean[feature_cols]
y = df_clean['autism_diagnosis']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
#scale data
import warnings
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Work on copies so x_train / x_test themselves are left untouched
x_train_numeric = x_train.copy()
x_test_numeric = x_test.copy()

# Boolean columns -> 0/1 integers
bool_columns = x_train.select_dtypes(include=['bool']).columns
for col in bool_columns:
    x_train_numeric[col] = x_train_numeric[col].astype(int)
    x_test_numeric[col] = x_test_numeric[col].astype(int)

# Object columns that should be numeric: entries that cannot be parsed become NaN
object_columns = x_train.select_dtypes(include=['object']).columns
for col in object_columns:
    try:
        train_converted = pd.to_numeric(x_train_numeric[col], errors='coerce')
        test_converted = pd.to_numeric(x_test_numeric[col], errors='coerce')
    except (TypeError, ValueError) as exc:
        # Report the skip instead of swallowing it silently. Neither split is
        # modified, so train and test keep identical dtypes for this column.
        print(f"Skipping column {col!r}: could not convert to numeric ({exc})")
        continue
    # Assign only once BOTH conversions succeeded
    x_train_numeric[col] = train_converted
    x_test_numeric[col] = test_converted

# Now select numeric columns (object columns that were skipped are excluded here)
numeric_columns = x_train_numeric.select_dtypes(include=['number']).columns

selector = VarianceThreshold(threshold=0.01)  # drops columns whose training variance is <= 0.01
scaler = StandardScaler()

# Suppress warnings only around the numeric steps rather than for the whole
# session, so later cells (e.g. model fitting) can still report problems.
# NOTE(review): RuntimeWarning is presumed to be the category that the original
# blanket filter was hiding (NaN-heavy columns after coercion) - widen if needed.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=RuntimeWarning)

    # Fit the variance filter on the training split only, then apply it to both
    x_train_var_filtered = selector.fit_transform(x_train_numeric[numeric_columns])
    x_test_var_filtered = selector.transform(x_test_numeric[numeric_columns])

    # Column names that survived the variance filter
    selected_columns = numeric_columns[selector.get_support()]

    # Scale using statistics learned from the training split only
    x_train_scaled = scaler.fit_transform(x_train_var_filtered)
    x_test_scaled = scaler.transform(x_test_var_filtered)

# Convert back to DataFrame, restoring the original row labels
x_train_scaled = pd.DataFrame(x_train_scaled, columns=selected_columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=selected_columns, index=x_test.index)

In [None]:
#sanity-check the scaled data and drop unusable rows before training
import numpy as np


def _usable_rows(frame):
    """Return a boolean Series: True for rows of `frame` with no NaN and no infinite value."""
    has_nan = frame.isnull().any(axis=1)
    has_inf = np.isinf(frame.values).any(axis=1)
    return ~(has_nan | has_inf)


print("Checking data before training...")

# Report how many missing values are present
n_missing_features = x_train_scaled.isnull().sum().sum()
n_missing_target = y_train.isnull().sum()
print(f"Missing values in x_train_scaled: {n_missing_features}")
print(f"Missing values in y_train: {n_missing_target}")

# Keep only rows whose features are all finite; the same mask is applied to
# the labels so features and target stay aligned
print("Cleaning data...")
mask = _usable_rows(x_train_scaled)
x_train_clean = x_train_scaled.loc[mask]
y_train_clean = y_train.loc[mask]

# Same treatment for the test split
mask_test = _usable_rows(x_test_scaled)
x_test_clean = x_test_scaled.loc[mask_test]
y_test_clean = y_test.loc[mask_test]

for label, cleaned in (
    ('x_train_clean', x_train_clean),
    ('y_train_clean', y_train_clean),
    ('x_test_clean', x_test_clean),
    ('y_test_clean', y_test_clean),
):
    print(f"After cleaning - {label} shape: {cleaned.shape}")

In [None]:
#fit a plain logistic-regression baseline on the CLEANED training split
from sklearn.linear_model import LogisticRegression

#fixed seed for reproducibility; iteration cap set to 1000
clf = LogisticRegression(random_state=42, max_iter=1000)
clf = clf.fit(x_train_clean, y_train_clean)  #fit() returns the estimator itself

print("model trained successfully")

In [None]:
#evaluate model performance on the cleaned test split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

#make predictions
y_pred = clf.predict(x_test_clean)
y_pred_proba = clf.predict_proba(x_test_clean)[:,1] #second column of predict_proba, used as the positive-class score

#accuracy
accuracy = accuracy_score(y_test_clean, y_pred)
print(f"accuracy: {accuracy:.4f}")

#auc-roc (computed from the scores, not the hard predictions)
roc_auc = roc_auc_score(y_test_clean, y_pred_proba)
print(f"auc-roc: {roc_auc:.4f}")

#confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_clean, y_pred))

#classification report (per-class precision / recall / f1)
#this section used to print the confusion matrix a second time
print("\nclassification report:")
print(classification_report(y_test_clean, y_pred))