In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# load data
from src.data_loader import load_data

# Location of the raw CSV. The DATA_C4_PATH environment variable takes
# precedence so the notebook can run on other machines; when it is unset the
# original hard-coded local path is used, so existing behaviour is unchanged.
DEFAULT_DATA_PATH = "/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_raw.csv"
data_path = os.environ.get("DATA_C4_PATH", DEFAULT_DATA_PATH)

# load the data
df = load_data(data_path)

# basic info: frame dimensions, per-column dtypes, per-column missing counts
print("dataset shape:", df.shape)
print("\ndata types:")
print(df.dtypes)
print("\nmissing values:")
print(df.isnull().sum())


# Address the target variable

In [None]:
# Tabulate the codes found in each autism-specific diagnosis column
# (values are expected to be 1, 2, or 3)
print("Autism diagnosis columns:")
autism_named = [name for name in df.columns if 'autism_diagnosis' in name]
for col in autism_named:
    print(f"{col}: {df[col].value_counts().to_dict()}")
    print(f"Missing: {df[col].isnull().sum()}")
    print("---")

# Same tabulation for the general diagnosis columns
# (a value of 2 is expected to denote autism)
print("\nRegular diagnosis columns:")
general_named = [
    name for name in df.columns
    if name.startswith('diagnosis_') and 'autism' not in name
]
for col in general_named:
    print(f"{col}: {df[col].value_counts().to_dict()}")
    print(f"Missing: {df[col].isnull().sum()}")
    print("---")




In [None]:
# Inventory of the diagnosis-related column names actually present in df
print("all columns containing 'diagnosis':")
diagnosis_related = list(filter(lambda name: 'diagnosis' in name, df.columns))
print(diagnosis_related)

# Narrower inventory: only the autism-specific diagnosis columns
print("\nall autism diagnosis columns:")
autism_diag = list(filter(lambda name: 'autism_diagnosis' in name, df.columns))
print(autism_diag)


In [None]:
# get the correct column names
autism_cols = [col for col in df.columns if 'autism_diagnosis' in col]
diagnosis_cols = [col for col in df.columns if col.startswith('diagnosis_') and 'autism' not in col]

print(f"autism columns found: {autism_cols}")
print(f"diagnosis columns found: {diagnosis_cols}")

# method 1: autism diagnosis columns -- a row is flagged when any of them
# holds a value >= 1 (missing values are treated as 0)
if autism_cols:
    autism_from_specific = df[autism_cols].fillna(0).ge(1).any(axis=1)
else:
    # The fallback must carry df's own index: a Series built with the default
    # RangeIndex is re-aligned by label in the `|` and in the column
    # assignment below, which goes wrong whenever df.index is not 0..n-1.
    autism_from_specific = pd.Series(False, index=df.index)

# method 2: regular diagnosis columns -- a row is flagged when any of them
# equals 2 (missing values are treated as 0)
if diagnosis_cols:
    autism_from_general = df[diagnosis_cols].fillna(0).eq(2).any(axis=1)
else:
    autism_from_general = pd.Series(False, index=df.index)

# combined binary target: 1 when either method flags the row, otherwise 0
df['autism_target'] = (autism_from_specific | autism_from_general).astype(int)

# Creating the target variable

In [None]:
# get columns dynamically from the dataframe
autism_cols = [col for col in df.columns if 'autism_diagnosis' in col]
diagnosis_cols = [col for col in df.columns if col.startswith('diagnosis_') and 'autism' not in col]

# method 1: autism diagnosis columns -- any value >= 1 (missing treated as 0)
autism_from_specific = df[autism_cols].fillna(0).ge(1).any(axis=1)

# method 2: regular diagnosis columns -- any value equal to 2 (missing treated as 0)
autism_from_general = df[diagnosis_cols].fillna(0).eq(2).any(axis=1)

# combined target: 1 when either method flags the row, otherwise 0
df['autism_target'] = (autism_from_specific | autism_from_general).astype(int)

# analysis of target var
print("combined target variable analysis:")
# the f-prefix was missing here, so the literal text "{len(df)}" was printed
print(f"total cases: {len(df)}")
print(f"autism cases: {df['autism_target'].sum()}")
print(f"non-autism cases: {len(df) - df['autism_target'].sum()}")
print(f"autism percentage: {(df['autism_target'].sum() / len(df)) * 100:.2f}%")

# check for class imbalance (the mean of a 0/1 column is the positive fraction)
print("\nclass imbalance:")
print(f"autism: {df['autism_target'].sum()} ({df['autism_target'].mean()*100:.1f}%)")
print(f"non-autism: {(df['autism_target'] == 0).sum()} ({(1-df['autism_target'].mean())*100:.1f}%)")

# EDA


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# visualise class imbalance
sns.countplot(x='autism_target', data=df)
plt.title('class balance: autism vs non-autism')
plt.xticks([0, 1], ['non-autism', 'autism'])
plt.show()

# age distribution, one density-normalised histogram (with KDE) per class
sns.histplot(data=df, x='age', hue='autism_target', bins=30, kde=True, stat='density')
plt.title('age distribution by autism status')
plt.show()

# sex distribution: translate the numeric codes into readable labels;
# codes absent from the map (and missing values) become NaN in sex_label
sex_map = {
    1.0: 'male',
    2.0: 'female',
    3.0: 'other',
    4.0: 'prefer_not_to_say',
}
df['sex_label'] = df['sex'].map(sex_map)

# A stray `value_counts(dropna='unknown')` call used to sit here: `dropna`
# is a boolean flag and the result was discarded, so it has been removed.
# dropna=False keeps the unmapped/missing rows visible in the tally.
print(df['sex_label'].value_counts(dropna=False))

sns.countplot(x='sex_label', hue='autism_target', data=df)
plt.title('sex by autism diagnosis')
plt.show()

In [None]:
# questionnaire scores
import matplotlib.pyplot as plt
import seaborn as sns

# Collect every column whose name contains one of the questionnaire tags
# (substring match, not prefix match)
questionnaire_tags = ['spq_', 'eq_', 'spqr_', 'aq_']
questionnaire_cols = [
    col for col in df.columns
    if any(q in col for q in questionnaire_tags)
]

# Long format: one row per (participant, questionnaire column) pair
df_melted = df.melt(
    id_vars='autism_target',
    value_vars=questionnaire_cols,
    var_name='questionnaire',
    value_name='score',
)

# box plots, one pair of boxes per questionnaire column
plt.figure(figsize=(16, 6))
sns.boxplot(data=df_melted, x='questionnaire', y='score', hue='autism_target')
plt.title('distribution of questionnaire scores by autism diagnosis')
plt.xticks(rotation=90)
plt.legend(title='autism')
plt.show()

# split violins: the two classes share one violin per questionnaire column
plt.figure(figsize=(16, 6))
sns.violinplot(data=df_melted, x='questionnaire', y='score', hue='autism_target', split=True)
plt.title('violin plot of questionnaire scores by autism diagnosis')
plt.xticks(rotation=90)
plt.legend(title='autism')
plt.show()

# facet grid: one small panel per questionnaire column, independent y axes
g = sns.FacetGrid(df_melted, col='questionnaire', col_wrap=5, height=3, sharey=False)
g.map(sns.boxplot, "autism_target", "score", order=[0, 1])
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('boxplot of questionnaire scores by autism diagnosis')
plt.show()

# heatmap of mean differences: rows of `means` are questionnaire columns,
# columns are the two target classes plus their difference
class_means = df.groupby('autism_target')[questionnaire_cols].mean()
means = class_means.T
means['diff'] = means[1] - means[0]
plt.figure(figsize=(10, 1))
sns.heatmap(means[['diff']].T, annot=True, cmap='coolwarm', center=0)
plt.title('mean score difference (autism - non-autism)')
plt.yticks(rotation=0)
plt.show()

print(df.shape)


# Handling missing data


In [None]:
# Fraction of missing values per column, largest first
missing = df.isnull().mean().sort_values(ascending=False)

# Horizontal bar chart of the 20 columns with the most missing data
top_missing = missing.head(20)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_missing, y=top_missing.index)
plt.title('top 20 feature by missing data fraction')
plt.xlabel('fraction missing')
plt.show()
print(df.shape)

# imputation

In [None]:
# remove test user IDs: only rows with userid > 174283 are kept.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' chained-assignment warning.
df = df[df['userid'] > 174283].copy()

# impute demographic columns with 'unknown'
demographic_cols = ['sex', 'handedness', 'education', 'occupation', 'country_region']
for col in demographic_cols:
    df[col] = df[col].fillna('unknown')

# impute questionnaire scores with the per-column median.
# The tag list now also contains 'spqr_', matching the list used for the EDA
# plots; 'sqr_' is kept so every column matched before is still matched.
# NOTE(review): 'sqr_' looks like a typo for 'spqr_' -- confirm against the
# real column names.
# NOTE(review): the medians are computed over all rows, before the train/test
# split performed later in the notebook.
questionnaire_cols = [col for col in df.columns if any(q in col for q in ['spq_', 'eq_', 'spqr_', 'sqr_', 'aq_'])]
df[questionnaire_cols] = df[questionnaire_cols].fillna(df[questionnaire_cols].median())

# drop rows that still have missing questionnaire values after imputation
# (only possible for a column with no observed values, whose median is NaN)
df = df.dropna(subset=questionnaire_cols)

# one hot encode demographic columns (the first level of each is dropped)
df = pd.get_dummies(df, columns=demographic_cols, drop_first=True)
print(df.dtypes.value_counts())
print(df.select_dtypes(include='object').columns)

# do NOT impute diagnosis or autism diagnosis columns
diagnosis_cols = [col for col in df.columns if col.startswith('diagnosis_') and 'autism' not in col]
autism_cols = [col for col in df.columns if 'autism_diagnosis' in col]

# Check for remaining missing values
print("remaining missing values:")
print(df.isnull().sum().sort_values(ascending=False).head(10))

print("\ndata types after encoding :")
print(df.dtypes.value_counts())
print(df.dtypes.tail(10))

# NOTE: standardisation is applied in a later cell, so the "around 0" and
# "around 1" expectations printed below only hold once that cell has run.
print("\nquestionnaire feature means (should be around 0):")
print(df[questionnaire_cols].mean())

print("questionnaire features stds (should be around 1):")
print(df[questionnaire_cols].std())

print("\npreview of processed data:")
print(df.head())
print(df.shape)

In [None]:
#scale/normalize features 
from sklearn.preprocessing import StandardScaler 

# Standardise the questionnaire columns to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook.
scaler = StandardScaler()
scaler.fit(df[questionnaire_cols])
df[questionnaire_cols] = scaler.transform(df[questionnaire_cols])

# train test split 

In [None]:
from sklearn.model_selection import train_test_split

# Remove the target and every column it was derived from, so the label cannot
# leak into the features.
# NOTE(review): `userid` and `sex_label` are not dropped here and therefore
# remain in `x` -- confirm that is intended.
diagnosis_cols = [col for col in df.columns if col.startswith('diagnosis_') and 'autism' not in col]
autism_diag_cols = [col for col in df.columns if 'autism_diagnosis' in col]
x = df.drop(columns=['autism_target', *diagnosis_cols, *autism_diag_cols])
y = df['autism_target']

# 80/20 split, stratified on the target so both parts keep the class ratio
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"train shape: {x_train.shape}, test shape: {x_test.shape}")
print(f"train autism%: {y_train.mean()*100:.2f}%, test autism%: {y_test.mean()*100:.2f}%")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the feature columns by dtype.
# The numeric selection covers every numeric dtype plus bool. The earlier
# ['int64', 'float64'] filter missed the one-hot columns produced by
# pd.get_dummies (bool or uint8, depending on the pandas version). Columns in
# neither list are discarded by ColumnTransformer's default remainder='drop',
# so those features never reached the model.
categorical_cols = x_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = x_train.select_dtypes(include=['number', 'bool']).columns.tolist()

# Create preprocessing pipeline: numeric/bool columns pass through unchanged,
# categorical columns are one-hot encoded (categories unseen during fit are
# encoded as all zeros at predict time)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Create pipeline with preprocessing and model; class_weight='balanced'
# reweights the classes inversely to their frequency in y_train
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])

# Train model
pipeline.fit(x_train, y_train)

# Predict hard labels and the probability of the positive class (column 1)
y_pred = pipeline.predict(x_test)
y_probs = pipeline.predict_proba(x_test)[:, 1]

# Evaluate
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_probs))