In [None]:
# 01 Data Preparation

## Initial inspection and light cleaning of the main dataset.

'''
- Loads the dataset.
- Displays shape, columns, dtypes.
- Checks for missing values.
- Checks value distributions for key columns.
- Outputs summary statistics.
'''

In [None]:
import pandas as pd
import numpy as np
import os

# Location of the cleaned dataset. The default is a machine-specific absolute
# path, so it can be overridden without editing the notebook by setting the
# C4_DATA_PATH environment variable; when unset, the original default is used.
DATA_PATH = os.environ.get(
    'C4_DATA_PATH',
    '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_clean.csv',
)

# Load dataset
df = pd.read_csv(DATA_PATH)
print(f'Shape: {df.shape}')
# Last expression of the cell: Jupyter renders the first rows as a preview.
df.head()

In [None]:
# Descriptive statistics, one table per dtype family: numeric columns first,
# then object (categorical/string) columns. Each table is transposed so that
# every source column becomes a row.
for dtype_selector in ([np.number], ['object']):
    display(df.select_dtypes(include=dtype_selector).describe().T)

In [None]:

# Show the column names as a plain list, then the dtype of each column
column_names = df.columns.tolist()
print('Columns:', column_names)
print('Data types:', df.dtypes, sep='\n')

### Diagnosis Columns

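- `diagnosis_0` to `diagnosis_8` (each column stores one numeric response code from the list below; the code in this notebook treats code 2 as Autism Spectrum Disorder):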
    1. Attention Deficit / Hyperactivity Disorder
    2. Autism Spectrum Disorder
    3. Bipolar Disorder
    4. Depression
    5. Learning disability
    6. Obsessive-Compulsive Disorder
    7. Schizophrenia
    8. I prefer not to say
    9. I have not been diagnosed with any of these conditions

- `autism_diagnosis_0` to `autism_diagnosis_2`:
    1. Autism (classical autism)
    2. Asperger Syndrome (AS)
    3. Other

In [None]:
# Function to print % missing per column and flag issues
def missing_report(data):
    """Print the percentage of missing values per column and flag odd column names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is reported through ``print``.

    Notes
    -----
    Only values pandas treats as null are counted as missing; placeholder
    strings such as ``'#NULL!'`` are ordinary values and are not counted.

    A column name is reported as suspicious when it is blank, starts with
    ``unnamed`` (case-insensitive), or occurs more than once in ``data``.
    Every occurrence of a duplicated name is listed.
    """
    missing = data.isnull().mean() * 100
    # Empty when nothing is missing (also when the frame has no rows, where the
    # per-column mean is NaN and therefore never > 0).
    flagged = missing[missing > 0]
    print('Missing values (%):')
    print(missing)
    if not flagged.empty:
        print('Columns with missing values:')
        print(flagged)
    else:
        print('No missing values detected.')
    # Flag suspicious column names.
    # The duplicate check is done per column: evaluating
    # `data.columns.duplicated().any()` inside the filter would flag every
    # column as soon as a single name was repeated.
    is_duplicate = data.columns.duplicated(keep=False)
    suspicious = []
    for col, repeated in zip(data.columns, is_duplicate):
        # str() so that non-string labels (e.g. default integer columns) do
        # not raise on .strip()/.lower().
        label = str(col)
        if repeated or label.strip() == '' or label.lower().startswith('unnamed'):
            suspicious.append(col)
    if suspicious:
        print('Suspicious column names:', suspicious)
    else:
        print('No suspicious column names detected.')

# Run the report on the loaded frame. NOTE(review): this runs before the
# '#NULL!' placeholders are replaced with NaN in a later cell, so those
# entries are not counted as missing in this output.
missing_report(df)

In [None]:
# Normalise the '#NULL!' placeholder to a real missing value across the frame
df.replace('#NULL!', np.nan, inplace=True)

# Coerce each diagnosis column that is present to a numeric dtype; entries
# that cannot be parsed become NaN (errors='coerce'). Absent columns are skipped.
diagnosis_cols = [
    f'{prefix}_{i}'
    for prefix, count in (('diagnosis', 9), ('autism_diagnosis', 3))
    for i in range(count)
]
for col in diagnosis_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Value distributions for key columns: the ten items of each questionnaire,
# the four questionnaire totals, and all diagnosis columns (in that order).
key_columns = [
    f'{scale}_{i}'
    for scale in ('SPQ', 'EQ', 'SQR', 'AQ')
    for i in range(1, 11)
]
key_columns += ['EQ_10_total', 'SQR_10_total', 'SPQ_10_total', 'AQ_10_total']
key_columns += [f'diagnosis_{i}' for i in range(9)]
key_columns += [f'autism_diagnosis_{i}' for i in range(3)]

for col in key_columns:
    # Report absent columns instead of failing on them
    if col not in df.columns:
        print(f'Column {col} not found in dataset.')
        continue
    print(f'\nValue counts for {col}:')
    # dropna=False keeps missing values as their own row in the counts
    print(df[col].value_counts(dropna=False))

In [None]:
autism_diag_cols = [f'autism_diagnosis_{i}' for i in range(3)]

# Null mask over the three autism-diagnosis columns, reused by the flags below
autism_answered = df[autism_diag_cols].notnull()

# 1. Single column holding the first non-null diagnosis type per row:
#    back-filling along the row pulls the first available value into column 0.
df['autism_diagnosis_flat'] = df[autism_diag_cols].bfill(axis=1)[autism_diag_cols[0]]

# 2. Binary flag: at least one autism-diagnosis column is filled in
df['has_autism_diagnosis'] = autism_answered.any(axis=1)

# 3. (Optional) Rows with more than one type filled in (possible data artifact)
df['multiple_autism_types'] = autism_answered.sum(axis=1) > 1

# 4. (Optional) How many rows have multiple types
print(df['multiple_autism_types'].value_counts())

# General diagnosis columns
diagnosis_cols = [f'diagnosis_{i}' for i in range(9)]

# A row is flagged when any diagnosis column equals 2 (Autism Spectrum Disorder)
df['is_autistic'] = (df[diagnosis_cols] == 2).any(axis=1)

# Breakdown and total
print(df['is_autistic'].value_counts())
print('Number of autistic individuals:', df['is_autistic'].sum())

In [None]:
# Count autistic people: a row counts when any of the nine diagnosis columns
# holds the value 2.
# NOTE(review): this recomputes the same `is_autistic` flag as the previous cell.
diagnosis_cols = ['diagnosis_%d' % i for i in range(9)]
df['is_autistic'] = (df[diagnosis_cols] == 2).any(axis=1)
print('Number of autistic individuals:', df['is_autistic'].sum())