# 750k dataset 
- somebody call dora 


In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

#make plots leng 
sns.set(style="whitegrid")
%matplotlib inline

# load data 
df = pd.read_csv('/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_raw.csv')

# initial data inspection 
print("Shape:", df.shape)
print("columns:", df.columns)
display(df.head())
print(df.info())
print(df.isnull().sum())

# basic stats 
display(df.describe(include='all'))

# visualise the data 
# distribution of target variable
# what are the values in columns
print("sample of diagnosis columns")
diagnosis_cols = [col for col in df.columns if 'diagnosis' in col]
print(diagnosis_cols)

# any autism diagnosis 
autism_cols = [col for col in df.columns if 'autism_diagnosis' in col]

df['autism_any'] = df[autism_cols].apply(
    lambda row: int(any(x in [1.0, 2.0, 3.0] for x in row if not pd.isnull(x))),
    axis=1
)

print(df['autism_any'].value_counts())
sns.countplot(x='autism_any', data=df)
plt.title('any autism diagnosis (0 no 1 yes)')
plt.show()

# multi class target: first autism subtype code found in a row
def get_first_autism_subtype(row):
    """Return the first value of ``row`` that equals one of the codes 1, 2 or 3.

    Values are scanned in iteration order and the match is returned as an int.
    When nothing matches (NaN never compares equal to a code) the result is 0,
    meaning no autism diagnosis.
    """
    subtype_codes = (1.0, 2.0, 3.0)
    return next((int(value) for value in row if value in subtype_codes), 0)

# label each row with the first subtype code found across the autism columns (0 = none)
df['autism_subtype'] = df[autism_cols].apply(get_first_autism_subtype, axis=1)
print(df['autism_subtype'].value_counts())
sns.countplot(x='autism_subtype', data=df)
plt.title('Autism subtype')
plt.show()

#multi-label: one hot encoding for each subtype

#create separate columns for each subtype
# Column-wise comparison: 1 when any autism column equals the subtype code, else 0.
# This replaces a Python lambda that was evaluated once per row.
for subtype in [1.0, 2.0, 3.0]:
    df[f'autism_subtype_{int(subtype)}'] = df[autism_cols].eq(subtype).any(axis=1).astype(int)

# number of rows flagged for each subtype
print(df[[f'autism_subtype_{i}' for i in [1, 2, 3]]].sum())


In [None]:
# Mapping dictionaries: integer code -> human-readable label for the coded variables

sex_map = {1: 'Male', 2: 'Female', 3: 'Transgender/Other', 4: 'Prefer not to say'}
handedness_map = {1: 'Right-handed', 2: 'Left-handed', 3: 'Ambidextrous', 4: 'Prefer not to say'}
education_map = {
    1: 'Did not complete High School (or A-levels)',
    2: 'High School (or A-levels) Diploma',
    3: 'Undergraduate degree',
    4: 'Postgraduate degree',
    5: 'Prefer not to say'
}
# NOTE(review): codes 15 and 25 both map to 'Other' — confirm against the codebook.
occupation_map = {
    1: 'Artist', 2: 'Civil Engineering', 3: 'Computers & I.T.', 4: 'Director', 5: 'Engineering',
    6: 'Entrepreneur', 7: 'Financial Banking', 8: 'Food & Drinks', 9: 'Healthcare', 10: 'Hospitality',
    11: 'Legal', 12: 'Leisure', 13: 'Musician', 14: 'Office Administration', 15: 'Other',
    16: 'Public Sector', 17: 'Services', 18: 'Publishing & Media', 19: 'Retail', 20: 'Sales',
    21: 'Scientific & Technical', 22: 'Supply chain', 23: 'Teaching & Interpretation', 24: 'Transport',
    25: 'Other', 26: 'Prefer not to say'
}
region_map = {
    1: 'Wales', 2: 'Scotland', 3: 'Northern Ireland', 4: 'London (England)', 5: 'North East (England)',
    6: 'North West (England)', 7: 'Yorkshire and Humber (England)', 8: 'West Midlands (England)',
    9: 'East Midlands (England)', 10: 'South East (England)', 11: 'South West (England)',
    12: 'Other (outside of the United Kingdom)', 13: 'Other (in the United Kingdom)', 14: 'Prefer not to say'
}
# Same coding as region_map. Built as an independent copy so the two tables cannot
# drift apart, while mutating one still leaves the other untouched.
country_region_map = dict(region_map)
diagnosis_map = {
    1: 'ADHD', 2: 'Autism Spectrum Disorder', 3: 'Bipolar Disorder', 4: 'Depression',
    5: 'Learning disability', 6: 'OCD', 7: 'Schizophrenia', 8: 'Prefer not to say',
    9: 'No diagnosis'
}
asd_map = {1: 'Autism (classical autism)', 2: 'Asperger Syndrome (AS)', 3: 'Other'}

In [None]:
# One count plot per one-hot subtype column (autism_subtype_1 .. autism_subtype_3),
# each shown as its own figure.
for i in range(1, 4):
    sns.countplot(data=df, x=f'autism_subtype_{i}').set_title(f'Autism subtype {i} (0 no 1 yes)')
    plt.show()

# EDA

In [None]:
# --- EDA feature overview and missing data ---

# dtype of every column
print(df.dtypes)

# NaN total per column, largest first; only columns with at least one NaN are printed
missing = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:\n", missing[missing.gt(0)])

# heatmap of the NaN mask (rows x columns), colour bar suppressed
plt.figure(figsize=(12, 6))
sns.heatmap(df.isna(), cbar=False).set_title("Missing Data Heatmap")
plt.show()

In [None]:
# --- Univariate Analysis: categorical features ---

# Names of the coded categorical columns to plot.
# This list has its own name because a later cell rebinds `coded_cat_cols` to
# (column, mapping) pairs; sharing the name made this cell fail when re-run after it.
coded_cat_col_names = ['sex', 'handedness', 'education', 'occupation', 'country_region']

for col in coded_cat_col_names:
    # columns absent from the frame are skipped silently
    if col not in df.columns:
        continue
    plt.figure(figsize=(8, 4))
    # bar chart of raw code frequencies, NaN included, ordered by code
    df[col].value_counts(dropna=False).sort_index().plot(kind='bar')
    plt.title(f'Values counts of {col}')
    plt.xlabel(f'{col} (coded)')
    plt.ylabel('Count')
    plt.show()



In [None]:
# --- print value counts with labels for demographic columns ---

# (column name, code -> label mapping) pairs; columns absent from df are skipped below
coded_cat_cols = [
    ('sex', sex_map),
    ('handedness', handedness_map),
    ('education', education_map),
    ('occupation', occupation_map),
    ('country_region', country_region_map),
    ('diagnosis', diagnosis_map),
    ('asd', asd_map),
]

for col, mapping in coded_cat_cols:
    if col not in df.columns:
        continue
    # frequencies ordered by code, NaN kept as its own bucket
    counts = df[col].value_counts(dropna=False).sort_index()
    print(f"\nValue counts for {col}:")
    for code, count in counts.items():
        if pd.isnull(code):
            label = 'missing'
        else:
            # codes without an entry in the mapping fall back to a generic label
            label = mapping.get(int(code), 'Missing/Unknown')
        print(f" {label}: {count}")

In [None]:
# --- print total value counts for all diagnosis columns ---
from collections import Counter

# diagnosis_cols was built with a substring match on 'diagnosis', so it also contains
# the autism_diagnosis columns. Those hold autism subtype codes (labelled via asd_map
# elsewhere in this notebook), not diagnosis_map codes. Counting them here would report
# subtype codes 1/2/3 as ADHD / Autism Spectrum Disorder / Bipolar Disorder.
general_diagnosis_cols = [col for col in diagnosis_cols if 'autism_diagnosis' not in col]

# gather all non-missing codes from the general diagnosis columns
all_diagnosis_values = []
for col in general_diagnosis_cols:
    all_diagnosis_values.extend(df[col].dropna().astype(int).tolist())

# count occurrences of each code
diagnosis_totals = Counter(all_diagnosis_values)

print("\nTotal counts for each diagnosis (across all columns):")
# sorted by code so the listing is stable from run to run
for code, count in sorted(diagnosis_totals.items()):
    label = diagnosis_map.get(code, f'Unknown ({code})')
    print(f' {label}: {count}')

# Labelled value counts for every column whose name starts with 'autism_diagnosis_'
asd_cols = [name for name in df.columns if name.startswith('autism_diagnosis_')]
for col in asd_cols:
    # frequencies ordered by code, NaN kept as its own bucket
    counts = df[col].value_counts(dropna=False).sort_index()
    print(f"\nValue counts for {col}:")
    for code, count in counts.items():
        if pd.isnull(code):
            label = 'missing'
        else:
            # codes without an asd_map entry fall back to a generic label
            label = asd_map.get(int(code), 'Missing/Unknown')
        print(f' {label}: {count}')

## Cleaning 

In [None]:
# missing values, hybrid approach: impute rows with few missing values,
# drop rows that have too many

# the hybrid approach wrapped in a function
def hybrid_impute_drop(df, cols, max_missing=2, strategy='mean'):
    """Drop rows with too many missing values in ``cols`` and impute the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of column labels
        Columns inspected for missing values and imputed.
    max_missing : int, default 2
        Rows with more than this many NaNs across ``cols`` are dropped.
    strategy : {'mean', 'median'}, default 'mean'
        Per-column statistic used to fill the remaining NaNs. It is computed on
        the retained rows only.

    Returns
    -------
    pandas.DataFrame
        Copy of the retained rows (original index preserved) with NaNs in
        ``cols`` filled. All columns of ``df`` are kept.

    Raises
    ------
    ValueError
        If ``strategy`` is not 'mean' or 'median'. Previously an unknown value
        silently returned the rows un-imputed.
    """
    if strategy not in ('mean', 'median'):
        raise ValueError(f"strategy must be 'mean' or 'median', got {strategy!r}")

    missing_counts = df[cols].isnull().sum(axis=1)
    # copy so the fill below never writes into the caller's frame
    to_impute = df[missing_counts <= max_missing].copy()

    retained = to_impute[cols]
    fill_values = retained.mean() if strategy == 'mean' else retained.median()
    # fillna with a Series fills each column with its own statistic
    to_impute[cols] = retained.fillna(fill_values)
    return to_impute

# example usage: keep rows missing at most 2 of the spq_1 .. spq_10 columns and
# mean-impute what is left
spq_cols = [f'spq_{i}' for i in range(1, 11)]
df_spq_clean = hybrid_impute_drop(df, spq_cols, max_missing=2, strategy='mean')
print("original no. of rows:", df.shape[0])
print("number of rows after cleaning:", df_spq_clean.shape[0])
print("missing values per column after cleaning:")
print(df_spq_clean[spq_cols].isnull().sum())