# Data Loading and Preparation
# Study Dataset: C4 Questionnaire Data

# This notebook provides the initial setup for loading and preparing the dataset for:
- Exploratory Data Analysis (EDA)
- Statistical Analysis  
- Machine Learning Modeling

# Dataset Overview:
# The dataset contains questionnaire responses with various psychological scales:
- SPQ: Sensory Perception Quotient
- EQ: Empathy Quotient
- SQR: Systemizing Quotient-Revised (SQ-R)
- AQ: Autism Spectrum Quotient
- Diagnostic information: Various mental health diagnoses
- Demographics: Age, sex, education, occupation, etc.


In [None]:
# 1. Import Libraries and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Scientific computing
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Plot appearance: stock matplotlib style, seaborn "husl" palette,
# and a larger default canvas and font for every figure.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Silence every warning category so cell output stays readable.
# NOTE: this also hides pandas/NumPy deprecation and dtype warnings.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so random draws repeat between runs.
np.random.seed(42)

print("Libraries imported successfully!")

In [None]:
# 2. Load Dataset
# Absolute location of the raw CSV on the author's machine; edit this
# when running the notebook anywhere else.
DATA_PATH = '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_raw.csv'

print("Loading dataset...")
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk.
df = pd.read_csv(DATA_PATH, low_memory=False)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# deep=True also counts the Python string objects held in object columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Leave the preview as the final expression so the notebook renders it.
print("\nFirst 5 rows:")
df.head()

In [None]:
# 3. Initial Data Exploration
# Prints the frame's dimensions, a numbered list of its columns, the
# number of columns per dtype, and its memory footprint.
n_rows, n_cols = df.shape
print("=== DATASET OVERVIEW ===")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")
print("\nColumn names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

print("\n=== DATA TYPES ===")
print(df.dtypes.value_counts())

print("\n=== MEMORY USAGE ===")
# Per-column byte counts (plus the index); deep=True includes string payloads.
memory_usage = df.memory_usage(deep=True)
print(f"Total memory: {memory_usage.sum() / 1024**2:.2f} MB")
print(f"Average per column: {memory_usage.mean() / 1024:.2f} KB")

In [None]:
# 4. Check for missing values
# Part one reports NaN counts per column; part two counts literal
# placeholder strings that may stand in for missing data.
print("=== MISSING VALUES ANALYSIS ===")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_summary = pd.DataFrame({
    'Missing_Count': missing_data,
    'Missing_Percent': missing_percent
})

# Keep only the columns that have at least one NaN, worst offenders first.
has_missing = missing_summary['Missing_Count'] > 0
missing_columns = missing_summary.loc[has_missing].sort_values('Missing_Percent', ascending=False)
if missing_columns.empty:
    print("No missing values detected!")
else:
    print("Columns with missing values:")
    display(missing_columns)

print("\n=== SPECIAL MISSING INDICATORS ===")
# NOTE: pd.read_csv's default NA parsing already turns several of these
# tokens (e.g. 'NULL', 'null', 'NaN', 'nan', 'NA', '') into NaN at load
# time, so they may never be reported here.
special_missing = ['#NULL!', 'NULL', 'null', 'NaN', 'nan', 'NA', 'na', '']
for indicator in special_missing:
    # Element-wise equality over the whole frame, summed over both axes.
    count = df.eq(indicator).sum().sum()
    if count:
        print(f"'{indicator}': {count} occurrences")

In [None]:
# 5. Data Cleaning and Preprocessing
# Work on a copy so the raw `df` stays available for comparison.
df_clean = df.copy()

print("=== DATA CLEANING STEPS ===")

# Step 1: Replace special missing indicators with NaN.
# One replace() call with the whole list gives the same result as
# replacing the tokens one at a time, but copies the frame once instead
# of once per token.
special_missing_indicators = ['#NULL!', 'NULL', 'null', 'NaN', 'nan', 'NA', 'na', '']
df_clean = df_clean.replace(special_missing_indicators, np.nan)

print("Step 1: Replaced special missing indicators with NaN")

# Step 2: Check actual column names
# Exploratory listing only: it shows which names exist so that the
# column definitions used later can be checked against the real file.
print("\nStep 2: Checking actual column names...")

# Check what diagnosis columns actually exist
diagnosis_cols_actual = [col for col in df_clean.columns if 'diagnosis' in col.lower()]
print("Actual diagnosis columns:")
for col in diagnosis_cols_actual:
    print(f"  {col}")

# Check what region/country columns exist
region_cols = [col for col in df_clean.columns if 'region' in col.lower() or 'country' in col.lower()]
print("\nActual region/country columns:")
for col in region_cols:
    print(f"  {col}")

# Check questionnaire columns.
# NOTE: this is a substring match, so any column whose name merely
# contains 'spq', 'eq', 'sqr' or 'aq' is listed as well.
scale_tokens = ('spq', 'eq', 'sqr', 'aq')
questionnaire_cols = [col for col in df_clean.columns if any(scale in col.lower() for scale in scale_tokens)]
print("\nActual questionnaire columns:")
for col in questionnaire_cols:
    print(f"  {col}")

In [None]:
# 6. Updated Column Definitions (with correct names)
# Groups the column names by role; the lists are plain names and are not
# checked against the frame here, except for the two diagnosis lists,
# which are discovered from df_clean by prefix.
print("\nStep 3: Categorizing columns...")

# Demographic columns
demographic_cols = [
    'userid', 'age', 'sex', 'sex_filter', 'handedness',
    'education', 'occupation', 'countryregion', 'repeat',
]

# Diagnosis columns (using actual names)
diagnosis_cols = [name for name in df_clean.columns if name.startswith('diagnosis_')]
autism_diagnosis_cols = [name for name in df_clean.columns if name.startswith('autism_diagnosis_')]

# Questionnaire item columns: '<scale>_1' ... '<scale>_10' for each scale
spq_cols, eq_cols, sqr_cols, aq_cols = (
    [f'{scale}_{item}' for item in range(1, 11)]
    for scale in ('spq', 'eq', 'sqr', 'aq')
)

# Total score columns
total_cols = ['eq_10_total', 'sqr_10_total', 'spq_10_total', 'aq_10_total']

# Z-score columns: each total's name with a leading 'z'
zscore_cols = [f'z{total}' for total in total_cols]

# T-score and other derived columns
derived_cols = [
    'eq_tscore', 'sq_tscore', 'dscore_fromt', 'cscore_fromt',
    'cognitivebraintype', 'sex_dichotomous', 'age_grouping',
    'userid_grouping', 'STEMvsNOSTEM', 'autismvscontrols_1450matched',
]

# Report how many names each group holds.
group_sizes = (
    ("Demographic", demographic_cols),
    ("Diagnosis", diagnosis_cols),
    ("Autism diagnosis", autism_diagnosis_cols),
    ("SPQ", spq_cols),
    ("EQ", eq_cols),
    ("SQR", sqr_cols),
    ("AQ", aq_cols),
    ("Total score", total_cols),
    ("Z-score", zscore_cols),
    ("Derived", derived_cols),
)
for group_label, group_columns in group_sizes:
    print(f"{group_label} columns: {len(group_columns)}")

In [None]:
# 7. Column Mappings (for columns that actually exist)
# Each entry maps a column's integer code to its label. Labels are listed
# in code order and numbered from 1 by enumerate().
column_mappings = {
    'sex': dict(enumerate([
        'Male',
        'Female',
        'Transgender or other',
        'Prefer not to say',
    ], start=1)),
    'handedness': dict(enumerate([
        'Right-handed',
        'Left-handed',
        'Ambidextrous',
        'Prefer not to say',
    ], start=1)),
    'education': dict(enumerate([
        'Did not complete High School (or A-levels)',
        'High School (or A-levels) Diploma',
        'Undergraduate degree',
        'Postgraduate degree',
        'Prefer not to say',
    ], start=1)),
    # NOTE(review): 'Other' appears at both code 15 and code 25 - confirm
    # against the study codebook that neither entry is a transcription slip.
    'occupation': dict(enumerate([
        'Artist',
        'Civil Engineering',
        'Computers & I.T.',
        'Director',
        'Engineering',
        'Entrepreneur',
        'Financial Banking',
        'Food & Drinks',
        'Healthcare',
        'Hospitality',
        'Legal',
        'Leisure',
        'Musician',
        'Office Administration',
        'Other',
        'Public Sector',
        'Services',
        'Publishing & Media',
        'Retail',
        'Sales',
        'Scientific & Technical',
        'Supply chain',
        'Teaching & Interpretation',
        'Transport',
        'Other',
        'Prefer not to say',
    ], start=1)),
}

print("Column mappings created successfully!")

In [None]:
# 8. Data Type Conversion
print("=== DATA TYPE CONVERSION ===")

# Item, total and z-score columns of all four questionnaires.
questionnaire_cols = [*spq_cols, *eq_cols, *sqr_cols, *aq_cols, *total_cols, *zscore_cols]

# Coerce questionnaire, diagnosis and age columns in that order. Names
# absent from the frame are skipped; values that cannot be parsed as
# numbers become NaN (errors='coerce').
numeric_targets = [*questionnaire_cols, *diagnosis_cols, *autism_diagnosis_cols, 'age']
for col in numeric_targets:
    if col not in df_clean.columns:
        continue
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

print("Converted questionnaire, diagnosis, and age columns to numeric")
print("\nUpdated data types:")
print(df_clean.dtypes.value_counts())

In [None]:
# 9. Conservative Age Cleaning
print("=== AGE DATA CLEANING ===")

# Show the ten smallest age values and the NaN count before filtering.
print("Age distribution before cleaning:")
print(df_clean['age'].value_counts().sort_index().head(10))
print(f"NaN values: {df_clean['age'].isna().sum()}")

# Drop only the rows whose age is exactly 0 (treated as invalid).
# NaN never compares equal to 0, so rows with a missing age are kept.
initial_count = len(df_clean)
zero_age_mask = df_clean['age'] == 0
age_zero_count = int(zero_age_mask.sum())
df_clean = df_clean.loc[~zero_age_mask].copy()
final_count = len(df_clean)

print(f"Removed {initial_count - final_count} rows with age = 0")
print(f"Remaining rows: {final_count}")

# Missing ages are plain missing data rather than invalid entries.
print(f"Rows with missing age: {df_clean['age'].isna().sum()}")

In [None]:
# 10. Create Autism Diagnosis Indicators
def create_autism_indicators(df):
    """Add autism-diagnosis indicator columns to *df* and return it.

    Two columns are created (or reset if they already exist):

    - ``has_autism_diagnosis``: True when any ``diagnosis_*`` column holds
      code 2 or any ``autism_diagnosis_*`` column holds code 1, 2 or 3.
    - ``autism_diagnosis_type``: ``'None'`` by default,
      ``'Autism Spectrum Disorder'`` for a ``diagnosis_*`` code of 2, and
      the specific label when an ``autism_diagnosis_*`` code is present.
      Specific labels override the generic one, and later columns
      override earlier ones.

    Codes are compared after numeric coercion, so text-encoded codes such
    as ``'2'`` are recognised and unparseable values count as missing.

    Args:
        df: DataFrame with ``diagnosis_*`` and/or ``autism_diagnosis_*``
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the two indicator columns set.
    """
    type_column = 'autism_diagnosis_type'

    # Collect the source columns first. The output column shares the
    # 'autism_diagnosis_' prefix, so it is excluded explicitly to keep the
    # function from scanning its own result (also when it is re-run).
    diagnosis_cols = [col for col in df.columns if col.startswith('diagnosis_')]
    autism_cols = [
        col for col in df.columns
        if col.startswith('autism_diagnosis_') and col != type_column
    ]

    # Initialize autism indicators
    df['has_autism_diagnosis'] = False
    df[type_column] = 'None'

    # In the general diagnosis columns, code 2 means autism.
    for col in diagnosis_cols:
        codes = pd.to_numeric(df[col], errors='coerce')
        autism_mask = codes == 2
        df.loc[autism_mask, 'has_autism_diagnosis'] = True
        df.loc[autism_mask, type_column] = 'Autism Spectrum Disorder'

    # In the autism-specific columns, codes 1, 2 and 3 each indicate a
    # diagnosis and carry their own label.
    autism_type_labels = {
        1: 'Autism (classical autism)',
        2: 'Asperger Syndrome (AS)',
        3: 'Other',
    }
    for col in autism_cols:
        codes = pd.to_numeric(df[col], errors='coerce')
        for code, label in autism_type_labels.items():
            code_mask = codes == code
            df.loc[code_mask, 'has_autism_diagnosis'] = True
            df.loc[code_mask, type_column] = label

    return df

# Add the indicator columns to the cleaned frame.
df_clean = create_autism_indicators(df_clean)

# Summarise how many participants were flagged and with which label.
autism_flags = df_clean['has_autism_diagnosis']
print("=== AUTISM DIAGNOSIS SUMMARY ===")
print(f"Total participants: {len(df_clean)}")
print(f"Autism diagnosis: {autism_flags.sum()} ({autism_flags.mean()*100:.2f}%)")
print("\nAutism diagnosis types:")
print(df_clean['autism_diagnosis_type'].value_counts())

In [None]:
# 11. Categorical Variable Exploration
def _label_for_code(mapping, value):
    """Return the label for *value*, also accepting numeric codes stored as text."""
    if value in mapping:
        return mapping[value]
    try:
        # '2' and 2.0 both become 2.0, which hashes and compares equal to
        # the integer key 2.
        numeric_value = float(value)
    except (TypeError, ValueError):
        return f"Unknown ({value})"
    return mapping.get(numeric_value, f"Unknown ({value})")


def explore_categorical_variables(df, column_mappings):
    """Print labelled value counts for each mapped categorical column.

    For every column named in *column_mappings* a header is printed,
    followed either by the number of distinct values and one line per
    value (value, count, share of all rows, label) or by a notice that
    the column is absent from *df*.

    Labels are looked up by the value itself and, failing that, by its
    numeric form, so codes read from CSV as text (``'1'``) still resolve.
    Values without a label are reported as ``Unknown (<value>)``.

    Args:
        df: DataFrame to inspect.
        column_mappings: dict of column name -> {code: label}.

    Returns:
        None. The report is written to standard output.
    """
    # Percentages are relative to all rows, including rows where the
    # column is missing (value_counts() leaves NaN out of the counts).
    total_rows = len(df)

    for col, mapping in column_mappings.items():
        print(f"\n=== {col.upper()} ===")
        if col not in df.columns:
            print("Column not found in dataset")
            continue

        value_counts = df[col].value_counts().sort_index()
        print(f"Unique values: {len(value_counts)}")
        print("Value counts:")
        for value, count in value_counts.items():
            percentage = count / total_rows * 100
            label = _label_for_code(mapping, value)
            print(f"  {value}: {count:,} ({percentage:.2f}%) - {label}")

# Report every mapped categorical column of the cleaned frame.
print("=== CATEGORICAL VARIABLE EXPLORATION ===")
explore_categorical_variables(df=df_clean, column_mappings=column_mappings)

In [None]:
# 12. Data Quality Assessment
def assess_data_quality(df, column_groups):
    """Print a data-quality report for each named group of columns.

    For every group this reports the percentage of missing values per
    column (only columns with at least one missing value are listed),
    the number of columns per dtype, and summary statistics for the
    numeric columns. Names that are not in *df* are ignored; a group
    with no matching column gets a one-line notice instead.

    Args:
        df: DataFrame to inspect.
        column_groups: dict of group name -> list of column names.

    Returns:
        None. The report is written to standard output, and the
        statistics table goes through ``display`` when it is available.
    """
    # `display` is provided by IPython/Jupyter sessions only; fall back to
    # print() so the function also works in a plain Python script.
    try:
        show_table = display
    except NameError:
        show_table = print

    for group_name, columns in column_groups.items():
        print(f"\n=== {group_name.upper()} QUALITY ASSESSMENT ===")

        # Filter columns that exist in the dataset
        existing_cols = [col for col in columns if col in df.columns]

        if not existing_cols:
            print(f"No {group_name} columns found in dataset")
            continue

        group_df = df[existing_cols]

        # Missing values
        missing_pct = (group_df.isnull().sum() / len(group_df)) * 100
        print("Missing values (%):")
        for col, pct in missing_pct.items():
            if pct > 0:
                print(f"  {col}: {pct:.2f}%")

        # Data types
        print("\nData types:")
        print(group_df.dtypes.value_counts())

        # For numeric columns, show basic stats
        numeric_cols = group_df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            print(f"\nNumeric columns ({len(numeric_cols)}): {list(numeric_cols)}")
            print("\nBasic statistics:")
            show_table(group_df[numeric_cols].describe())

# Column groups to assess, reported in the order given here.
column_groups = dict(
    demographics=demographic_cols,
    diagnoses=diagnosis_cols,
    autism_diagnoses=autism_diagnosis_cols,
    spq_scale=spq_cols,
    eq_scale=eq_cols,
    sqr_scale=sqr_cols,
    aq_scale=aq_cols,
    total_scores=total_cols,
    z_scores=zscore_cols,
    derived_variables=derived_cols,
)

# Print the quality report for every group of the cleaned frame.
assess_data_quality(df=df_clean, column_groups=column_groups)