# Exploratory Data Analysis (EDA)

Dataset: `/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_raw.csv` (≈700K rows)

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings for this notebook: show up to 50 columns and use a
# 120-character output width when rendering frames.
display_options = {
    'display.max_columns': 50,
    'display.width': 120,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Path to the large CSV file
csv_path = '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_raw.csv'

# Read only the first 5000 rows so dtypes and layout can be previewed
# without loading the whole file.
sample = pd.read_csv(csv_path, nrows=5000)

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would render a stray "None".
sample.info()
display(sample.head())

In [None]:
# Attempt to read the entire CSV and print a first overview of it.
# The overview steps stay inside the try block, so a MemoryError raised by
# any of them (not just the read) ends up in the same fallback.
try:
    df = pd.read_csv(csv_path)

    # Size and layout
    print('Shape:', df.shape)
    print('Columns:', list(df.columns))
    print('Memory usage (MB):', df.memory_usage(deep=True).sum() / 1e6)

    # First and last rows
    display(df.head(), df.tail())

    # Per-column count of missing values
    print('Missing values per column:')
    print(df.isna().sum())

    # Summary statistics for every column, numeric or not
    display(df.describe(include='all'))
except MemoryError:
    # Fallback: leave df as None; cells that use df will not work after this.
    print('MemoryError: Consider loading with chunksize or specifying dtypes.')
    df = None

# Counting autism diagnoses

- This count is run on the raw data, before any cleaning.

In [None]:
# Candidate columns: diagnosis_0..diagnosis_8 and autism_diagnosis_0..autism_diagnosis_2
diagnosis_cols = [f'diagnosis_{i}' for i in range(9)]
autism_diag_cols = [f'autism_diagnosis_{i}' for i in range(3)]

# Keep only the columns that actually exist in the DataFrame
diagnosis_cols = [col for col in diagnosis_cols if col in df.columns]
autism_diag_cols = [col for col in autism_diag_cols if col in df.columns]

# The checks below are applied to the whole frame at once instead of calling
# a Python lambda per row with apply(axis=1), which is very slow on a
# dataset of this size. Codes are matched both as numbers and as strings
# because the column dtypes are not fixed at this point.

# Condition 1: any diagnosis column equals 2 (or '2')
diagnosis_values = df[diagnosis_cols]
diagnosis_autistic = (diagnosis_values.eq(2) | diagnosis_values.eq('2')).any(axis=1)

# Condition 2: any autism_diagnosis column is 1, 2 or 3 (or '1', '2', '3')
autism_diag_values = df[autism_diag_cols]
autism_diag_autistic = (
    autism_diag_values.isin([1, 2, 3]) | autism_diag_values.isin(['1', '2', '3'])
).any(axis=1)

# A participant is counted if either condition holds
autistic_participants = diagnosis_autistic | autism_diag_autistic

# Count
num_autistic = autistic_participants.sum()
print(f'Number of autistic participants: {num_autistic}')

# Exploring the cleaned dataset

In [None]:
# Load the cleaned data from its own CSV into df_clean
# (a separate variable from the raw df loaded earlier).
clean_csv_path = '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_clean.csv'
df_clean = pd.read_csv(clean_csv_path)

In [None]:
# Repeat the autistic-participant count on the cleaned data.
# Only columns that exist in df_clean are used.
diagnosis_cols = [f'diagnosis_{i}' for i in range(9) if f'diagnosis_{i}' in df_clean.columns]
autism_diag_cols = [f'autism_diagnosis_{i}' for i in range(3) if f'autism_diagnosis_{i}' in df_clean.columns]

# The checks are applied to the whole frame at once rather than row by row
# with apply(axis=1), which is very slow on large frames. Codes are matched
# both as numbers and as strings.

# Any diagnosis column equals 2 (or '2')
diagnosis_values = df_clean[diagnosis_cols]
diagnosis_autistic = (diagnosis_values.eq(2) | diagnosis_values.eq('2')).any(axis=1)

# Any autism_diagnosis column is 1, 2 or 3 (or '1', '2', '3')
autism_diag_values = df_clean[autism_diag_cols]
autism_diag_autistic = (
    autism_diag_values.isin([1, 2, 3]) | autism_diag_values.isin(['1', '2', '3'])
).any(axis=1)

# A participant is counted if either condition holds
autistic_participants = diagnosis_autistic | autism_diag_autistic
num_autistic = autistic_participants.sum()
print(f'Number of autistic participants in cleaned data: {num_autistic}')