In [None]:
# 01 Data Preparation

## Initial inspection and light cleaning of the main dataset.

# NOTE(review): the triple-quoted text below is a bare string expression, not a
# comment. As the last expression of this cell its value is echoed as the cell
# output. This header looks like it was meant to be a Markdown cell -- confirm.
'''python
- Loads the dataset.
- Displays shape, columns, dtypes.
- Checks for missing values.
- Checks value distributions for key columns.
- Outputs summary statistics.'''

In [None]:
import pandas as pd
import numpy as np
import os

# Location of the cleaned dataset (CSV).
# Set the DATA_C4_CLEAN_PATH environment variable to point at a copy stored
# elsewhere; when it is unset or empty, the original local path is used so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get('DATA_C4_CLEAN_PATH') or (
    '/Users/eb2007/Library/CloudStorage/OneDrive-UniversityofCambridge/Documents/PhD/data/data_c4_clean.csv'
)

# Load dataset
df = pd.read_csv(DATA_PATH)
print(f'Shape: {df.shape}')
# Bare last expression so the notebook renders the first rows as a table.
df.head()

In [None]:
# Show the column names first, then the dtype pandas assigned to each column.
print('Columns:', df.columns.tolist())
print('Data types:', df.dtypes, sep='\n')

In [None]:
# Function to print % missing per column and flag issues
def missing_report(data: pd.DataFrame) -> None:
    """Print the share of missing values per column and flag odd column names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        All findings are reported through ``print``.

    Notes
    -----
    A column name is reported as suspicious when it is blank, starts with
    ``unnamed`` (case-insensitive), or repeats the name of an earlier column.
    """
    # Percentage of null cells in each column.
    missing = data.isnull().mean() * 100
    flagged = missing[missing > 0]
    print('Missing values (%):')
    print(missing)
    if not flagged.empty:
        print('Columns with missing values:')
        print(flagged)
    else:
        print('No missing values detected.')

    # Flag suspicious column names.
    # The duplicate mask is computed once and applied per column, so a single
    # duplicated name no longer marks every column as suspicious.
    duplicate_mask = data.columns.duplicated()
    suspicious = []
    for col, is_duplicate in zip(data.columns, duplicate_mask):
        # Column labels are not guaranteed to be strings (e.g. integer labels).
        label = str(col)
        if is_duplicate or label.strip() == '' or label.lower().startswith('unnamed'):
            suspicious.append(col)
    if suspicious:
        print('Suspicious column names:', suspicious)
    else:
        print('No suspicious column names detected.')

# Run the missing-value / column-name report on the loaded dataset; results are printed.
missing_report(df)

In [None]:
# Descriptive statistics for all columns (include='all'), transposed so each
# dataset column becomes one row of the displayed table.
df.describe(include='all').transpose()

In [None]:
# Frequency table (NaN included) for each key column that is present in df;
# columns absent from the dataset are reported and skipped.
key_columns = ['AQ', 'EQ', 'SQ', 'SPQ', 'diagnosis', 'comorbidities']
for col in key_columns:
    if col not in df.columns:
        print(f'Column {col} not found in dataset.')
        continue
    print(f'\nValue counts for {col}:')
    print(df[col].value_counts(dropna=False))