# Imports

In [17]:
# Imports
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import FileLink

In [None]:
# Import Data
# Read the three input tables in a fixed order: the participant database,
# the CNV prediction output, and the table mapping sample IDs to participants.
df, cnv_df, id_map = [
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/Q1K/Input/Q1K-ECN-DB.csv',
        'Data/Q1K/Input/cnvpredition-output.csv',
        'Data/Q1K/Input/sample-id-map.csv',
    )
]

# Function to determine the family_member_type
def categorize_family_member_type(id_value):
    """Map a participant ID to a family role based on its last '_' segment.

    The segment after the final underscore decides the role:
    exactly 'P' -> 'Proband'; 'S', 'F' or 'M' followed only by digits ->
    'Sibling', 'Father' or 'Mother' respectively.

    Args:
        id_value: Participant ID as a string.

    Returns:
        The role name as a string, or ``pd.NA`` when the suffix matches
        none of the patterns above.
    """
    suffix = id_value.split('_')[-1]

    # The proband suffix is the bare letter, with no index number.
    if suffix == 'P':
        return 'Proband'

    # Every other role is a letter followed by a numeric index (e.g. 'S2').
    role_by_prefix = {'S': 'Sibling', 'F': 'Father', 'M': 'Mother'}
    for prefix, role in role_by_prefix.items():
        if suffix.startswith(prefix) and suffix[1:].isdigit():
            return role

    return pd.NA

# Create new column with family member type.
# Normalise the IDs (string values, no surrounding whitespace) *before*
# classifying them: the classifier compares the suffix exactly, so an ID
# padded with a trailing space (e.g. 'Q1K_001_P ') would otherwise be
# labelled <NA> even though it is a valid proband ID.
df['ParticipantID'] = df['ParticipantID'].astype('str').str.strip()
df['family_member_type'] = df['ParticipantID'].apply(categorize_family_member_type)

# Data manipulations

In [None]:
# Normalise column headers first, so the names used below ('ID',
# 'ParticipantID') match even when a CSV header carries stray whitespace.
cnv_df.columns = cnv_df.columns.str.strip()
df.columns = df.columns.str.strip()

# Merge participantID to the genetic data.
# Guarded so that re-running this cell does not merge a second time, which
# would create ParticipantID_x / ParticipantID_y and break the lines below.
if 'ParticipantID' not in cnv_df.columns:
    cnv_df = cnv_df.merge(id_map.rename(columns=str.strip), on='ID', how='left')

# Force ParticipantID to be a whitespace-free string on both sides of the
# upcoming join so the keys compare equal.
cnv_df['ParticipantID'] = cnv_df['ParticipantID'].astype(str).str.strip()
df['ParticipantID'] = df['ParticipantID'].astype(str).str.strip()

In [15]:
# Select genetic columns of interest
selected_columns = ['ParticipantID', 'NVIQ_CIupr', 'ORASD_upr', 'SRS_CIupr', 'PdN_CIupr', 'sum_NON_DNM_LOEUF_complete']

# Samples without a mapped participant have no usable key: it is either
# missing or the literal 'nan' produced by astype(str). Exclude them, since
# merge() matches equal keys (including missing ones) and would attach these
# unrelated samples to every participant whose own ID is missing.
has_participant_id = cnv_df['ParticipantID'].notna() & cnv_df['ParticipantID'].ne('nan')
cnv_selected = cnv_df.loc[has_participant_id, selected_columns]

# Replace the pipeline's short column codes with readable labels.
cnv_selected = cnv_selected.rename(
    columns={
        'NVIQ_CIupr': 'Estimated loss of Non-Verbal Intelligence Quotient',
        'ORASD_upr': 'Estimated odds ratio for autism',
        'SRS_CIupr': 'Estimated gain of raw score of Social Responsiveness Scale',
        'PdN_CIupr': 'Estimated probability of being de novo',
        'sum_NON_DNM_LOEUF_complete': 'Total constraint score burden'
    }
)

# Merge.
# Drop any copies of the CNV columns left behind by a previous run of this
# cell, so re-running it replaces them instead of adding _x / _y duplicates.
# NOTE(review): a participant with several CNV rows still yields several
# rows in df after the left join; confirm that is intended.
cnv_value_columns = [col for col in cnv_selected.columns if col != 'ParticipantID']
df = df.drop(columns=cnv_value_columns, errors='ignore').merge(
    cnv_selected, on='ParticipantID', how='left'
)

## Download DB as CSV

In [19]:
# Write the preprocessed table to disk and show a download link for it.
output_csv = "Data/Q1K/Output/Q1K-ECN-DB-preprocessed.csv"
# Create the output folder on first use so the cell also works on a fresh
# checkout; to_csv does not create missing directories.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
# Last expression of the cell, so the notebook renders the link.
FileLink(output_csv)

# Data Visualization

## Demographics

In [None]:
family_member_groups = ['Proband', 'Sibling', 'Mother', 'Father']

# savefig does not create missing folders; make sure the target exists so
# the cell also runs on a fresh checkout.
Path('Output/Q1K-Demographics').mkdir(parents=True, exist_ok=True)

# Create family_member_type bar chart.
# The column holds category labels, so count each label explicitly instead
# of binning the strings with hist(bins=10), which spreads 10 bins over
# 4 labels and draws the bars off their tick marks.
family_member_counts = (
    df['family_member_type']
    .value_counts()  # missing values are dropped by default
    .reindex(family_member_groups, fill_value=0)  # fixed order, keep empty groups
)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(family_member_counts.index, family_member_counts.values, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Family Member Type')
ax.set_xlabel('Family Member Type')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.savefig('Output/Q1K-Demographics/family_member_type_distribution.png')
# Close (not just clear) the figure so it is released instead of piling up.
plt.close(fig)

# Create AGE histograms for each family_member group
for group in family_member_groups:
    subset = df[df['family_member_type'] == group]

    mean_age = subset['Age at EEG (years)'].mean()
    median_age = subset['Age at EEG (years)'].median()
    min_age = subset['Age at EEG (years)'].min()
    max_age = subset['Age at EEG (years)'].max()

    # Summary statistics shown in the legend box of the figure.
    stats_text = f"Mean: {mean_age:.2f}\nMedian: {median_age:.2f}\nMin: {min_age}\nMax: {max_age}"

    fig, ax = plt.subplots(figsize=(8, 5))
    # Attach the statistics as the histogram's own label so the legend is
    # tied to the artist explicitly rather than by positional order.
    ax.hist(
        subset['Age at EEG (years)'].dropna(),
        bins=10,
        edgecolor='black',
        alpha=0.7,
        label=stats_text,
    )
    ax.set_title(f'Age Distribution for {group}')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.legend(loc='upper right', fontsize=10, frameon=True)

    fig.savefig(f'Output/Q1K-Demographics/age_distribution_{group}.png')
    plt.close(fig)

# Create GENDER pie charts for each family_member group
for group in family_member_groups:
    subset = df[df['family_member_type'] == group]

    # Count occurrences of each gender category
    gender_counts = subset['Sex at birth:'].value_counts()
    total_count = int(gender_counts.sum())

    # Create pie chart
    fig, ax = plt.subplots(figsize=(8, 5))
    wedges, texts, autotexts = ax.pie(
        gender_counts,
        labels=gender_counts.index,  # Keep only category names as labels
        # Show percentage and absolute value. The percentage handed to
        # autopct is a rounded float, so converting it back to a count must
        # round to nearest; truncating with int() turns e.g. 6.9999 into 6.
        autopct=lambda pct, total=total_count: f'{pct:.1f}% ({int(round(float(pct) * total / 100))})',
        startangle=90,
        wedgeprops={'edgecolor': 'black'},
    )

    ax.set_title(f'Sex at Birth Distribution for {group}')

    # Add legend
    ax.legend(wedges, gender_counts.index, title="Sex at Birth", loc="upper right")

    # Save the figure, then release it
    fig.savefig(f'Output/Q1K-Demographics/gender_distribution_{group}.png')
    plt.close(fig)

In [None]:
# savefig does not create missing folders; make sure the target exists so
# the cell also runs on a fresh checkout.
Path('Output/Q1K-Demographics').mkdir(parents=True, exist_ok=True)

# Filter relevant genetic status categories.
# Missing entries are held as real NaN values, not as the string 'NaN', so
# the 'NaN' category could never match. Label missing entries explicitly
# (on a copy; df itself is left untouched) so they are counted as a category.
genetic_status_labelled = df.assign(
    **{'Genetic Status': df['Genetic Status'].fillna('NaN')}
)
genetic_status_filtered = genetic_status_labelled[
    genetic_status_labelled['Genetic Status'].isin(['Normal', 'Abnormal', 'VUS', 'NaN'])
]

# Count occurrences of each Genetic Status
genetic_status_counts = genetic_status_filtered['Genetic Status'].value_counts()

# Count occurrences of Genetic Abnormality Type within "Abnormal" and "VUS"
# (rows = status, columns = abnormality type). Rows without an abnormality
# type are left out by groupby, so the stacks can be lower than the total.
abnormal_vus_counts = (
    genetic_status_filtered[
        genetic_status_filtered['Genetic Status'].isin(['Abnormal', 'VUS'])
    ]
    .groupby(['Genetic Status', 'Genetic Abnormality Type'])
    .size()
    .unstack(fill_value=0)
)

# Create stacked bar chart
fig, ax = plt.subplots(figsize=(12, 8))

# Plot total count of Genetic Status
ax.bar(genetic_status_counts.index, genetic_status_counts.values, color='gray', alpha=0.5, label="Total Count")

# Overlay with breakdown of Abnormality Types within "Abnormal" and "VUS"
bottoms = pd.Series(0, index=abnormal_vus_counts.index)  # Initialize bottom positions

for abnormality in abnormal_vus_counts.columns:
    ax.bar(abnormal_vus_counts.index, abnormal_vus_counts[abnormality], label=abnormality, bottom=bottoms)
    bottoms = bottoms + abnormal_vus_counts[abnormality]  # Update bottom positions for stacking

# Labels and title
ax.set_ylabel("Count")
ax.set_title("Genetic Status and Abnormality Type Distribution")
# Anchor the legend's upper-LEFT corner just outside the axes; anchoring the
# upper-right corner at x=1.05 drew the legend on top of the bars.
ax.legend(title="Genetic Abnormality Type", bbox_to_anchor=(1.05, 1), loc="upper left")
ax.tick_params(axis='x', labelrotation=45)

# Save figure; 'tight' keeps the outside legend inside the saved image.
fig.savefig('Output/Q1K-Demographics/genetics.png', bbox_inches='tight')
plt.close(fig)  # Release the figure after saving
