In [5]:
import sqlite3
import pandas as pd

# Connect to database
conn = sqlite3.connect('../data/mitochondria_deletions.db')

# Query 1: Show all samples
print("=== ALL SAMPLES ===")
df = pd.read_sql("SELECT * FROM samples LIMIT 5", conn)
print(df)

# Query 2: Average deletion burden by diagnosis
print("\n=== DELETION BURDEN BY DIAGNOSIS ===")
df = pd.read_sql("""
    SELECT diagnosis,
           COUNT(*) as n_samples,
           ROUND(AVG(top30_cumulative_deletion_pct), 3) as avg_deletion,
           ROUND(MIN(top30_cumulative_deletion_pct), 3) as min_deletion,
           ROUND(MAX(top30_cumulative_deletion_pct), 3) as max_deletion
    FROM samples
    GROUP BY diagnosis
    ORDER BY avg_deletion DESC
""", conn)
print(df)

# Query 3: Top 5 most abundant deletions
print("\n=== TOP 5 DELETIONS ===")
df = pd.read_sql("""
    SELECT deletion_name,
           COUNT(CASE WHEN deletion_read_pct > 0 THEN 1 END) as detected_in_samples,
           ROUND(AVG(deletion_read_pct), 4) as avg_pct
    FROM deletions
    WHERE deletion_read_pct > 0
    GROUP BY deletion_name
    ORDER BY avg_pct DESC
    LIMIT 5
""", conn)
print(df)

conn.close()


=== ALL SAMPLES ===
  sample_id diagnosis sex  age age_group  top30_cumulative_deletion_pct
0    UCI_10      CTRL   F   45     30-50                       3.014308
1    UCI_11      CTRL   M   77       70+                       2.885805
2    UCI_12       SCZ   M   49     30-50                       6.123169
3    UCI_13       SCZ   M   38     30-50                       2.014756
4    UCI_14      CTRL   M   59     50-70                       3.948052

=== DELETION BURDEN BY DIAGNOSIS ===
  diagnosis  n_samples  avg_deletion  min_deletion  max_deletion
0      CTRL         15         3.142         1.396         5.254
1       SCZ         15         2.268         0.452         6.123

=== TOP 5 DELETIONS ===
  deletion_name  detected_in_samples  avg_pct
0    6335_13999                   29   0.4518
1    6545_13846                   29   0.4066
2    6329_13994                   24   0.3215
3    8471_13449                   28   0.2865
4    7816_14807                   28   0.1978


In [6]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../data/mitochondria_deletions.db')

query_age = """
SELECT age_group,
       COUNT(*) AS n_samples,
       ROUND(AVG(top30_cumulative_deletion_pct), 3) AS avg_deletion,
       ROUND(MIN(top30_cumulative_deletion_pct), 3) AS min_deletion,
       ROUND(MAX(top30_cumulative_deletion_pct), 3) AS max_deletion
FROM samples
GROUP BY age_group
ORDER BY age_group;
"""

df_age = pd.read_sql(query_age, conn)
conn.close()

df_age


Unnamed: 0,age_group,n_samples,avg_deletion,min_deletion,max_deletion
0,30-50,13,2.444,1.339,6.123
1,50-70,13,3.349,1.253,5.632
2,70+,1,2.886,2.886,2.886
3,<30,3,0.983,0.452,1.754
