In [None]:
import pandas as pd

In [None]:
# Base folder that every input and output path in this notebook is built from.
directory = 'drive/MyDrive/Data'
# Cohort label: used as the file-name prefix and as the key into cohort_totals.
cohort_name = 'AMP_PD'

In [None]:
# Columns retained from the ANNOVAR multianno table.
columns_to_extract = [
    'Chr', 'Start', 'End', 'Ref', 'Alt',
    'Func.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
    'CADD_phred', 'CLNSIG',
    'gnomad41_genome_AF', 'gnomad41_genome_AF_asj', 'gnomad41_genome_AF_nfe',
    'avsnp151',
]

# Read the tab-separated annotation file and keep only those columns,
# in the order listed above.
annovar_path = f'{directory}/{cohort_name}_withgnomad_clinvar.annovar.hg38_multianno.txt'
annovar_df = pd.read_csv(annovar_path, sep='\t').loc[:, columns_to_extract]

# Show the trimmed frame (long frames are truncated by the notebook display).
annovar_df

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,CADD_phred,CLNSIG,gnomad41_genome_AF,gnomad41_genome_AF_asj,gnomad41_genome_AF_nfe,avsnp151
0,12,40225008,40225008,G,C,UTR5,NM_198578:c.-124G>C,.,.,.,Benign/Likely_benign,0.0388,0.0023,0.0006,rs112643657
1,12,40225111,40225111,G,A,UTR5,NM_198578:c.-21G>A,.,.,.,.,5.256e-05,0,0.0001,rs41286470
2,12,40225176,40225176,T,C,exonic,.,synonymous SNV,LRRK2:NM_198578:exon1:c.T45C:p.T15T,.,Likely_benign,0.0002,0,0.0004,rs142399623
3,12,40225280,40225280,G,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon1:c.G149A:p.R50H,1.908,Benign,0.9720,1.0000,0.9998,rs2256408
4,12,40225311,40225311,T,C,intronic,.,.,.,.,.,7.884e-05,0,0.0001,rs200330027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294,12,40369167,40369167,G,T,UTR3,NM_198578:c.*1402G>T,.,.,.,Conflicting_classifications_of_pathogenicity,0.0011,0,0.0015,rs201559861
1295,12,40369188,40369188,T,A,UTR3,NM_198578:c.*1423T>A,.,.,.,.,0.0001,0,0.0002,rs199603996
1296,12,40369194,40369194,C,G,UTR3,NM_198578:c.*1429C>G,.,.,.,Likely_benign,9.902e-05,0,0,rs142051987
1297,12,40369229,40369229,T,G,UTR3,NM_198578:c.*1464T>G,.,.,.,Benign,0.0096,0,7.38e-05,rs17491828


In [None]:
# Paths to the two Fisher association outputs for this cohort: the "_MAF" file
# supplies the frequency columns (F_A / F_U), the other the count columns
# (C_A / C_U).
freq_path = f'{directory}/{cohort_name}_MAF.assoc.fisher'
counts_path = f'{directory}/{cohort_name}.assoc.fisher'

# Both files are whitespace-delimited. The separator is written as a raw
# string because '\s' in a normal string literal is an invalid escape sequence
# that Python warns about.
freq_df = pd.read_csv(freq_path, sep=r'\s+')
counts_df = pd.read_csv(counts_path, sep=r'\s+')

# Format data types to match
freq_df['SNP'] = freq_df['SNP'].astype(str)
counts_df['SNP'] = counts_df['SNP'].astype(str)

# Inner-join counts and frequencies on the SNP identifier.
# NOTE(review): the join key is SNP alone, so repeated SNP ids would multiply
# rows; confirm the ids are unique in both files.
MAF_and_counts_df = pd.merge(counts_df[['CHR', 'SNP', 'BP', 'A1', 'A2', 'C_A', 'C_U']],
                             freq_df[['SNP', 'F_A', 'F_U']],
                             on='SNP')

# Display the first few rows of the merged DataFrame
MAF_and_counts_df.head()


Unnamed: 0,CHR,SNP,BP,A1,A2,C_A,C_U,F_A,F_U
0,12,rs112643657,40225008,C,G,4,5,0.001019,0.000808
1,12,rs41286470,40225111,A,G,0,0,0.0,0.0
2,12,rs142399623,40225176,C,T,3,2,0.000764,0.000323
3,12,rs2256408,40225280,G,A,2,1,0.000509,0.000162
4,12,rs200330027,40225311,C,T,0,2,0.0,0.000323


In [None]:
# Report how many variants are present after the SNP-keyed merge of the
# counts and frequency tables.
print(f'Number of rows: {len(MAF_and_counts_df)}')

Number of rows: 1161


In [None]:
# Join the ANNOVAR annotations to the cohort counts on chromosome, position
# and alleles: Chr/Start/Ref/Alt must equal CHR/BP/A2/A1 respectively.
# The chromosome and allele keys are cast to str so both sides compare alike.
MAF_and_counts_df = MAF_and_counts_df.astype({'CHR': str, 'A1': str, 'A2': str})
annovar_df = annovar_df.astype({'Chr': str, 'Ref': str, 'Alt': str})
if cohort_name == 'UKBB':
    # For this cohort, remove the 'chr' text from the chromosome labels.
    annovar_df['Chr'] = annovar_df['Chr'].str.replace('chr', '')

# Default (inner) join: count-table rows whose A1/A2 are in the opposite
# orientation to Ref/Alt find no partner and are left out. The count-table
# key columns duplicate the ANNOVAR ones, so they are dropped afterwards.
merged_df = (
    annovar_df
    .merge(
        MAF_and_counts_df,
        left_on=['Chr', 'Start', 'Ref', 'Alt'],
        right_on=['CHR', 'BP', 'A2', 'A1'],
    )
    .drop(columns=['CHR', 'BP', 'A1', 'A2'])
)

# Preview the joined table.
merged_df.head()

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,CADD_phred,CLNSIG,gnomad41_genome_AF,gnomad41_genome_AF_asj,gnomad41_genome_AF_nfe,avsnp151,SNP,C_A,C_U,F_A,F_U
0,12,40225008,40225008,G,C,UTR5,NM_198578:c.-124G>C,.,.,.,Benign/Likely_benign,0.0388,0.0023,0.0006,rs112643657,rs112643657,4,5,0.001019,0.000808
1,12,40225111,40225111,G,A,UTR5,NM_198578:c.-21G>A,.,.,.,.,5.256e-05,0.0,0.0001,rs41286470,rs41286470,0,0,0.0,0.0
2,12,40225176,40225176,T,C,exonic,.,synonymous SNV,LRRK2:NM_198578:exon1:c.T45C:p.T15T,.,Likely_benign,0.0002,0.0,0.0004,rs142399623,rs142399623,3,2,0.000764,0.000323
3,12,40225311,40225311,T,C,intronic,.,.,.,.,.,7.884e-05,0.0,0.0001,rs200330027,rs200330027,0,2,0.0,0.000323
4,12,40225382,40225382,C,A,intronic,.,.,.,.,.,0.0016,0.0,0.0008,rs80220210,rs80220210,3,5,0.000764,0.000808


In [None]:
# Load the domain coordinate table (sheet "Sheet2" of LRRK2_GRCh38.xlsx).
# Later cells use its wide_start / wide_end columns to assign each variant
# to a domain by position.
domain_df = pd.read_excel(f'{directory}/LRRK2_GRCh38.xlsx', sheet_name="Sheet2")

domain_df.head()

Unnamed: 0,Chromosome,Gene,Domain,Column1,Column12,narrow_start,narrow_end,wide_start,wide_end
0,12,LRRK2,ARM,100-688,100-688,40225234,40274891,40225153,40277914
1,12,LRRK2,ANK,688-863,688-863,40277916,40284040,40277915,40284043
2,12,LRRK2,LRR,943-1309,943-1309,40294854,40305881,40294854,40305881
3,12,LRRK2,RocCOR,1327-1842,1327-1842,40309151,40313983,40308486,40322084
4,12,LRRK2,Kinase,1879-2138,1879-2138,40323303,40340349,40323249,40351571


In [None]:
def annotate_domain(row, domain_df):
    """Return the domain whose wide interval contains the variant position.

    Args:
        row: Mapping-like variant record providing a 'Start' position.
        domain_df: Table with 'Domain', 'wide_start' and 'wide_end' columns.

    Returns:
        The 'Domain' value of the first table row (in table order) for which
        wide_start <= Start <= wide_end, or 'Unknown' when no row matches.
    """
    return next(
        (
            domain['Domain']
            for domain in domain_df.to_dict('records')
            if domain['wide_start'] <= row['Start'] <= domain['wide_end']
        ),
        'Unknown',
    )

# Tag every variant with its domain ('Unknown' when no interval contains it).
merged_df['Domain'] = merged_df.apply(
    lambda variant: annotate_domain(variant, domain_df),
    axis=1,
)

# Tally the variants per domain and print the tally.
domain_counts = merged_df['Domain'].value_counts()
print(domain_counts)

Domain
ARM        418
Kinase     192
WD40       131
Unknown    130
RocCOR     113
LRR         91
ANK         56
Name: count, dtype: int64


In [None]:
# Report how many variants survived the annotation merge.
print(f'Number of rows: {len(merged_df)}')

# List count-table variants whose SNP id does not appear in the merged table.
# At this point UTR5, intronic and UTR3 variants have not been removed yet.
snp_was_merged = MAF_and_counts_df['SNP'].isin(merged_df['SNP'])
unmatched_rows = MAF_and_counts_df.loc[~snp_was_merged]
unmatched_rows.head()

Number of rows: 1131


Unnamed: 0,CHR,SNP,BP,A1,A2,C_A,C_U,F_A,F_U
3,12,rs2256408,40225280,G,A,2,1,0.000509,0.000162
9,12,rs2723273,40225499,G,A,2,1,0.000509,0.000162
16,12,rs2256286,40225740,A,G,2,1,0.000509,0.000162
49,12,rs2708439,40229505,T,A,2,1,0.000509,0.000162
53,12,rs2708440,40229928,C,T,2,1,0.00051,0.000162


In [None]:
# Drop variants annotated as UTR5, intronic or UTR3 and report what is left.
excluded_functions = ['UTR5', 'intronic', 'UTR3']
merged_df = merged_df.loc[~merged_df['Func.refGene'].isin(excluded_functions)]
print(f'Number of rows: {len(merged_df)}')

# Recompute the unmatched set against the filtered table; it now also
# contains the variants removed by the filter above.
unmatched_rows = MAF_and_counts_df.loc[~MAF_and_counts_df['SNP'].isin(merged_df['SNP'])]

# Per-domain tally after the filter.
domain_counts = merged_df['Domain'].value_counts()
print(domain_counts)

Number of rows: 164
Domain
ARM        39
RocCOR     35
WD40       22
LRR        20
Kinase     17
Unknown    16
ANK        15
Name: count, dtype: int64


In [None]:
# Keep only variants that were assigned to a named domain.
merged_df = merged_df.loc[merged_df['Domain'].ne('Unknown')]
print(f'Number of rows: {len(merged_df)}')

Number of rows: 148


In [None]:
# Make the headers more legible.
# Maps each ANNOVAR column name to the human-readable header used from here
# on; later cells select columns by these exact strings.
# NOTE(review): CLNSIG is labelled "(Varsome)" although the input file name
# contains "clinvar" - confirm the intended source before publishing.
header_mapping = {
    'Chr': 'Chromosome',
    'Start': 'Start Position',
    'End': 'End Position',
    'Ref': 'Reference Allele',
    'Alt': 'Alternate Allele',
    'Func.refGene': 'Functional Annotation',
    'GeneDetail.refGene': 'outside gene cDNA Change',
    'ExonicFunc.refGene': 'Exonic Function',
    'AAChange.refGene': 'cDNA and Amino Acid Change',
    'CADD_phred': 'CADD v1.7 Phred Score',
    'CLNSIG': 'Clinical Significance (Varsome)',
    'gnomad41_genome_AF': 'gnomAD Allele Frequency',
    'gnomad41_genome_AF_asj': 'gnomAD AF Ashkenazi',
    'gnomad41_genome_AF_nfe': 'gnomAD AF Non Finnish European',
    'avsnp151': 'rsID'
}
merged_df = merged_df.rename(columns=header_mapping)

# Display the first few rows of the renamed DataFrame
merged_df.head()

Unnamed: 0,Chromosome,Start Position,End Position,Reference Allele,Alternate Allele,Functional Annotation,outside gene cDNA Change,Exonic Function,cDNA and Amino Acid Change,CADD v1.7 Phred Score,...,gnomAD Allele Frequency,gnomAD AF Ashkenazi,gnomAD AF Non Finnish European,rsID,SNP,C_A,C_U,F_A,F_U,Domain
2,12,40225176,40225176,T,C,exonic,.,synonymous SNV,LRRK2:NM_198578:exon1:c.T45C:p.T15T,.,...,0.0002,0.0,0.0004,rs142399623,rs142399623,3,2,0.000764,0.000323,ARM
8,12,40225580,40225580,T,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon2:c.T177A:p.N59K,15.63,...,9.855e-05,0.0014,7.349e-05,rs150422099,rs150422099,1,0,0.000255,0.0,ARM
9,12,40225628,40225628,G,A,exonic,.,synonymous SNV,LRRK2:NM_198578:exon2:c.G225A:p.A75A,.,...,0.0015,0.0,0.0,rs75054132,rs75054132,0,0,0.0,0.0,ARM
66,12,40232345,40232345,G,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.G309C:p.Q103H,12.64,...,0.0001,0.0,2.94e-05,rs200926937,rs200926937,0,0,0.0,0.0,ARM
67,12,40232380,40232380,A,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.A344C:p.H115P,17.96,...,0.0001,0.0009,2.94e-05,rs201439315,rs201439315,0,2,0.0,0.000323,ARM


In [None]:
# Rename the case/control count and frequency columns; the OR helpers defined
# later read 'N (Case)' and 'N (Control)' by these exact names.
header_mapping_2 = {
    'C_A': 'N (Case)',
    'C_U': 'N (Control)',
    'F_A': 'MAF (Case)',
    'F_U': 'MAF (Control)'
}

merged_df = merged_df.rename(columns=header_mapping_2)
merged_df

Unnamed: 0,Chromosome,Start Position,End Position,Reference Allele,Alternate Allele,Functional Annotation,outside gene cDNA Change,Exonic Function,cDNA and Amino Acid Change,CADD v1.7 Phred Score,...,gnomAD Allele Frequency,gnomAD AF Ashkenazi,gnomAD AF Non Finnish European,rsID,SNP,N (Case),N (Control),MAF (Case),MAF (Control),Domain
2,12,40225176,40225176,T,C,exonic,.,synonymous SNV,LRRK2:NM_198578:exon1:c.T45C:p.T15T,.,...,0.0002,0,0.0004,rs142399623,rs142399623,3,2,0.000764,0.000323,ARM
8,12,40225580,40225580,T,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon2:c.T177A:p.N59K,15.63,...,9.855e-05,0.0014,7.349e-05,rs150422099,rs150422099,1,0,0.000255,0.000000,ARM
9,12,40225628,40225628,G,A,exonic,.,synonymous SNV,LRRK2:NM_198578:exon2:c.G225A:p.A75A,.,...,0.0015,0,0,rs75054132,rs75054132,0,0,0.000000,0.000000,ARM
66,12,40232345,40232345,G,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.G309C:p.Q103H,12.64,...,0.0001,0,2.94e-05,rs200926937,rs200926937,0,0,0.000000,0.000000,ARM
67,12,40232380,40232380,A,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.A344C:p.H115P,17.96,...,0.0001,0.0009,2.94e-05,rs201439315,rs201439315,0,2,0.000000,0.000323,ARM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,12,40365042,40365042,C,T,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon49:c.C7382T:p.A2461V,14.46,...,2.633e-05,0,2.945e-05,rs201037550,rs201037550,0,0,0.000000,0.000000,WD40
1092,12,40367044,40367044,C,T,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon50:c.C7429T:p.R2477W,8.486,...,0.0004,0,1.477e-05,rs138780308,rs138780308,0,0,0.000000,0.000000,WD40
1093,12,40367045,40367045,G,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon50:c.G7430A:p.R2477Q,10.35,...,0.0001,0,0.0002,rs146428335,rs146428335,0,0,0.000000,0.000000,WD40
1099,12,40367663,40367663,C,T,exonic,.,synonymous SNV,LRRK2:NM_198578:exon51:c.C7482T:p.T2494T,.,...,5.944e-05,0,7.393e-05,rs140665554,rs140665554,0,1,0.000000,0.000162,WD40


In [None]:
# Split the combined annotation (e.g. 'LRRK2:NM_198578:exon1:c.T45C:p.T15T')
# into separate 'cDNA' and 'Amino Acid Change' columns.
def split_cdna_and_aa_change(annotation):
    """Return ``(cDNA, amino_acid_change)`` from the last two ':' fields.

    Args:
        annotation: The combined annotation string, or a missing value.

    Returns:
        A 2-tuple. A missing value is passed through unchanged in both
        positions. A string without any ':' (such as the '.' placeholder)
        is returned as the cDNA field with ``None`` as the amino-acid change.
    """
    if pd.isna(annotation):
        return annotation, annotation
    fields = annotation.split(':')[-2:]
    return fields[0], (fields[1] if len(fields) > 1 else None)


# Assign both columns as whole columns so they always exist (even when the
# frame is empty) and write to an independent copy instead of growing a
# filtered slice cell by cell.
cdna_aa_pairs = [
    split_cdna_and_aa_change(value)
    for value in merged_df['cDNA and Amino Acid Change']
]
merged_df = merged_df.copy()
merged_df['cDNA'] = [pair[0] for pair in cdna_aa_pairs]
merged_df['Amino Acid Change'] = [pair[1] for pair in cdna_aa_pairs]

merged_df

Unnamed: 0,Chromosome,Start Position,End Position,Reference Allele,Alternate Allele,Functional Annotation,outside gene cDNA Change,Exonic Function,cDNA and Amino Acid Change,CADD v1.7 Phred Score,...,gnomAD AF Non Finnish European,rsID,SNP,N (Case),N (Control),MAF (Case),MAF (Control),Domain,cDNA,Amino Acid Change
2,12,40225176,40225176,T,C,exonic,.,synonymous SNV,LRRK2:NM_198578:exon1:c.T45C:p.T15T,.,...,0.0004,rs142399623,rs142399623,3,2,0.000764,0.000323,ARM,c.T45C,p.T15T
8,12,40225580,40225580,T,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon2:c.T177A:p.N59K,15.63,...,7.349e-05,rs150422099,rs150422099,1,0,0.000255,0.000000,ARM,c.T177A,p.N59K
9,12,40225628,40225628,G,A,exonic,.,synonymous SNV,LRRK2:NM_198578:exon2:c.G225A:p.A75A,.,...,0,rs75054132,rs75054132,0,0,0.000000,0.000000,ARM,c.G225A,p.A75A
66,12,40232345,40232345,G,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.G309C:p.Q103H,12.64,...,2.94e-05,rs200926937,rs200926937,0,0,0.000000,0.000000,ARM,c.G309C,p.Q103H
67,12,40232380,40232380,A,C,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon3:c.A344C:p.H115P,17.96,...,2.94e-05,rs201439315,rs201439315,0,2,0.000000,0.000323,ARM,c.A344C,p.H115P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,12,40365042,40365042,C,T,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon49:c.C7382T:p.A2461V,14.46,...,2.945e-05,rs201037550,rs201037550,0,0,0.000000,0.000000,WD40,c.C7382T,p.A2461V
1092,12,40367044,40367044,C,T,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon50:c.C7429T:p.R2477W,8.486,...,1.477e-05,rs138780308,rs138780308,0,0,0.000000,0.000000,WD40,c.C7429T,p.R2477W
1093,12,40367045,40367045,G,A,exonic,.,nonsynonymous SNV,LRRK2:NM_198578:exon50:c.G7430A:p.R2477Q,10.35,...,0.0002,rs146428335,rs146428335,0,0,0.000000,0.000000,WD40,c.G7430A,p.R2477Q
1099,12,40367663,40367663,C,T,exonic,.,synonymous SNV,LRRK2:NM_198578:exon51:c.C7482T:p.T2494T,.,...,7.393e-05,rs140665554,rs140665554,0,1,0.000000,0.000162,WD40,c.C7482T,p.T2494T


In [None]:
# Remove variants whose count is zero in both cases and controls.
both_counts_zero = merged_df['N (Case)'].eq(0) & merged_df['N (Control)'].eq(0)

# Keep the excluded rows around and report how many there are.
excluded_df = merged_df.loc[both_counts_zero]
print(f'Number of excluded rows: {len(excluded_df)}')

merged_df = merged_df.loc[~both_counts_zero]


Number of excluded rows: 52


In [None]:
# Print the current column names; the column selection further down refers
# to these headers by exact string.
print(merged_df.columns)

Index(['Chromosome', 'Start Position', 'End Position', 'Reference Allele',
       'Alternate Allele', 'Functional Annotation', 'outside gene cDNA Change',
       'Exonic Function', 'cDNA and Amino Acid Change',
       'CADD v1.7 Phred Score', 'Clinical Significance (Varsome)',
       'gnomAD Allele Frequency', 'gnomAD AF Ashkenazi',
       'gnomAD AF Non Finnish European', 'rsID', 'SNP', 'N (Case)',
       'N (Control)', 'MAF (Case)', 'MAF (Control)', 'Domain', 'cDNA',
       'Amino Acid Change'],
      dtype='object')


In [None]:
# For rows whose "outside gene cDNA Change" value has at least three
# ':'-separated fields, copy the third field into that row's cDNA column
# and print the value that was stored.
for idx, gene_detail in merged_df['outside gene cDNA Change'].items():
    if pd.isna(gene_detail):
        continue
    fields = gene_detail.split(':')
    if len(fields) < 3:
        continue
    merged_df.at[idx, 'cDNA'] = fields[2]
    # print cDNA at index
    print(merged_df.at[idx, 'cDNA'])

In [None]:
# Select and order the columns of the final table:
# Domain, Chromosome, Start Position, Reference Allele, Alternate Allele,
# cDNA, Amino Acid Change, rsID, Functional Annotation, Exonic Function,
# N (Case), N (Control), MAF (Case), MAF (Control), CADD v1.7 Phred Score,
# Clinical Significance (Varsome), gnomAD Allele Frequency,
# gnomAD AF Ashkenazi, gnomAD AF Non Finnish European.
#
# .copy() makes the result an independent frame: the OR helpers add columns
# to the frame they receive, and doing that on a bare column selection of
# merged_df triggers pandas' SettingWithCopyWarning.
ordered_merged_df = merged_df[[
    'Domain', 'Chromosome', 'Start Position', 'Reference Allele',
    'Alternate Allele', 'cDNA', 'Amino Acid Change','rsID', 'Functional Annotation',
    'Exonic Function', 'N (Case)', 'N (Control)', 'MAF (Case)',
    'MAF (Control)', 'CADD v1.7 Phred Score', 'Clinical Significance (Varsome)',
    'gnomAD Allele Frequency', 'gnomAD AF Ashkenazi',
    'gnomAD AF Non Finnish European'
]].copy()

ordered_merged_df.head()

Unnamed: 0,Domain,Chromosome,Start Position,Reference Allele,Alternate Allele,cDNA,Amino Acid Change,rsID,Functional Annotation,Exonic Function,N (Case),N (Control),MAF (Case),MAF (Control),CADD v1.7 Phred Score,Clinical Significance (Varsome),gnomAD Allele Frequency,gnomAD AF Ashkenazi,gnomAD AF Non Finnish European
2,ARM,12,40225176,T,C,c.T45C,p.T15T,rs142399623,exonic,synonymous SNV,3,2,0.000764,0.000323,.,Likely_benign,0.0002,0.0,0.0004
8,ARM,12,40225580,T,A,c.T177A,p.N59K,rs150422099,exonic,nonsynonymous SNV,1,0,0.000255,0.0,15.63,Conflicting_classifications_of_pathogenicity,9.855e-05,0.0014,7.349e-05
67,ARM,12,40232380,A,C,c.A344C,p.H115P,rs201439315,exonic,nonsynonymous SNV,0,2,0.0,0.000323,17.96,Benign/Likely_benign,0.0001,0.0009,2.94e-05
92,ARM,12,40235634,T,C,c.T356C,p.L119P,rs33995463,exonic,nonsynonymous SNV,8,17,0.002038,0.002748,19.51,Conflicting_classifications_of_pathogenicity,0.0014,0.0,0.0023
93,ARM,12,40235642,C,T,c.C364T,p.L122L,rs41286468,exonic,synonymous SNV,2,1,0.000509,0.000162,.,Benign/Likely_benign,0.0003,0.0003,0.0005


In [None]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.meta_analysis import combine_effects

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# Case and control totals per cohort, keyed by cohort name. The OR helpers
# below subtract each variant's count from these totals.
cohort_totals = {
    name: {"cases": n_cases, "controls": n_controls}
    for name, n_cases, n_controls in (
        ("USA", 1251, 559),
        ("ISRAEL", 1091, 528),
        ("FRENCH", 1282, 2358),
        ("RUSSIA", 485, 442),
        ("UKBB", 2848, 62463),
        ("AMP_PD", 1931, 3062),
    )
}

def calculate_confidence_interval(a, b, c, d):
    """
    Calculate the 95% CI for the odds ratio using the normal approximation
    on the log scale.

    The 2x2 table cells are:
        a = case_count
        b = case_total - case_count
        c = control_count
        d = control_total - control_count

    Returns:
        (ci_lower, ci_upper). Both are NaN when any cell is zero or negative:
        the log odds ratio and its standard error are only defined for
        strictly positive cells, and a negative cell (count larger than the
        total) could otherwise produce a finite but meaningless interval.
    """
    cells = (a, b, c, d)
    if any(cell <= 0 for cell in cells):
        return np.nan, np.nan

    log_or = np.log((a * d) / (b * c))
    # Standard error of ln(OR): sqrt(1/a + 1/b + 1/c + 1/d)
    se = np.sqrt(sum(1 / cell for cell in cells))
    # 1.96 is the two-sided 95% normal quantile.
    ci_lower = np.exp(log_or - 1.96 * se)
    ci_upper = np.exp(log_or + 1.96 * se)
    return ci_lower, ci_upper

def calculate_odds_ratio_no_correction(data, cohort_name):
    """
    Calculate odds ratios without continuity correction for a specific cohort.

    For every row a 2x2 table is built from 'N (Case)' / 'N (Control)' and the
    cohort totals in ``cohort_totals``. The OR, an uncorrected chi-square
    P-value and a 95% CI are computed from it.

    Rows where any table cell is zero or negative are left blank (NaN) in all
    four output columns: the OR would be zero, infinite or undefined, and
    chi2_contingency raises on tables with negative cells or empty margins.
    A P-value of exactly 0 is also set to NaN.

    Args:
        data: DataFrame with 'N (Case)' and 'N (Control)' columns.
        cohort_name: Key into ``cohort_totals``.

    Returns:
        The same ``data`` object, with the columns 'OR', 'P-value',
        'OR 95% CI Lower' and 'OR 95% CI Upper' added in place.

    NOTE(review): counts and totals are subtracted directly, so they must be
    in the same unit (alleles vs. individuals) - confirm against the inputs.
    """
    case_total = cohort_totals[cohort_name]["cases"]
    control_total = cohort_totals[cohort_name]["controls"]

    odds_ratios = []
    p_values = []
    ci_lowers = []
    ci_uppers = []

    for case_count, control_count in zip(data['N (Case)'], data['N (Control)']):
        a = case_count
        b = case_total - case_count
        c = control_count
        d = control_total - control_count

        if any(cell <= 0 for cell in (a, b, c, d)):
            # Degenerate table: report nothing rather than 0 / inf / an error.
            odds_ratio = p = ci_lower = ci_upper = np.nan
        else:
            odds_ratio = (a * d) / (b * c)
            _, p, _, _ = chi2_contingency(np.array([[a, b], [c, d]]), correction=False)
            ci_lower, ci_upper = calculate_confidence_interval(a, b, c, d)
            if p == 0:
                p = np.nan

        odds_ratios.append(odds_ratio)
        p_values.append(p)
        ci_lowers.append(ci_lower)
        ci_uppers.append(ci_upper)

    data['OR'] = odds_ratios
    data['P-value'] = p_values
    data['OR 95% CI Lower'] = ci_lowers
    data['OR 95% CI Upper'] = ci_uppers
    return data

def calculate_odds_ratio_ratio_correction(data, cohort_name):
    """
    Calculate odds ratios with a ratio-based continuity correction for a
    specific cohort.

    The correction added to both case cells is case_total / total, and the
    one added to both control cells is control_total / total, where
    total = case_total + control_total. The OR, the uncorrected chi-square
    P-value and the 95% CI are all computed from the same corrected table, so
    the interval is centred (on the log scale) on the reported OR.

    Rows whose corrected table still contains a zero or negative cell (a
    count larger than its cohort total) get NaN in all four output columns.

    Args:
        data: DataFrame with 'N (Case)' and 'N (Control)' columns.
        cohort_name: Key into ``cohort_totals``.

    Returns:
        The same ``data`` object, with the columns
        'OR (Continuity Corrected)', 'P-value (Continuity Corrected)',
        'OR (Continuity Corrected) 95% CI Lower' and
        'OR (Continuity Corrected) 95% CI Upper' added in place.

    NOTE(review): counts and totals are subtracted directly, so they must be
    in the same unit (alleles vs. individuals) - confirm against the inputs.
    """
    case_total = cohort_totals[cohort_name]["cases"]
    control_total = cohort_totals[cohort_name]["controls"]
    total = case_total + control_total

    # Ratio-based correction factors (they sum to 1 across the two groups).
    case_correction = case_total / total
    control_correction = control_total / total

    odds_ratios = []
    p_values = []
    ci_lowers = []
    ci_uppers = []

    for case_count, control_count in zip(data['N (Case)'], data['N (Control)']):
        a = case_count + case_correction
        b = (case_total - case_count) + case_correction
        c = control_count + control_correction
        d = (control_total - control_count) + control_correction

        if any(cell <= 0 for cell in (a, b, c, d)):
            odds_ratio = p = ci_lower = ci_upper = np.nan
        else:
            odds_ratio = (a * d) / (b * c)
            _, p, _, _ = chi2_contingency(np.array([[a, b], [c, d]]), correction=False)
            # Use the corrected (non-integer) cells directly: rounding or
            # clamping them would describe a different table than the one
            # the OR above was computed from.
            ci_lower, ci_upper = calculate_confidence_interval(a, b, c, d)

        odds_ratios.append(odds_ratio)
        p_values.append(p)
        ci_lowers.append(ci_lower)
        ci_uppers.append(ci_upper)

    data['OR (Continuity Corrected)'] = odds_ratios
    data['P-value (Continuity Corrected)'] = p_values
    data['OR (Continuity Corrected) 95% CI Lower'] = ci_lowers
    data['OR (Continuity Corrected) 95% CI Upper'] = ci_uppers
    return data

# Example usage:
# corrected_OR_df = calculate_odds_ratio_ratio_correction(ordered_merged_df, cohort_name)
# non_corrected_OR_df = calculate_odds_ratio_no_correction(corrected_OR_df, cohort_name)

# Now both corrected and non-corrected dataframes will have OR, P-value, and 95% CI columns.


In [None]:
# Calculate corrected and non-corrected ORs and P-values.
# Both helpers add their result columns to the frame they are given, so a
# copy is passed in: ordered_merged_df stays untouched and re-running this
# cell starts from the same input.
corrected_OR_df = calculate_odds_ratio_ratio_correction(ordered_merged_df.copy(), cohort_name)
non_corrected_OR_df = calculate_odds_ratio_no_correction(corrected_OR_df, cohort_name)

# The eight statistic columns, in the order they should appear.
or_columns = [
    'OR (Continuity Corrected)', 'P-value (Continuity Corrected)',
    'OR (Continuity Corrected) 95% CI Lower', 'OR (Continuity Corrected) 95% CI Upper',
    'OR', 'P-value',
    'OR 95% CI Lower', 'OR 95% CI Upper'
]

# Insert them directly after 'MAF (Control)'. The position is looked up by
# name rather than hard-coded, so it stays correct if columns are added or
# removed upstream.
annotation_columns = [col for col in non_corrected_OR_df.columns if col not in or_columns]
insert_at = annotation_columns.index('MAF (Control)') + 1
columns_to_reorder = (
    annotation_columns[:insert_at] +
    or_columns +
    annotation_columns[insert_at:]
)

# Reorder the DataFrame
ordered_OR_df = non_corrected_OR_df[columns_to_reorder]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['OR (Continuity Corrected)'] = odds_ratios
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['P-value (Continuity Corrected)'] = p_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['OR (Continuity Corrected) 95% CI Lower'] = ci_lowers


In [None]:
# Display the final table with the OR, P-value and CI columns in place.
ordered_OR_df

Unnamed: 0,Domain,Chromosome,Start Position,Reference Allele,Alternate Allele,cDNA,Amino Acid Change,rsID,Functional Annotation,Exonic Function,...,OR (Continuity Corrected) 95% CI Upper,OR,P-value,OR 95% CI Lower,OR 95% CI Upper,CADD v1.7 Phred Score,Clinical Significance (Varsome),gnomAD Allele Frequency,gnomAD AF Ashkenazi,gnomAD AF Non Finnish European
2,ARM,12,40225176,T,C,c.T45C,p.T15T,rs142399623,exonic,synonymous SNV,...,7.874314,2.380705,0.327252,0.39743,14.261039,.,Likely_benign,0.0002,0,0.0004
8,ARM,12,40225580,T,A,c.T177A,p.N59K,rs150422099,exonic,nonsynonymous SNV,...,25.389067,,,,,15.63,Conflicting_classifications_of_pathogenicity,9.855e-05,0.0014,7.349e-05
67,ARM,12,40232380,A,C,c.A344C,p.H115P,rs201439315,exonic,nonsynonymous SNV,...,5.083607,,,,,17.96,Benign/Likely_benign,0.0001,0.0009,2.94e-05
92,ARM,12,40235634,T,C,c.T356C,p.L119P,rs33995463,exonic,nonsynonymous SNV,...,1.622238,0.745159,0.492116,0.32096,1.730002,19.51,Conflicting_classifications_of_pathogenicity,0.0014,0,0.0023
93,ARM,12,40235642,C,T,c.C364T,p.L122L,rs41286468,exonic,synonymous SNV,...,11.278457,3.173665,0.319315,0.28757,35.025038,.,Benign/Likely_benign,0.0003,0.0003,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074,WD40,12,40364884,G,A,c.G7224A,p.M2408I,rs60545352,exonic,nonsynonymous SNV,...,8.750043,,,,,10.48,Uncertain_significance,2.633e-05,0,1.473e-05
1076,WD40,12,40364975,C,A,c.C7315A,p.L2439I,rs72547983,exonic,nonsynonymous SNV,...,8.750043,,,,,11.92,Uncertain_significance,1.975e-05,0,4.417e-05
1077,WD40,12,40364984,C,T,c.C7324T,p.R2442C,rs199893519,exonic,nonsynonymous SNV,...,8.750043,,,,,15.44,Uncertain_significance,1.317e-05,0,1.473e-05
1099,WD40,12,40367663,C,T,c.C7482T,p.T2494T,rs140665554,exonic,synonymous SNV,...,8.750043,,,,,.,Likely_benign,5.944e-05,0,7.393e-05


In [None]:
# Write the final table, including the OR / P-value / CI columns, to an Excel
# file in the data directory; the DataFrame index is not written.
# Disabled alternative export of ordered_merged_df:
#ordered_merged_df.to_excel(f'{directory}/{cohort_name}_annotated.xlsx', index=False)
ordered_OR_df.to_excel(f'{directory}/{cohort_name}_annotated_OR_CI.xlsx', index=False)