This notebook prepares summary statistics files for later use as input to Popcorn, a program for estimating the correlation of causal variant effect sizes across populations, using the Pre-Confluence Project data.

In [1]:
import os
import pandas as pd

!dx ls

[1m[34m.Notebook_archive/[0m
[1m[34m.Notebook_snapshots/[0m
[1m[34mAim1_Heritability/[0m
[1m[34mAim2_Polygenicity/[0m
[1m[34mAim3_WithinAncestryGeneticCorrelation/[0m
[1m[34mAim4_CrossAncestryGeneticCorrelation/[0m
[1m[34mAim5_HeritabilityByFunctionalAnnotations/[0m
[1m[34mGCTB_LD_AA/[0m
[1m[34mGCTB_LD_AMR/[0m
[1m[34mGCTB_LD_BBJ/[0m
[1m[34mLD_Score/[0m
[1m[34mPopcorn/[0m
[1m[34mimages/[0m
[1m[34mworkflows/[0m
AA_ERneg_sumdata.txt
cloud_workstation_nextflow_snapshot
[1m[32mhello[0m
nextflow-job-Gf5VyJj4BzFv40fFvQ7v841G.log
nextflow-job-Gg85kqQ4BzFz411Yzvk46x05.log
nextflow-job-Gg863Xj4BzFfJ3qxjqGkk8KG.log
[1m[32mnf-gwas[0m


In [2]:
!dx cd
!dx cd Aim4_CrossAncestryGeneticCorrelation/jahagirdarob/sumstats_files

# Prepare Population Overall Breast Cancer Meta Summary Statistics Data

In [3]:
!ls "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update"     # lists contents of cleaned summary statistics folder

AA_Overall_HM3_01_2024.txt	 EUR_ERNeg_HM3_01.txt
AA_Overall_HM3_05_2024.txt	 EUR_ERNeg_HM3_05.txt
AA_Overall_HM3_MEGA_01_2024.txt  EUR_ERNeg_HM3_MEGA_01.txt
AA_Overall_HM3_MEGA_05_2024.txt  EUR_ERNeg_HM3_MEGA_05.txt
EAS_ERNeg_HM3_01.txt		 EUR_ERPos_HM3_01.txt
EAS_ERNeg_HM3_05.txt		 EUR_ERPos_HM3_05.txt
EAS_ERNeg_HM3_MEGA_01.txt	 EUR_ERPos_HM3_MEGA_01.txt
EAS_ERNeg_HM3_MEGA_05.txt	 EUR_ERPos_HM3_MEGA_05.txt
EAS_ERPos_HM3_01.txt		 EUR_Overall_HM3_01.txt
EAS_ERPos_HM3_05.txt		 EUR_Overall_HM3_05.txt
EAS_ERPos_HM3_MEGA_01.txt	 EUR_Overall_HM3_MEGA_01.txt
EAS_ERPos_HM3_MEGA_05.txt	 EUR_Overall_HM3_MEGA_05.txt
EAS_Overall_HM3_01.txt		 Latina_Overall_HM3_01.txt
EAS_Overall_HM3_05.txt		 Latina_Overall_HM3_05.txt
EAS_Overall_HM3_MEGA_01.txt	 Latina_Overall_HM3_MEGA_01.txt
EAS_Overall_HM3_MEGA_05.txt	 Latina_Overall_HM3_MEGA_05.txt


In [4]:
def sort_alleles(sumstats_df):
    """
    Puts the a1/a2 alleles of every row into alphabetical order so that the identity of a1 and a2
    is consistent across summary stat files regardless of how each file was created.

    For each row where a1 compares greater than a2, the two alleles are swapped, beta is negated and,
    when an AF column is present, AF is replaced by 1 - AF. All other rows are left as they are.

    sumstats_df: summary statistics pandas dataframe w/ a1, a2 and beta columns (AF is optional)

    returns the same sumstats_df object (modified in place) with sorted allele columns a1 and a2
    """
    # rows whose allele pair is currently in reverse alphabetical order
    needs_swap = sumstats_df["a1"] > sumstats_df["a2"]

    # snapshots both allele columns for those rows before either one is overwritten
    former_a1 = sumstats_df.loc[needs_swap, "a1"].copy()
    former_a2 = sumstats_df.loc[needs_swap, "a2"].copy()
    sumstats_df.loc[needs_swap, "a1"] = former_a2
    sumstats_df.loc[needs_swap, "a2"] = former_a1

    # swapping the alleles reverses the direction of the effect and complements the frequency
    sumstats_df.loc[needs_swap, "beta"] = -sumstats_df.loc[needs_swap, "beta"]
    if "AF" in sumstats_df.columns:
        sumstats_df.loc[needs_swap, "AF"] = 1 - sumstats_df.loc[needs_swap, "AF"]

    return sumstats_df
    

## African Data

Take a look at the summary statistics format.

In [15]:
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_*_2024.txt

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_01_2024.txt <==
SNPID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs12562034 1 768448 A G 0.0802 0.0101 0.0289 0.7275 8115

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_05_2024.txt <==
SNPID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs12562034 1 768448 A G 0.0802 0.0101 0.0289 0.7275 8115

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_MEGA_01_2024.txt <==
SNPID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs139221807 1 746189 A G 0.7337 -0.0122 0.018 0.4955 7898

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_MEGA_05_2024.txt <==
SNPID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs139221807 1 746189 A G 0.7337 -0.0122 0.018 0.4955 7898


Modify the format of these summary statistics to be compatible as popcorn input.

In [19]:
def get_popcorn_input_from_afr_meta_sumstats(sumstats_dir, out_dir, use_tot_samples=False):
    """
    Converts an African summary statistics file into a popcorn input file.

    sumstats_dir: path of the African summary statistics file (whitespace-delimited with a header row)
    out_dir: path to which the popcorn input will be written (tab-delimited)
    use_tot_samples: if True, N is computed as N_case + N_control instead of being taken from N_eff;
                     the input file must then contain N_case and N_control columns (KeyError otherwise)

    The output file has the columns rsid, a1, a2, beta, SE, N, AF with the alleles sorted by sort_alleles.
    """
    print(f"get_popcorn_input_from_afr_meta_sumstats: reading sumstats from {sumstats_dir}")
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    sumstats_df = pd.read_csv(sumstats_dir, sep=r"\s+")

    # renames columns to conform to popcorn input format
    # the variant id column is headed SNPID in the *_HM3_* files and ID in the *_sumdata files
    print("get_popcorn_input_from_afr_meta_sumstats: processing summary statistics")
    rsid_hdr = "SNPID" if "SNPID" in sumstats_df.columns else "ID"
    col_rename_dict = {
        rsid_hdr: "rsid",
        "CHR": "chr",
        "POS": "pos",
        "non_effect_allele": "a1",
        "effect_allele": "a2",
        "BETA": "beta",
        "Freq_effect": "AF",
        "N_eff": "N",
    }
    sumstats_df = sumstats_df.rename(columns=col_rename_dict)

    if use_tot_samples:     # calculates N summing number of cases and controls if option specified
        sumstats_df["N"] = sumstats_df["N_case"] + sumstats_df["N_control"]

    sumstats_df = sort_alleles(sumstats_df)      # sorts alleles
    out_cols = ["rsid", "a1", "a2", "beta", "SE", "N", "AF"]

    # writes file in popcorn input format
    print(f"get_popcorn_input_from_afr_meta_sumstats: writing sumstats to {out_dir}")
    sumstats_df[out_cols].to_csv(out_dir, sep='\t', index=False)

In [22]:
subtype = "Overall"
subsets = ["HM3", "HM3_MEGA"]
mafs = ["01", "05"]
sumstats_pref_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update"

# writes one popcorn input file per (maf, subset) pair; maf varies slowest, as in a nested loop
for maf, subset in [(m, s) for m in mafs for s in subsets]:
    afr_meta_sumstats_dir = f"{sumstats_pref_dir}/AA_{subtype}_{subset}_{maf}_2024.txt"
    get_popcorn_input_from_afr_meta_sumstats(afr_meta_sumstats_dir, f"AFR_{subtype}_{subset}_{maf}.sumstats.txt")

get_popcorn_input_from_afr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_01_2024.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_afr_meta_sumstats: writing sumstats to AFR_Overall_HM3_01.sumstats.txt
get_popcorn_input_from_afr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_MEGA_01_2024.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_afr_meta_sumstats: writing sumstats to AFR_Overall_HM3_MEGA_01.sumstats.txt
get_popcorn_input_from_afr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_Overall_HM3_05_2024.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_afr_meta_sumstats: writing sumstats to AFR_Overall_HM3_05.sumstats.txt
get_popcorn_inp

Take a look at the African summary statistics cleaned for popcorn.

In [12]:
!head -3 AFR_*.sumstats.txt

==> AFR_Overall_HM3_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs12562034	A	G	-0.0101	0.0289	8115	0.9198
rs4040617	A	G	-0.0132	0.0161	8062	0.3963

==> AFR_Overall_HM3_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs12562034	A	G	-0.0101	0.0289	8115	0.9198
rs4040617	A	G	-0.0132	0.0161	8062	0.3963

==> AFR_Overall_HM3_MEGA_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs139221807	A	G	0.0122	0.018	7898	0.2663
rs114339855	G	T	-0.0664	0.0636	7483	0.9832

==> AFR_Overall_HM3_MEGA_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs139221807	A	G	0.0122	0.018	7898	0.2663
rs12562034	A	G	-0.0101	0.0289	8115	0.9198


In [23]:
!dx upload AFR_*.sumstats.txt    # uploads the popcorn input formatted file

ID                          file-GkkqGpQ4BzFX1z65JX25yPP1
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        AFR_Overall_HM3_01.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Thu Jun 20 04:56:10 2024
Created by                  omjahagirdar
 via the job                job-GkkpqGj4BzFZ8yX5gKpB2xqb
Last modified               Thu Jun 20 04:56:12 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-GkkqGq04BzFX1z65JX25yPP3
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7

## American Data

Take a look at the summary statistics format.

In [17]:
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_*.txt

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_01.txt <==
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff SNP
1_100000827_C_T T C 0.3167 -0.0338670695115773 0.04056 0.4041 1404 rs6678176

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_05.txt <==
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff SNP
1_100000827_C_T T C 0.3167 -0.0338670695115773 0.04056 0.4041 1404 rs6678176

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_MEGA_01.txt <==
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff SNP
1_100000827_C_T T C 0.3167 -0.0338670695115773 0.04056 0.4041 1404 rs6678176

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_MEGA_05.txt <==
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff SNP
1_100000827_C_T T C 0.3167 -0.033867

Modify the format of these summary statistics to be compatible as popcorn input.

In [29]:
def get_popcorn_input_from_amr_meta_sumstats(sumstats_dir, out_dir, use_tot_samples=False):
    """
    Converts a Latina (AMR) summary statistics file into a popcorn input file.

    sumstats_dir: path of the Latina summary statistics file (whitespace-delimited with a header row)
    out_dir: path to which the popcorn input will be written (tab-delimited)
    use_tot_samples: if True, N is set for every variant to the total number of samples
                     (2,396 cases + 7,468 controls) instead of being taken from N_eff

    The output file has the columns rsid, a1, a2, beta, SE, N, AF with the alleles sorted by sort_alleles.
    """
    print(f"get_popcorn_input_from_amr_meta_sumstats: reading sumstats from {sumstats_dir}")
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    sumstats_df = pd.read_csv(sumstats_dir, sep=r"\s+")

    # renames columns to conform to popcorn input format
    # unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff SNP
    print("get_popcorn_input_from_amr_meta_sumstats: processing summary statistics")
    col_rename_dict = {
        "SNP": "rsid",
        "non_effect_allele": "a1",
        "effect_allele": "a2",
        "BETA": "beta",
        "Freq_effect": "AF",
        "N_eff": "N",
    }
    sumstats_df = sumstats_df.rename(columns=col_rename_dict)
    sumstats_df = sort_alleles(sumstats_df)      # sorts alleles
    out_cols = ["rsid", "a1", "a2", "beta", "SE", "N", "AF"]

    # writes file in popcorn input format
    print(f"get_popcorn_input_from_amr_meta_sumstats: writing sumstats to {out_dir}")

    # explicit copy so that the N assignment below acts on an independent frame
    out_df = sumstats_df[out_cols].copy()
    if use_tot_samples:
        n_cases, n_controls = 2396, 7468    # 2,396 cases & 7,468 controls
        out_df["N"] = n_cases + n_controls
    out_df.to_csv(out_dir, sep='\t', index=False)

In [30]:
subtype = "Overall"
subsets = ["HM3", "HM3_MEGA"]
mafs = ["01", "05"]
sumstats_pref_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update"

# writes one popcorn input file per (maf, subset) pair; maf varies slowest, as in a nested loop
for maf, subset in [(m, s) for m in mafs for s in subsets]:
    amr_meta_sumstats_dir = f"{sumstats_pref_dir}/Latina_{subtype}_{subset}_{maf}.txt"
    get_popcorn_input_from_amr_meta_sumstats(amr_meta_sumstats_dir, f"AMR_{subtype}_{subset}_{maf}.sumstats.txt")

get_popcorn_input_from_amr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_01.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_amr_meta_sumstats: writing sumstats to AMR_Overall_HM3_01.sumstats.txt
get_popcorn_input_from_amr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_MEGA_01.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_amr_meta_sumstats: writing sumstats to AMR_Overall_HM3_MEGA_01.sumstats.txt
get_popcorn_input_from_amr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/Latina_Overall_HM3_05.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_amr_meta_sumstats: writing sumstats to AMR_Overall_HM3_05.sumstats.txt
get_popcorn_input_

Take a look at the American summary statistics cleaned for popcorn.

In [36]:
!head -3 AMR_*.sumstats.txt

==> AMR_Overall_HM3_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs6678176	C	T	-0.0338670695115773	0.04056	1404	0.3167
rs12069019	A	G	0.0365602553118024	0.04485	1476	0.7857000000000001

==> AMR_Overall_HM3_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs6678176	C	T	-0.0338670695115773	0.04056	1404	0.3167
rs12069019	A	G	0.0365602553118024	0.04485	1476	0.7857000000000001

==> AMR_Overall_HM3_MEGA_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs6678176	C	T	-0.0338670695115773	0.04056	1404	0.3167
rs12069019	A	G	0.0365602553118024	0.04485	1476	0.7857000000000001

==> AMR_Overall_HM3_MEGA_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N	AF
rs6678176	C	T	-0.0338670695115773	0.04056	1404	0.3167
rs12069019	A	G	0.0365602553118024	0.04485	1476	0.7857000000000001


In [37]:
!dx upload AMR_*.sumstats.txt    # uploads the popcorn input formatted file

ID                          file-GkkqVP04BzFQgPGBkVP871Z4
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        AMR_Overall_HM3_01.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Thu Jun 20 05:06:04 2024
Created by                  omjahagirdar
 via the job                job-GkkpqGj4BzFZ8yX5gKpB2xqb
Last modified               Thu Jun 20 05:06:06 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-GkkqVPQ4BzFQgPGBkVP871Z6
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7

## European Data

Take a look at the summary statistics format.

In [40]:
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EUR_Overall*.txt

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EUR_Overall_HM3_01.txt <==
var_name SNP_ID CHR POS effect_allele_iCOGs non_effect_allele_iCOGs Freq_effect_iCOGs Imputation_r2_iCOGs BETA_iCOGs SE_iCOGs P_iCOGs effect_allele_Onco non_effect_allele_Onco Freq_effect_Onco Imputation_r2_Onco BETA_Onco SE_Onco P_Onco effect_allele_meta non_effect_allele_meta BETA_meta SE_meta P_meta N_eff_meta
1_918573_A_G rs2341354 1 918573 G A 0.58874 0.510991 -0.00761739 0.0148852 0.608832 G A 0.582867 1 -0.00447053 0.00796232 0.574483 G A -0.0051706315 0.0070209595 0.46145291 41757

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EUR_Overall_HM3_05.txt <==
var_name SNP_ID CHR POS effect_allele_iCOGs non_effect_allele_iCOGs Freq_effect_iCOGs Imputation_r2_iCOGs BETA_iCOGs SE_iCOGs P_iCOGs effect_allele_Onco non_effect_allele_Onco Freq_effect_Onco Imputation_r2_Onco BETA_Onco SE_Onco P_Onco effect_allele_meta non_effect_allele_meta BETA_meta SE_meta P_meta N_e

Modify the format of these summary statistics to be compatible as popcorn input.

In [42]:
def get_popcorn_input_from_eur_meta_sumstats(sumstats_dir, out_dir, overall=True, use_tot_samples=False):
    """
    Converts a European meta-analysis summary statistics file into a popcorn input file.

    sumstats_dir: path of the Zhang2020 summary statistics file (whitespace-delimited with a header row)
    out_dir: path to which the popcorn input will be written (tab-delimited)
    overall: Should be set True (default) if summary statistics are overall (ER+ and ER- combined), False otherwise;
             selects which input headers are used for the rsid and position columns
    use_tot_samples: if True, N is set for every variant to the total number of samples
                     (118,474 cases + 96,201 controls) instead of being taken from N_eff_meta

    The output file has the columns rsid, a1, a2, beta, SE, N with the alleles sorted by sort_alleles.
    """
    print(f"get_popcorn_input_from_eur_meta_sumstats: reading sumstats from {sumstats_dir}")
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    sumstats_df = pd.read_csv(sumstats_dir, sep=r"\s+")

    # renames columns to conform to popcorn input format
    # only the *_meta allele/beta/SE/N columns are used; the per-study (iCOGs, Onco) columns are ignored
    print("get_popcorn_input_from_eur_meta_sumstats: processing summary statistics")
    pos_hdr = "POS" if overall else "position_b37"     # accounts for different input column header if not overall summary statistics
    rsid_hdr = "SNP_ID" if overall else "phase3_1kg_id"
    col_rename_dict = {
        rsid_hdr: "rsid",
        "CHR": "chr",
        pos_hdr: "pos",
        "non_effect_allele_meta": "a1",
        "effect_allele_meta": "a2",
        "BETA_meta": "beta",
        "SE_meta": "SE",
        "N_eff_meta": "N",
    }
    sumstats_df = sumstats_df.rename(columns=col_rename_dict)

    sumstats_df = sort_alleles(sumstats_df)      # sorts alleles

    out_cols = ["rsid", "a1", "a2", "beta", "SE", "N"]

    # writes file in popcorn input format
    print(f"get_popcorn_input_from_eur_meta_sumstats: writing sumstats to {out_dir}")
    # explicit copy so that the N assignment below acts on an independent frame
    out_df = sumstats_df[out_cols].copy()
    if use_tot_samples:
        n_cases, n_controls = 118474, 96201    # 118,474 cases & 96,201 controls
        out_df["N"] = n_cases + n_controls
    out_df.to_csv(out_dir, sep='\t', index=False)

In [44]:
subtype = "Overall"
subsets = ["HM3", "HM3_MEGA"]
mafs = ["01", "05"]
sumstats_pref_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update"

# writes one popcorn input file per (maf, subset) pair; maf varies slowest, as in a nested loop
for maf, subset in [(m, s) for m in mafs for s in subsets]:
    eur_meta_sumstats_dir = f"{sumstats_pref_dir}/EUR_{subtype}_{subset}_{maf}.txt"
    get_popcorn_input_from_eur_meta_sumstats(eur_meta_sumstats_dir, f"EUR_{subtype}_{subset}_{maf}.sumstats.txt")

get_popcorn_input_from_eur_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EUR_Overall_HM3_01.txt
get_popcorn_input_from_eur_meta_sumstats: processing summary statistics
get_popcorn_input_from_eur_meta_sumstats: writing sumstats to EUR_Overall_HM3_01.sumstats.txt
              rsid a1 a2      beta        SE      N
0        rs2341354  A  G -0.005171  0.007021  41757
1        rs4970393  A  G  0.015403  0.007021  41443
2        rs4275402  C  T -0.012847  0.007340  43605
3        rs2465136  C  T -0.013312  0.007305  44284
4        rs7526076  A  G -0.017056  0.007738  43794
...            ... .. ..       ...       ...    ...
1066133   rs739365  C  T -0.007319  0.007380  42690
1066134  rs6009951  C  T -0.009663  0.006943  42494
1066135  rs6010061  C  T -0.008774  0.006953  42391
1066136  rs6010063  A  G  0.003973  0.006972  41452
1066137    rs10451  A  G -0.002207  0.007532  41630

[1066138 rows x 6 columns]
get_popcorn_input_from_eur_

Take a look at the European summary statistics cleaned for popcorn.

In [46]:
!head -3 EUR_Overall*.sumstats.txt

==> EUR_Overall_HM3_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs2341354	A	G	-0.0051706315	0.0070209595	41757
rs4970393	A	G	0.015403406	0.0070211696	41443

==> EUR_Overall_HM3_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs2341354	A	G	-0.0051706315	0.0070209595	41757
rs4275402	C	T	-0.012846936	0.0073402916	43605

==> EUR_Overall_HM3_MEGA_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs1110052	G	T	0.0029611685	0.0078006487	38992
rs2272756	A	G	0.0041584471	0.0082890747	39322

==> EUR_Overall_HM3_MEGA_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs6694632	A	G	0.0052287659	0.0070854109	40995
rs13303118	G	T	-0.0034068035	0.0070468796	41115


In [47]:
!dx upload EUR_Overall*.sumstats.txt

ID                          file-GkkqgF04BzFpYKZGjgQJ5gxF
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EUR_Overall_HM3_01.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Thu Jun 20 05:18:36 2024
Created by                  omjahagirdar
 via the job                job-GkkpqGj4BzFZ8yX5gKpB2xqb
Last modified               Thu Jun 20 05:18:37 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-GkkqgF84BzFpYKZGjgQJ5gxP
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7

## East Asian Data

Take a look at the summary statistics format.

In [49]:
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall*_05.txt

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall_HM3_05.txt <==
unique_SNP_id effect_allele_BBJ non_effect_allele_BBJ Freq_effect_BBJ BETA_BBJ SE_BBJ P_BBJ N_eff_BBJ SNPID effect_allele_BCAC non_effect_allele_BCAC Freq_effect_BCAC BETA_BCAC SE_BCAC P_BCAC N_eff_BCAC flip_sign w_BBJ w_BCAC BETA_meta SE_meta P_meta effect_allele_meta non_effect_allele_meta N_eff_meta
1_990280_C_T T C 0.833809625181585 0.0523785452990634 0.025953705216784 0.043575340330456 5356 rs4275402 T C 0.8007 0.0196 0.0239 0.4122 5485.2581986682 FALSE 1484.57199863154 1750.66963113391 0.0346412599971375 0.01758112407433 0.0487963684377109 T C 10841.2581986682

==> /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall_HM3_MEGA_05.txt <==
unique_SNP_id effect_allele_BBJ non_effect_allele_BBJ Freq_effect_BBJ BETA_BBJ SE_BBJ P_BBJ N_eff_BBJ SNPID effect_allele_BCAC non_effect_allele_BCAC Freq_effect_BCAC BETA_BCAC SE_BCAC P_BCAC N_eff_BCAC flip_sign w_BBJ w_BC

Modify the format of these summary statistics to be compatible as popcorn input.

In [53]:
def get_popcorn_input_from_eas_meta_sumstats(sumstats_dir, out_dir, overall=True, use_tot_samples=False):
    """
    Converts an East Asian summary statistics file into a popcorn input file.

    sumstats_dir: path of the BCAC summary statistics file (whitespace-delimited with a header row)
    out_dir: path to which the popcorn input will be written (tab-delimited)
    overall: if True (default), the "_meta"-suffixed allele/beta/SE/N columns are used and rsid is read from SNPID;
             if False, the unsuffixed columns are used and rsid is read from unique_SNP_id
    use_tot_samples: if True, N is set for every variant to the total number of samples
                     (20,393 cases + 86,329 controls) instead of being taken from the N_eff column

    The output file has the columns rsid, a1, a2, beta, SE, N with the alleles sorted by sort_alleles.
    """
    print(f"get_popcorn_input_from_eas_meta_sumstats: reading sumstats from {sumstats_dir}")
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    sumstats_df = pd.read_csv(sumstats_dir, sep=r"\s+")

    # renames columns to conform to popcorn input format
    # the per-study (BBJ, BCAC) columns of the overall files are ignored
    print("get_popcorn_input_from_eas_meta_sumstats: processing summary statistics")

    mod = "_meta" if overall else ""
    rsid_hdr = "SNPID" if overall else "unique_SNP_id"
    col_rename_dict = {
        rsid_hdr: "rsid",
        f"non_effect_allele{mod}": "a1",
        f"effect_allele{mod}": "a2",
        f"BETA{mod}": "beta",
        f"SE{mod}": "SE",
        f"N_eff{mod}": "N",
    }
    sumstats_df = sumstats_df.rename(columns=col_rename_dict)

    sumstats_df = sort_alleles(sumstats_df)      # sorts alleles

    out_cols = ["rsid", "a1", "a2", "beta", "SE", "N"]

    # writes file in popcorn input format
    print(f"get_popcorn_input_from_eas_meta_sumstats: writing sumstats to {out_dir}")
    # explicit copy so that the N assignment below acts on an independent frame
    out_df = sumstats_df[out_cols].copy()
    if use_tot_samples:
        n_cases, n_controls = 20393, 86329    # 20,393 cases & 86,329 controls
        out_df["N"] = n_cases + n_controls
    out_df.to_csv(out_dir, sep='\t', index=False)

In [54]:
subtype = "Overall"
subsets = ["HM3", "HM3_MEGA"]
mafs = ["01", "05"]
sumstats_pref_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update"

# writes one popcorn input file per (maf, subset) pair; maf varies slowest, as in a nested loop
for maf, subset in [(m, s) for m in mafs for s in subsets]:
    eas_meta_sumstats_dir = f"{sumstats_pref_dir}/EAS_{subtype}_{subset}_{maf}.txt"
    get_popcorn_input_from_eas_meta_sumstats(eas_meta_sumstats_dir, f"EAS_{subtype}_{subset}_{maf}.sumstats.txt")

get_popcorn_input_from_eas_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall_HM3_01.txt
get_popcorn_input_from_eas_meta_sumstats: processing summary statistics
get_popcorn_input_from_eas_meta_sumstats: writing sumstats to EAS_Overall_HM3_01.sumstats.txt
get_popcorn_input_from_eas_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall_HM3_MEGA_01.txt
get_popcorn_input_from_eas_meta_sumstats: processing summary statistics
get_popcorn_input_from_eas_meta_sumstats: writing sumstats to EAS_Overall_HM3_MEGA_01.sumstats.txt
get_popcorn_input_from_eas_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_Overall_HM3_05.txt
get_popcorn_input_from_eas_meta_sumstats: processing summary statistics
get_popcorn_input_from_eas_meta_sumstats: writing sumstats to EAS_Overall_HM3_05.sumstats.txt
get_popcorn_input_from_eas_

Take a look at the East Asian summary statistics cleaned for popcorn.

In [58]:
!head -3 EAS_Overall*.sumstats.txt

==> EAS_Overall_HM3_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs13302982	A	G	-0.0078609494720791	0.0142257213848891	9989.23374776997
rs6694632	A	G	-0.0139440350955235	0.0182257048598489	10007.9563879244

==> EAS_Overall_HM3_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs4275402	C	T	0.0346412599971375	0.01758112407433	10841.2581986682
rs2465136	C	T	0.0357764269110665	0.0175888386619948	11031.0345278835

==> EAS_Overall_HM3_MEGA_01.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs4475691	C	T	-0.0069842591179897	0.0210142673079917	9775.87833847139
rs13302982	A	G	-0.0078609494720791	0.0142257213848891	9989.23374776997

==> EAS_Overall_HM3_MEGA_05.sumstats.txt <==
rsid	a1	a2	beta	SE	N
rs13302982	A	G	-0.0078609494720791	0.0142257213848891	9989.23374776997
rs6694632	A	G	-0.0139440350955235	0.0182257048598489	10007.9563879244


In [59]:
!dx upload EAS_Overall*.sumstats.txt    # uploads the popcorn input formatted file

ID                          file-Gkkv0q84BzFy6k7bJvpB0yZK
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EAS_Overall_HM3_01.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Thu Jun 20 05:38:53 2024
Created by                  omjahagirdar
 via the job                job-GkkpqGj4BzFZ8yX5gKpB2xqb
Last modified               Thu Jun 20 05:38:55 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-Gkkv0v04BzFxP01gBY18j8X0
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7

In [27]:
! dx ls

AFR.sumstats.txt
AFR_meta.sumstats.txt
AFR_meta_ERneg.sumstats.txt
AFR_meta_ERneg_totsamples.sumstats.txt
AFR_meta_ERneg_update.sumstats.txt
AFR_meta_ERpos.sumstats.txt
AFR_meta_ERpos_totsamples.sumstats.txt
AFR_meta_ERpos_update.sumstats.txt
AFR_meta_update.sumstats.txt
AMR_meta_update.sumstats.txt
EAS.sumstats.txt
EAS_meta.sumstats.txt
EAS_meta_ERneg_update.sumstats.txt
EAS_meta_ERpos_update.sumstats.txt
EAS_meta_update.sumstats.txt
EUR.sumstats.txt
EUR_meta.sumstats.txt
EUR_meta_ERneg.sumstats.txt
EUR_meta_ERneg_totsamples.sumstats.txt
EUR_meta_ERneg_update.sumstats.txt
EUR_meta_ERpos.sumstats.txt
EUR_meta_ERpos_totsamples.sumstats.txt
EUR_meta_ERpos_update.sumstats.txt
EUR_meta_update.sumstats.txt


# Prepare Population ER+/ER- Breast Cancer Meta Summary Statistics Data

## African Data

Take a look at the summary statistics format and check that it is the same as the overall summary statistics format.

In [36]:
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_overall_sumdata.txt
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERneg_sumdata.txt
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERpos_sumdata.txt

ID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P imputation_r2 N_case N_control N_eff
ID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P imputation_r2 N_case N_control N_eff
ID CHR POS effect_allele non_effect_allele Freq_effect BETA SE P imputation_r2 N_case N_control N_eff


Modify the format of these summary statistics to be compatible as popcorn input.

In [37]:
# converts the African ER-negative summary statistics file into a popcorn input file in the working directory
afr_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERneg_sumdata.txt"
get_popcorn_input_from_afr_meta_sumstats(afr_meta_sumstats_dir, "AFR_meta_ERneg_update.sumstats.txt")

get_popcorn_input_from_afr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERneg_sumdata.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_afr_meta_sumstats: writing sumstats to AFR_meta_ERneg_update.sumstats.txt
               rsid a1 a2      beta        SE     N        AF
0         rs4475691  C  T -0.068743  0.033384  1983  0.345812
1         rs1806509  A  C -0.034060  0.034301  1876  0.653508
2         rs7537756  A  G -0.021939  0.036142  1973  0.263261
3        rs13302982  A  G -0.043512  0.032819  1927  0.403980
4         rs4040604  G  T -0.051397  0.033055  1908  0.398978
...             ... .. ..       ...       ...   ...       ...
1062096   rs3750508  C  T -0.049347  0.049955  1871  0.121927
1062097   rs3750510  C  T -0.049845  0.043474  1888  0.831502
1062098   rs9410163  A  G -0.016093  0.041368  1862  0.194882
1062099   rs9777369  C  T -0.017299  0.042496  1845  0.183

Take a look at the African summary statistics cleaned for popcorn.

In [38]:
!head "AFR_meta_ERneg_update.sumstats.txt"

rsid	a1	a2	beta	SE	N	AF
rs4475691	C	T	-0.06874344	0.03338449	1983	0.34581174
rs1806509	A	C	-0.03406019	0.03430095	1876	0.6535084600000001
rs7537756	A	G	-0.02193885	0.03614214	1973	0.26326103
rs13302982	A	G	-0.04351242	0.03281877	1927	0.40398048
rs4040604	G	T	-0.05139747	0.03305516	1908	0.39897812
rs2340587	A	G	0.01778565	0.03618286	1913	0.72454343
rs1110052	G	T	0.02785722	0.03650743	1934	0.26325173
rs7523549	C	T	0.08069767	0.04020642	1941	0.19883097
rs3748592	A	G	-0.02492948	0.05687787	1875	0.90937778


In [39]:
!dx upload "AFR_meta_ERneg_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKKvYQ4BzFvBzgV5Q71ZVxg
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        AFR_meta_ERneg_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:02:22 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:02:24 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


In [40]:
# Convert the African (AA) ER-positive summary statistics into the Popcorn input format.
# NOTE: despite the `_dir` suffix, this variable holds the path to a single file, not a directory.
afr_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERpos_sumdata.txt"
# The second argument is the name of the output file (the log reports "writing sumstats to" this name).
get_popcorn_input_from_afr_meta_sumstats(afr_meta_sumstats_dir, "AFR_meta_ERpos_update.sumstats.txt")

get_popcorn_input_from_afr_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/AA_ERpos_sumdata.txt
get_popcorn_input_from_afr_meta_sumstats: processing summary statistics
get_popcorn_input_from_afr_meta_sumstats: writing sumstats to AFR_meta_ERpos_update.sumstats.txt
               rsid a1 a2      beta        SE     N        AF
0         rs4475691  C  T -0.025944  0.027916  2836  0.345812
1         rs1806509  A  C  0.023999  0.028806  2661  0.653508
2         rs7537756  A  G  0.032948  0.030306  2806  0.263261
3        rs13302982  A  G  0.004132  0.027640  2718  0.403980
4         rs4040604  G  T  0.005866  0.027788  2700  0.398978
...             ... .. ..       ...       ...   ...       ...
1061806   rs2606357  C  T  0.006050  0.037127  2719  0.158493
1061807   rs2739260  A  G -0.014554  0.041072  2748  0.122978
1061808   rs3750508  C  T -0.038805  0.040980  2780  0.121927
1061809   rs3750509  C  T  0.023409  0.041907  2663  0.121

Preview the African ER-positive summary statistics after formatting for Popcorn.

In [41]:
!head "AFR_meta_ERpos_update.sumstats.txt"

rsid	a1	a2	beta	SE	N	AF
rs4475691	C	T	-0.02594404	0.02791617	2836	0.34581174
rs1806509	A	C	0.0239995	0.02880601	2661	0.6535084600000001
rs7537756	A	G	0.03294794	0.03030599	2806	0.26326103
rs13302982	A	G	0.00413159	0.02763964	2718	0.40398048
rs4040604	G	T	0.00586597	0.02778764	2700	0.39897812
rs2340587	A	G	-0.01764364	0.03000101	2783	0.72454343
rs28576697	C	T	0.03075849	0.0563261	2847	0.94120396
rs1110052	G	T	0.01598663	0.03049234	2772	0.26325173
rs3748593	A	C	0.00064841	0.03607573	2626	0.82206141


In [42]:
!dx upload "AFR_meta_ERpos_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKKx6j4BzFfZfy0gk83QVBP
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        AFR_meta_ERpos_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:03:39 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:03:42 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


## European Data

Take a look at the summary statistics format. The overall file and the ER-specific files use different column names, so the conversion function takes a boolean `overall` parameter to account for the differences.

In [44]:
# Print only the header row of each European file to compare column names:
# the first file has no ER label in its name (presumably the overall analysis), followed by ER-negative and ER-positive.
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_BCAC_icogs_onco_sumdata.txt
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERneg_BCAC_icogs_onco_sumdata.txt
!head -1 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERpos_BCAC_icogs_onco_sumdata.txt

var_name SNP_ID CHR POS effect_allele_iCOGs non_effect_allele_iCOGs Freq_effect_iCOGs Imputation_r2_iCOGs BETA_iCOGs SE_iCOGs P_iCOGs effect_allele_Onco non_effect_allele_Onco Freq_effect_Onco Imputation_r2_Onco BETA_Onco SE_Onco P_Onco effect_allele_meta non_effect_allele_meta BETA_meta SE_meta P_meta N_eff_meta
var_name phase3_1kg_id chr position_b37 effect_allele_meta non_effect_allele_meta Freq_effect_iCOGs Imputation_r2_iCOGs BETA_iCOGs SE_iCOGs P_iCOGs Freq_effect_Onco Imputation_r2_Onco BETA_Onco SE_Onco P_Onco BETA_meta SE_meta P_meta N_eff_meta
var_name phase3_1kg_id chr position_b37 effect_allele_meta non_effect_allele_meta Freq_effect_iCOGs Imputation_r2_iCOGs BETA_iCOGs SE_iCOGs P_iCOGs Freq_effect_Onco Imputation_r2_Onco BETA_Onco SE_Onco P_Onco BETA_meta SE_meta P_meta N_eff_meta


Convert the European ER-negative meta-analysis summary statistics into the Popcorn input format.

In [45]:
# Convert the European ER-negative meta-analysis summary statistics into the Popcorn input format.
# NOTE: despite the `_dir` suffix, this variable holds the path to a single file, not a directory.
eur_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERneg_BCAC_icogs_onco_sumdata.txt"
# overall=False: this is an ER-specific file, whose header differs from the overall file's header.
get_popcorn_input_from_eur_meta_sumstats(eur_meta_sumstats_dir, "EUR_meta_ERneg_update.sumstats.txt", overall=False)

get_popcorn_input_from_eur_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERneg_BCAC_icogs_onco_sumdata.txt
get_popcorn_input_from_eur_meta_sumstats: processing summary statistics
get_popcorn_input_from_eur_meta_sumstats: writing sumstats to EUR_meta_ERneg_update.sumstats.txt
              rsid a1 a2     beta       SE      N
0        rs2341354  A  G  0.01615  0.01407  10406
1        rs9777703  C  T -0.04529  0.03408  10528
2        rs4970393  A  G -0.01325  0.01387  10619
3        rs4275402  C  T -0.00991  0.01457  11026
4        rs2465136  C  T -0.00840  0.01451  11184
...            ... .. ..      ...      ...    ...
1066593   rs736334  C  T  0.00888  0.01589  11846
1066594   rs739365  C  T -0.01282  0.01463  10860
1066595  rs6009951  C  T -0.02541  0.01354  11181
1066596  rs6010061  C  T -0.02503  0.01356  11160
1066597  rs2285395  A  G -0.02040  0.02989  10820

[1066598 rows x 6 columns]


Preview the European ER-negative summary statistics after formatting for Popcorn.

In [46]:
!head "EUR_meta_ERneg_update.sumstats.txt"

rsid	a1	a2	beta	SE	N
rs2341354	A	G	0.01615	0.01407	10406
rs9777703	C	T	-0.04529	0.03408	10528
rs4970393	A	G	-0.01325	0.01387	10619
rs4275402	C	T	-0.00991	0.01457	11026
rs2465136	C	T	-0.0084	0.01451	11184
rs7526076	A	G	0.00583	0.01548	10991
rs4075116	C	T	0.0048	0.01544	10599
rs3766192	C	T	0.00467	0.01313	11690
rs9442372	A	G	0.00527	0.01298	12037


In [47]:
!dx upload "EUR_meta_ERneg_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKKz3Q4BzFzYkYvvj6vkV1q
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EUR_meta_ERneg_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:07:42 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:07:45 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


In [48]:
# Convert the European ER-positive meta-analysis summary statistics into the Popcorn input format.
# NOTE: this reassigns the same notebook-level variable used for the ER-negative file; it holds a file path, not a directory.
eur_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERpos_BCAC_icogs_onco_sumdata.txt"
# overall=False: this is an ER-specific file, whose header differs from the overall file's header.
get_popcorn_input_from_eur_meta_sumstats(eur_meta_sumstats_dir, "EUR_meta_ERpos_update.sumstats.txt", overall=False)

get_popcorn_input_from_eur_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/European_ERpos_BCAC_icogs_onco_sumdata.txt
get_popcorn_input_from_eur_meta_sumstats: processing summary statistics
get_popcorn_input_from_eur_meta_sumstats: writing sumstats to EUR_meta_ERpos_update.sumstats.txt
              rsid a1 a2     beta       SE      N
0        rs2341354  A  G -0.00763  0.00866  27438
1        rs4970393  A  G  0.01730  0.00850  28244
2        rs4275402  C  T -0.00873  0.00896  29160
3        rs2465136  C  T -0.00987  0.00892  29597
4        rs7526076  A  G -0.01955  0.00951  29153
...            ... .. ..      ...      ...    ...
1066598   rs736334  C  T  0.00703  0.00973  31575
1066599   rs739365  C  T  0.00227  0.00896  28926
1066600  rs6009951  C  T -0.00255  0.00829  29860
1066601  rs6010061  C  T -0.00069  0.00830  29807
1066602  rs2285395  A  G  0.00095  0.01854  28058

[1066603 rows x 6 columns]


Preview the European ER-positive summary statistics after formatting for Popcorn.

In [49]:
!head "EUR_meta_ERpos_update.sumstats.txt"

rsid	a1	a2	beta	SE	N
rs2341354	A	G	-0.00763	0.00866	27438
rs4970393	A	G	0.0173	0.0085	28244
rs4275402	C	T	-0.00873	0.00896	29160
rs2465136	C	T	-0.00987	0.00892	29597
rs7526076	A	G	-0.01955	0.00951	29153
rs4075116	C	T	-0.01917	0.00949	28066
rs3766192	C	T	-0.01175	0.00804	31161
rs9442372	A	G	-0.0122	0.00794	32147
rs6687776	C	T	0.0167	0.01123	27567


In [50]:
!dx upload "EUR_meta_ERpos_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKKzZQ4BzFqPxyqYP5Kqp4Y
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EUR_meta_ERpos_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:08:50 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:08:53 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


## East Asian Data

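Take a look at the summary statistics format. The overall (BCAC + BBJ meta-analysis) file and the ER-specific files use different column names, so the conversion function takes a boolean `overall` parameter to account for the differences.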

In [53]:
# Print the header row and the first data row of each East Asian file to compare layouts:
# the BCAC + BBJ meta-analysis file, then the ER-negative and ER-positive files.
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_BBJ_meta_sumdata.txt
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erneg_sumdata.txt
!head -2 /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erpos_sumdata.txt

unique_SNP_id effect_allele_BBJ non_effect_allele_BBJ Freq_effect_BBJ BETA_BBJ SE_BBJ P_BBJ N_eff_BBJ SNPID effect_allele_BCAC non_effect_allele_BCAC Freq_effect_BCAC BETA_BCAC SE_BCAC P_BCAC N_eff_BCAC flip_sign w_BBJ w_BCAC BETA_meta SE_meta P_meta effect_allele_meta non_effect_allele_meta N_eff_meta
1_861808_A_G G A 0.539864104057465 -0.0156922091431444 0.0191867387314715 0.41343287651503 5467 rs13302982 A G 0.4368 0.0017 0.0212 0.9365 4522.23374776997 TRUE 2716.42473583977 2224.9911000356 -0.00786094947207912 0.0142257213848891 0.580546208763636 G A 9989.23374776997
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs16851691 A G 0.0251 -0.0265 0.0858 0.7576 2775.63040848111
unique_SNP_id effect_allele non_effect_allele Freq_effect BETA SE P N_eff
rs16851691 A G 0.0252 0.1082 0.0622 0.08189 5261.05901759297


In [57]:
# Convert the East Asian ER-negative summary statistics into the Popcorn input format.
# NOTE: despite the `_dir` suffix, this variable holds the path to a single file, not a directory.
eas_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erneg_sumdata.txt"
# overall=False: this is an ER-specific file, whose header differs from the overall meta-analysis file's header.
# NOTE(review): for rs16851691 the raw file lists BETA = -0.0265 with effect allele A, while the printed
# output reports beta = 0.0265 with a1 = A; confirm the sign/allele convention applied by the function is intended.
# NOTE(review): N in the output is non-integer (it appears to be the raw N_eff value); confirm Popcorn accepts this.
get_popcorn_input_from_eas_meta_sumstats(eas_meta_sumstats_dir, "EAS_meta_ERneg_update.sumstats.txt", overall=False)

get_popcorn_input_from_eas_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erneg_sumdata.txt
get_popcorn_input_from_eas_meta_sumstats: processing summary statistics
get_popcorn_input_from_eas_meta_sumstats: writing sumstats to EAS_meta_ERneg_update.sumstats.txt
              rsid a1 a2    beta      SE            N
0       rs16851691  A  G  0.0265  0.0858  2775.630408
1         rs745966  A  C -0.0138  0.0345  2989.781482
2        rs4692870  C  T  0.0411  0.0522  2760.456096
3        rs1893488  A  G  0.0237  0.0388  2424.836678
4        rs6958207  G  T -0.0028  0.0559  2206.838668
...            ... .. ..     ...     ...          ...
989869  rs10810293  C  T -0.0192  0.0280  2752.897661
989870   rs2745575  A  G -0.0136  0.0302  2270.076506
989871   rs9568879  C  T -0.0078  0.0402  2886.404449
989872   rs6009104  C  T  0.0483  0.0283  2862.915689
989873   rs2941709  A  G  0.0006  0.0289  2449.126467

[989874 rows x 6 column

Preview the East Asian ER-negative summary statistics after formatting for Popcorn.

In [58]:
!head "EAS_meta_ERneg_update.sumstats.txt"

rsid	a1	a2	beta	SE	N
rs16851691	A	G	0.0265	0.0858	2775.63040848111
rs745966	A	C	-0.0138	0.0345	2989.78148184335
rs4692870	C	T	0.0411	0.0522	2760.45609597809
rs1893488	A	G	0.0237	0.0388	2424.83667786311
rs6958207	G	T	-0.0028	0.0559	2206.83866813107
rs6793694	A	G	-0.0759	0.0354	2466.99701565745
rs612963	C	T	0.0132	0.0277	2901.4619513861
rs12582684	A	G	0.0131	0.0323	2875.76651004248
rs2314809	C	T	-0.0287	0.0283	2931.7857893249


In [59]:
!dx upload "EAS_meta_ERneg_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKP2kQ4BzFZjXZG0bf7vy46
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EAS_meta_ERneg_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:15:34 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:15:37 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


In [60]:
# Convert the East Asian ER-positive summary statistics into the Popcorn input format.
# NOTE: this reassigns the same notebook-level variable used for the ER-negative file; it holds a file path, not a directory.
eas_meta_sumstats_dir = "/mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erpos_sumdata.txt"
# overall=False: this is an ER-specific file, whose header differs from the overall meta-analysis file's header.
# NOTE(review): for rs16851691 the raw file lists BETA = 0.1082 with effect allele A, while the printed
# output reports beta = -0.1082 with a1 = A; confirm the sign/allele convention applied by the function is intended.
get_popcorn_input_from_eas_meta_sumstats(eas_meta_sumstats_dir, "EAS_meta_ERpos_update.sumstats.txt", overall=False)

get_popcorn_input_from_eas_meta_sumstats: reading sumstats from /mnt/project/Aim2_Polygenicity/HZ/Results/Clean_summary_data_update/EAS_BCAC_erpos_sumdata.txt
get_popcorn_input_from_eas_meta_sumstats: processing summary statistics
get_popcorn_input_from_eas_meta_sumstats: writing sumstats to EAS_meta_ERpos_update.sumstats.txt
              rsid a1 a2    beta      SE            N
0       rs16851691  A  G -0.1082  0.0622  5261.059018
1         rs745966  A  C -0.0512  0.0272  4814.478642
2        rs4692870  C  T  0.0011  0.0408  4495.408946
3        rs1893488  A  G  0.0323  0.0302  4002.506610
4        rs6958207  G  T  0.0072  0.0437  3611.031915
...            ... .. ..     ...     ...          ...
989918  rs10810293  C  T  0.0072  0.0220  4459.239187
989919   rs2745575  A  G -0.0103  0.0237  3685.738727
989920   rs9568879  C  T -0.0453  0.0317  4661.583159
989921   rs6009104  C  T  0.0435  0.0220  4739.688892
989922   rs2941709  A  G  0.0272  0.0224  4076.471123

[989923 rows x 6 column

Preview the East Asian ER-positive summary statistics after formatting for Popcorn.

In [61]:
!head "EAS_meta_ERpos_update.sumstats.txt"

rsid	a1	a2	beta	SE	N
rs16851691	A	G	-0.1082	0.0622	5261.05901759297
rs745966	A	C	-0.0512	0.0272	4814.478641537
rs4692870	C	T	0.0011	0.0408	4495.40894601398
rs1893488	A	G	0.0323	0.0302	4002.50660971255
rs6958207	G	T	0.0072	0.0437	3611.03191542221
rs6793694	A	G	0.0132	0.0276	4053.953950615
rs612963	C	T	0.0226	0.0215	4814.77835774107
rs12582684	A	G	0.0019	0.0251	4760.59028129984
rs2314809	C	T	0.0006	0.022	4851.31801820748


In [62]:
!dx upload "EAS_meta_ERpos_update.sumstats.txt"    # uploads the popcorn input formatted file

ID                          file-GgKP3K04BzFq3yxP24b58X9J
Class                       file
Project                     project-GYBZ3yj4BzFgB61K7vX9Fq9K
Folder                      /Aim4_CrossAncestryGeneticCorrelation/jahagirdarob
                            /sumstats_files
Name                        EAS_meta_ERpos_update.sumstats.txt
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon Feb 26 20:16:56 2024
Created by                  omjahagirdar
 via the job                job-GgKKBb84BzFvvy5XGvZGg7fB
Last modified               Mon Feb 26 20:16:58 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


In [64]:
!dx ls    # lists the current dx folder to confirm the uploaded sumstats files are present

AFR.sumstats.txt
AFR_meta.sumstats.txt
AFR_meta_ERneg.sumstats.txt
AFR_meta_ERneg_totsamples.sumstats.txt
AFR_meta_ERneg_update.sumstats.txt
AFR_meta_ERpos.sumstats.txt
AFR_meta_ERpos_totsamples.sumstats.txt
AFR_meta_ERpos_update.sumstats.txt
AFR_meta_update.sumstats.txt
EAS.sumstats.txt
EAS_meta.sumstats.txt
EAS_meta_ERneg_update.sumstats.txt
EAS_meta_ERpos_update.sumstats.txt
EAS_meta_update.sumstats.txt
EUR.sumstats.txt
EUR_meta.sumstats.txt
EUR_meta_ERneg.sumstats.txt
EUR_meta_ERneg_totsamples.sumstats.txt
EUR_meta_ERneg_update.sumstats.txt
EUR_meta_ERpos.sumstats.txt
EUR_meta_ERpos_totsamples.sumstats.txt
EUR_meta_ERpos_update.sumstats.txt
EUR_meta_update.sumstats.txt
