In [None]:
import pandas as pd
import sys
import time
import numpy as np

# Preprocessing Summary Statistics to get RSID based files needed for MAGMA
-----------------------------------------------------------------------------

## 1. Making bed files to use BEDOPS to map to rsid values
### A. Making bed files from the GWAS summary data
We realized that many of the SNPs that didn't map to an RSID were off by 1 bp from a true RSID with the same mutation. Therefore, we created multiple BED files with 1 bp added to or subtracted from the recorded position. All of these files are then mapped to the SNP database to get proper RSID values.

In [None]:
# Load the tab-delimited GWAS summary statistics, which carry the
# chromosome and position of every SNP.
sum_stats = pd.read_csv(
    "/path/to/sumstats/file.txt",
    delimiter="\t",
)
# Preview one record (the second row) to check how the columns were parsed.
sum_stats.iloc[1:2]

Unnamed: 0,MarkerName,Allele1,Allele2,StdErr,Effect,P.value,Chr,Pos,Disease
1,1:10000006_G_A,a,g,0.141,0.0789,0.5757,1,10000006,UC


In [None]:
# Standardize the column names.
#   Required columns: SNP, Chr, Pos, Pval
#   Optional columns: A1/A2, Beta, SE
# The list below is an example; names are assigned by position, so it must
# follow the column order of the file that was read in.
standard_columns = ['SNP', 'A1', 'A2', 'SE', 'Effect', 'Pval', 'Chr', 'Pos', 'Disease']
sum_stats.columns = standard_columns
# Preview the same record again, now with the standardized names.
sum_stats.iloc[1:2]

Unnamed: 0,SNP,A1,A2,SE,Effect,Pval,Chr,Pos,Disease
1,1:10000006_G_A,a,g,0.141,0.0789,0.5757,1,10000006,UC


In [None]:
## Shifted copies of the reported position: +1, +2, -1 and -2
reported_pos = sum_stats["Pos"]
pos_plus_one = reported_pos.add(1)
pos_plus_two = reported_pos.add(2)
pos_minus_one = reported_pos.sub(1)
pos_minus_two = reported_pos.sub(2)

In [None]:
# CREATE BED FILES FOR EACH POSITION OFFSET
# Every file has the columns Chr, Start, End, SNP, QUAL, REF, ALT and is
# written tab-separated, without a header row or index.

# dot line used as the placeholder for the quality metric column
dot_line = len(sum_stats.Pos)*['.']


def make_bed_frame(start, end):
    """Build the BED-style frame for one pair of interval coordinates.

    Parameters
    ----------
    start, end : pandas.Series
        Start and end coordinates, aligned row by row with `sum_stats`.

    Returns
    -------
    pandas.DataFrame
        Columns Chr, Start, End, SNP, QUAL, REF, ALT (in that order).
    """
    return pd.DataFrame({"Chr": sum_stats.Chr, "Start": start, "End": end,
                         "SNP": sum_stats.SNP, "QUAL": dot_line,
                         # A2 is written as REF and A1 as ALT in every file.
                         # (The first file previously read `sum_stats.Allele2`,
                         # a column that no longer exists after the rename.)
                         "REF": sum_stats.A2, "ALT": sum_stats.A1})


# (output file, start coordinates, end coordinates) for each offset
bed_jobs = [
    ("06_16_2023_UC2017_original_start_pos.bed", sum_stats.Pos, pos_plus_one),
    ("06_16_2023_UC2017_plus_one_start_pos.bed", pos_plus_one, pos_plus_two),
    ("06_16_2023_UC2017_minus_one_start_pos.bed", pos_minus_one, sum_stats.Pos),
    ("06_16_2023_UC2017_minus_two_start_pos.bed", pos_minus_two, pos_minus_one),
]

for file_name, start, end in bed_jobs:
    rsid_stats = make_bed_frame(start, end)
    rsid_stats.to_csv(file_name, header=False, index=False, sep="\t")

____________________________________________________________________________________
## 2. Use script get_rsIDs.sh to map these SNPs to the SNP database

____________________________________________________________________________________
## 3. With output from get_rsIDs.sh, get final rsid values with preference to original position
Some summary statistics are not written in the same format, so we first need to determine which of the shifted files (orig, -1, +1, etc.) holds the true original positions, based on it having significantly more matches (we got about 35 times more matches in the "true original" than in the next highest).

From there, we can favor which RSIDs to keep to ensure all SNPs are accounted for.

In [None]:
# read in rsID results (BEDOPS results)
col_values = ["Chr", "Start", "End", "SNP_name", "QUAL", "REF", "ALT", "RSIDs"]


def read_rsid_bed(path):
    """Read one tab-separated rsID BED file, naming its columns `col_values`.

    The file is read with header=None: with the default header handling
    pandas would use the first record as column names, and that record would
    be lost once the names are replaced by `col_values`.
    NOTE(review): assumes the get_rsIDs.sh output has no header row (the BED
    files written in step 1 have none) - confirm for your files.
    """
    return pd.read_csv(path, sep='\t', header=None, names=col_values)


minus_two_rsids = read_rsid_bed("/path/to/files/rsID_pos_minus_two.bed")
minus_one_rsids = read_rsid_bed("/path/to/files/rsID_pos_minus_one.bed")
original_rsids = read_rsid_bed("/path/to/files/rsID_pos_original.bed")
plus_one_rsids = read_rsid_bed("/path/to/files/rsID_pos_plus_one.bed")

In [None]:
# Count the rows (rsID matches) in each offset file; the file with the most
# matches is taken to be the true original.
n_minus_two = len(minus_two_rsids)
n_minus_one = len(minus_one_rsids)
n_original = len(original_rsids)
n_plus_one = len(plus_one_rsids)

# total over all four files, then the per-file breakdown
print(n_minus_two + n_minus_one + n_original + n_plus_one)
print("The number of rsIDs paired:\n-2: ", n_minus_two,
    "\n-1: ", n_minus_one,
    "\nog: ", n_original,
    "\n+1: ", n_plus_one)

In [None]:
# Index every result frame by SNP name; the "SNP_name" column itself stays
# in place as a regular column.
for rsid_frame in (minus_two_rsids, minus_one_rsids, original_rsids, plus_one_rsids):
    rsid_frame.index = rsid_frame["SNP_name"]

# the shapes must be unchanged by the re-indexing
print(minus_two_rsids.shape, minus_one_rsids.shape,
      original_rsids.shape, plus_one_rsids.shape)

#### B. Get the rsids for each SNP
Note: BEDOPS considers the end position as the BP position of the SNP, therefore the minus_one_rsid is actually the true original format, original is +1 and minus_two is -1.

Favor the one with the greatest number of overlaps first; beyond that, the general order of preference is [original, -1, +1, -2]:

    1. For RA, that was favor pos_minus_one first, then original, then minus_two.
    2. For UC, that was favor pos_plus_one first, then original, then minus_one

In [None]:
# Set the order of preference here, most favored first
# (the example below follows UC):
first_favor, second_favor, third_favor, fourth_favor = (
    plus_one_rsids,
    original_rsids,
    minus_one_rsids,
    minus_two_rsids,
)


In [None]:
# Get the SNP names of the SNPs with rsids found with the second favored
# that did not have an rsid found with the first favored
t0 = time.time()
# True for second-favored rows whose SNP is absent from the first-favored file
new_in_second = ~second_favor["SNP_name"].isin(first_favor["SNP_name"])
only_second_favor_rsids = second_favor.loc[new_in_second, "SNP_name"]
tf = time.time()
print(len(only_second_favor_rsids))
print(tf-t0, "seconds")

# remove all SNPs already addressed in first favored
# The boolean mask is applied directly instead of re-selecting the rows by
# label: a label lookup with .loc returns extra rows whenever a SNP name is
# repeated in the index.
second_favor = second_favor.loc[new_in_second]
second_favor.index = second_favor["SNP_name"]

# concatenate these SNPs to one dataframe
full_rsids = pd.concat([first_favor, second_favor])
full_rsids.index = full_rsids["SNP_name"]

# ensure proper shape: the row count must equal the two parts added together
print(full_rsids.shape)
print(len(only_second_favor_rsids) + first_favor.shape[0])


In [None]:
# Then remove all SNPs already addressed in previously favored in next favored
# True for third-favored rows whose SNP is not yet present in full_rsids
new_in_third = ~third_favor["SNP_name"].isin(full_rsids["SNP_name"])
only_third_favor_rsids = third_favor.loc[new_in_third, "SNP_name"]
print(len(only_third_favor_rsids))
# The boolean mask is applied directly instead of re-selecting the rows by
# label: a label lookup with .loc returns extra rows whenever a SNP name is
# repeated in the index.
third_favor = third_favor.loc[new_in_third]
third_favor.index = third_favor["SNP_name"]

# concatenate these SNPs to the previously favored dataframe
full_rsids = pd.concat([full_rsids, third_favor])
# ensure proper shape: the row count must equal the three parts added together
print(full_rsids.shape)
print(len(only_third_favor_rsids) + len(only_second_favor_rsids) + first_favor.shape[0])
full_rsids.head()

### 4. Remove duplicate rsids, and address if SNP has multiple RSIDs
1. Keep the shortest RSID if the same SNP has multiple RSIDs. The shortest was chosen as it is usually
    the most updated version of the RSID with the longer version being merged into it.
2. Any duplicate rsids were identified with only the SNP with the lowest p-value's stats being used for that rsid

#### Keep shortest RSID if a SNP mapped to multiple RSIDs (usually most updated)

In [None]:
# Split each ';'-separated RSIDs string once and derive two columns:
#   RSID_set   - the set of distinct rsIDs mapped to the SNP
#   RSID_first - the first rsID listed for the SNP
# Series-level .str/.map calls replace the row-wise DataFrame.apply, which
# took 101 s and 84 s for these two columns in the recorded run.
t0 = time.time()
rsid_lists = full_rsids['RSIDs'].str.split(';')
# rows without an RSIDs value stay missing instead of raising
full_rsids['RSID_set'] = rsid_lists.map(lambda rsids: set(rsids), na_action='ignore')
tf= time.time()
print(tf-t0)
t0 = time.time()
# Take the first rsID in the order it is listed. Element 0 of a set is
# arbitrary and can differ between kernel sessions, because Python
# randomizes string hashing per process.
full_rsids['RSID_first'] = rsid_lists.str[0]
tf= time.time()
print(tf-t0)

101.26584196090698
84.33529496192932


In [None]:
t0 = time.time()
# Shortest rsID per SNP. Ties on length are broken alphabetically, so the
# choice does not depend on set iteration order (which can differ between
# kernel sessions). Mapping over the single column replaces the row-wise
# DataFrame.apply, which took 84 s in the recorded run; missing sets are skipped.
full_rsids['RSID_shortest'] = full_rsids['RSID_set'].map(
    lambda rsids: min(rsids, key=lambda rsid: (len(rsid), rsid)),
    na_action='ignore')
tf= time.time()
print(tf-t0)

84.22631597518921


In [None]:
# Number of distinct rsIDs assigned to each SNP (size of its RSID_set)
rsids_per_snp = full_rsids.RSID_set.str.len()
# Denominator of the percentages printed below.
# NOTE(review): hard-coded; presumably the total SNP count of this data set - confirm.
total_snps = 13268935

# SNPs that were assigned more than one RSID
mult_rsids = full_rsids.loc[rsids_per_snp > 1].reset_index(drop=True)
print("SNPs with more than one RSID assigned")
print(mult_rsids.shape)
print((mult_rsids.shape[0]/total_snps) * 100)

print("SNPs with only one RSID assigned")
# SNPs that were assigned exactly one RSID
single_rsids = full_rsids.loc[rsids_per_snp == 1].reset_index(drop=True)
print(single_rsids.shape)
print((single_rsids.shape[0]/total_snps) * 100)

(708292, 11)
5.33797173623957


In [None]:
# Add statistics back in
# full_rsids is indexed by SNP name, while sum_stats still carries the default
# integer index from read_csv, so an index-on-index merge of the two as-is
# matches nothing. sum_stats is therefore re-indexed by its SNP column first,
# which lines the rows up SNP by SNP. Columns present in both frames (e.g. Chr)
# get the _x (full_rsids) and _y (sum_stats) suffixes.
full_rsids_stats = pd.merge(full_rsids, sum_stats.set_index('SNP'),
                            left_index=True, right_index=True)
# compare the row counts before and after the merge
print(full_rsids.shape, sum_stats.shape, full_rsids_stats.shape)

In [None]:
## If RSID_shortest is missing (NULL) for a SNP, fall back to RSID_first
# Split the merged frame into rows with and without an RSID_shortest value.
# Both parts come from full_rsids_stats; distinct_full_rsids_stats does not
# exist yet at this point of the notebook.
shortest_missing = full_rsids_stats['RSID_shortest'].isnull()
# .copy() so the assignment below edits an independent frame, not a view
na_values = full_rsids_stats[shortest_missing].copy()
non_na_values = full_rsids_stats[~shortest_missing]
print(na_values.shape, non_na_values.shape)

if len(na_values) > 0:
    # replace the missing values with the first RSID
    t0 = time.time()
    na_values['RSID_shortest'] = na_values['RSID_first']
    tf= time.time()
    print(tf-t0)
    # number of rows still missing an RSID after the fallback
    print(na_values[na_values['RSID_shortest'].isnull()].shape)

    # recombine the dataframes (this resets the index to 0..n-1)
    full_rsids_stats = pd.concat([na_values, non_na_values],
                                               ignore_index=True)
    print(full_rsids_stats.shape)

#### If duplicates, keep one with lowest p-value

In [None]:
# get duplicates: rows whose RSID_shortest already occurred in an earlier row
t0 = time.time()
# Duplicates are detected on RSID_shortest, the same key used for the filter
# below. The RSID_set column holds Python sets, which are unhashable and so
# cannot be used as a key for duplicated().
duplicate_rsid_set = full_rsids_stats[full_rsids_stats.duplicated(['RSID_shortest'])]
tf = time.time()
print(tf-t0, "seconds")
print(duplicate_rsid_set.shape)
# rows whose rsID occurs exactly once in the data
non_duplicate_rsid_short = full_rsids_stats[~full_rsids_stats['RSID_shortest'].isin(duplicate_rsid_set['RSID_shortest'])]


In [None]:
# make a list of the SNP names to keep for duplicates
# the distinct rsIDs that are shared by more than one SNP
dup_rsids = set(duplicate_rsid_set['RSID_shortest'])
print(len(dup_rsids))
t0 = time.time()
# all rows belonging to a duplicated rsID, selected in one pass instead of
# scanning the whole frame once per rsID
dup_rows = full_rsids_stats[full_rsids_stats['RSID_shortest'].isin(dup_rsids)]
# rsID -> list of the SNP names mapped to it
rsid_dict = dup_rows.groupby('RSID_shortest')['SNP_name'].agg(list).to_dict()
# For each duplicated rsID keep the SNP with the lowest p-value: after sorting
# by Pval (missing p-values sort last), the first row of every rsID is its minimum.
lowest_pval_rows = (dup_rows
                    .sort_values('Pval', kind='mergesort')
                    .drop_duplicates('RSID_shortest', keep='first'))
keep_SNP_names = list(lowest_pval_rows['SNP_name'])
count = len(keep_SNP_names)
tf = time.time()
print(tf-t0)
print(len(keep_SNP_names))
print(count)

In [None]:
# Combine the SNP names kept for duplicated rsIDs with the SNP names whose
# rsID was never duplicated.
non_duplicate_names = non_duplicate_rsid_short['SNP_name']
full_keep_SNP_names = [*keep_SNP_names, *non_duplicate_names]

# sizes of the two parts and of the combined list
print(len(keep_SNP_names))
print(len(non_duplicate_names))
print(len(full_keep_SNP_names))

In [None]:
# get the complete rsid stats with just distinct RSIDs
# Rows are matched on the 'SNP_name' column, the same column the keep list was
# built from; no 'SNP_name_x' column is created by the merge above.
distinct_full_rsids_stats = full_rsids_stats[full_rsids_stats['SNP_name'].isin(full_keep_SNP_names)]
distinct_full_rsids_stats.shape

### 5. Remove MHC region

In [None]:
# Position window (inclusive) used for the MHC region
MHC_START, MHC_END = 28477797, 33448354

# End positions inside the window, taken over all chromosomes
in_mhc_window = distinct_full_rsids_stats['End'].between(MHC_START, MHC_END)
bad_list = distinct_full_rsids_stats.loc[in_mhc_window, 'End']
print(len(bad_list))

# restrict to chromosome 6 and keep the rows whose End is one of those positions
six = distinct_full_rsids_stats.loc[distinct_full_rsids_stats['Chr_x'] == 6]
bad = six.loc[six['End'].isin(bad_list)]
print(bad.shape)

# drop every SNP found in the MHC region from the distinct-rsID frame
SNP_bad = bad['SNP_name']
keep_rows = ~distinct_full_rsids_stats['SNP_name'].isin(SNP_bad)
distinct_full_rsids_stats = distinct_full_rsids_stats[keep_rows]
print(distinct_full_rsids_stats.shape)
print(len(SNP_bad))

### 6*. If using with Munge Statistics python (as with scDRS), see if pandas identifies any very small P values as 0 and change to 1e-300 instead since otherwise Munge will throw out the SNP believing it to have an improper p-value

In [None]:
# Replace p-values of exactly 0 with 1e-300.
# The fix is applied to a copy of the frame produced by the MHC-removal step
# and stored as fixed_distinct_complete_rsids_stats, the frame the save step
# reads. (The names used here before were not defined anywhere in the notebook.)
fixed_distinct_complete_rsids_stats = distinct_full_rsids_stats.copy()

# counts before the replacement: p-values equal to 0, then equal to 1e-300
print(fixed_distinct_complete_rsids_stats.loc[fixed_distinct_complete_rsids_stats["Pval"] == 0,].shape)
print(fixed_distinct_complete_rsids_stats.loc[fixed_distinct_complete_rsids_stats["Pval"] == 1e-300,].shape)
fixed_distinct_complete_rsids_stats.loc[fixed_distinct_complete_rsids_stats['Pval'] == 0, 'Pval'] = 1e-300
# counts after the replacement: the first should now be 0 rows
print(fixed_distinct_complete_rsids_stats.loc[fixed_distinct_complete_rsids_stats["Pval"] == 0,].shape)
print(fixed_distinct_complete_rsids_stats.loc[fixed_distinct_complete_rsids_stats["Pval"] == 1e-300,].shape)

### 7. Save the appropriate files


In [None]:
# Select the output columns. The statistics columns are referenced by the
# standardized names assigned when the summary statistics were loaded
# (A1, A2, SE, Pval); the raw names (Allele1, Allele2, StdErr, P.value) no
# longer exist at this point.
# .copy() gives an independent frame, so renaming its columns cannot affect
# the source frame.
pd_rsidchrposa1a2 = fixed_distinct_complete_rsids_stats[["RSID_shortest", "Chr_x", "End", "A1", "A2", "Effect", "SE", "Pval"]].copy()
# header names written to the output file
pd_rsidchrposa1a2.columns = ["SNP", "Chr_x", "BP", "A1", "A2", "Effect", "SE", "Pval"]
# NOTE(review): absolute, user-specific output path - adjust before running elsewhere.
file_name = "/Users/hopekirby/Desktop/SC_GWAS_Bench/data/rsid_based_files/06_20_2023_pd_rsidchrposa1a2_UC2017_noMHC.txt"
# tab-separated, with a header row and without the index
pd_rsidchrposa1a2.to_csv(file_name, header=True, index=False, sep="\t")