In [None]:
import glob
import os

import pandas as pd
import numpy as np

# Combine Significant GWAS Results 

In [None]:
# Long trait names sometimes get in the way of the GWAS analysis software.
# This lookup converts a trait name back to its full name: column 0 of the
# headerless, tab-separated conversion file is the key, column 1 the value.
trait_table = pd.read_csv("trait_conversion.txt", header=None, sep="\t")
trait_dict = {key: full_name for key, full_name in zip(trait_table[0], trait_table[1])}

In [None]:
# Bonferroni-adjusted significance threshold, shown on the -log10 scale.
# `multipletests` is imported from its documented public module rather than
# the experimental `statsmodels.sandbox` namespace.
from statsmodels.stats.multitest import multipletests

# Number of tests: use a custom n, or count the rows in a full result file.
N_SNPS = 178316
# Family-wise error rate that is divided across the N_SNPS tests.
FAMILYWISE_ALPHA = 0.1

# For Bonferroni only the number of tests matters, so the values passed in are
# placeholders of the right length. The last element of the returned tuple is
# the Bonferroni-corrected alpha (alpha / number of tests).
p_adjusted = multipletests(range(N_SNPS), alpha=FAMILYWISE_ALPHA, method='bonferroni')
-np.log10(p_adjusted[-1])

### TODO: The results still contain duplicate rows; this needs to be fixed.

In [None]:
# Collect, per trait, the significant SNPs from every MLM result file and keep
# only one SNP per locus (the most significant one).

# SNPs with a p-value below this are considered significant.
P_VALUE_THRESHOLD = 10**-5
# A SNP within this many base pairs of an already kept SNP on the same
# chromosome is treated as part of the same locus and dropped.
LOCUS_WINDOW_BP = 500000


def select_independent_loci(snps, window=LOCUS_WINDOW_BP, selected=None):
    """Greedily keep SNPs that are not close to an already kept SNP.

    Parameters
    ----------
    snps : iterable of (chrom, pos)
        Candidate SNPs in priority order; earlier candidates win.
    window : int
        A candidate is dropped when it lies within ``window`` (inclusive) of
        a kept SNP on the same chromosome.
    selected : list, optional
        Previously kept SNPs as a flat ``[chrom, pos, chrom, pos, ...]`` list.
        It is copied, not modified.

    Returns
    -------
    list
        Flat ``[chrom, pos, chrom, pos, ...]`` list of the kept SNPs, starting
        with the entries of ``selected``.
    """
    kept = list(selected) if selected is not None else []
    for chrom, pos in snps:
        # Compare against *every* kept SNP and append at most once. Only even
        # slots hold chromosomes, so a position that happens to equal a
        # chromosome id is never mistaken for one.
        near_kept_snp = any(
            kept[i] == chrom and abs(pos - kept[i + 1]) <= window
            for i in range(0, len(kept), 2)
        )
        if not near_kept_snp:
            kept += [chrom, pos]
    return kept


# Sorted so the processing order (and therefore the output) is reproducible.
result_files = sorted(glob.glob('MLM_results/*.tsv'))
result_dict = {}
for file_name in result_files:
    # Open the GWAS result file
    result_df = pd.read_csv(file_name, sep="\t")
    if result_df.empty:
        # No rows means no trait name to read and nothing to report.
        continue
    result_name = result_df["Trait"].iloc[0]
    # Most significant SNPs first, so they become the representative of a locus.
    significant_df = (
        result_df.loc[result_df["p"] < P_VALUE_THRESHOLD, ["Chr", "Pos", "p"]]
        .sort_values(by="p", axis=0)
    )
    if significant_df.empty:
        continue
    tname = trait_dict[result_name]
    # itertuples keeps each column's own dtype; going through `.values` together
    # with the float "p" column would turn integer Chr/Pos into floats.
    candidates = significant_df[["Chr", "Pos"]].itertuples(index=False, name=None)
    result_dict[tname] = select_independent_loci(
        candidates, selected=result_dict.get(tname)
    )

In [None]:
# Each trait maps to a flat [chrom, loc, chrom, loc, ...] list; walk it two
# items at a time to emit one [trait, chrom, loc] row per SNP.
snp_list = [
    [trait] + flat_loci[start:start + 2]
    for trait, flat_loci in result_dict.items()
    for start in range(0, len(flat_loci), 2)
]
# Combined table of all SNPs across traits, written as a tab-separated file.
comb_db = pd.DataFrame(snp_list, columns=["name", "chrom", "loc"])
comb_db.to_csv("combined_snp_results.tsv", sep="\t")