In [1]:
import pandas as pd
import numpy as np
import re
import itertools as it
import os

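# METAL runs

The commands below are run from a terminal, once per analysis (all ancestry, eur, non-eur). In each group, the first two lines are shell commands that change into the analysis directory and start METAL; the `SOURCE` line is then entered at the METAL prompt and executes the named METAL script.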

```bash
# all ancestry
cd /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/all_ancestry
/Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/METAL/build/bin/metal
SOURCE /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/all_ancestry/metal_run.sh

# eur
cd /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/eur
/Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/METAL/build/bin/metal
SOURCE /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/eur/metal_run_eur.sh

# non eur
cd /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/noneur
/Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/METAL/build/bin/metal
SOURCE /Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic/data/metal/noneur/metal_run_noneur.sh

```

In [2]:
def get_most_deleterious_idx(ser, priority=("pLoF", "Missense_strict", "Missense_lenient")):
    """Return the index label of the most deleterious mask present in ``ser``.

    Parameters
    ----------
    ser : pandas.Series
        Mask labels (for example one gene's values of a ``gene_mask`` column).
    priority : sequence of str, optional
        Mask labels ordered from most to least deleterious. The default keeps
        the original ranking: ``pLoF``, then ``Missense_strict``, then
        ``Missense_lenient``.

    Returns
    -------
    The index label of the first row whose value equals the highest-ranked
    mask found in ``ser``.

    Raises
    ------
    IndexError
        If ``ser`` contains none of the masks listed in ``priority``.
    """
    for mask in priority:
        hits = ser.loc[ser == mask].index
        if len(hits) > 0:
            # Several rows may carry the same mask; keep the first, as before.
            return hits[0]
    # Previously this case surfaced as a bare "index 0 is out of bounds" error;
    # keep the exception type but say what was actually wrong.
    present = sorted(map(str, ser.unique()))
    raise IndexError(
        f"none of the masks {tuple(priority)!r} found in series; got {present!r}"
    )


In [3]:
# Root of the project tree. The BMI_PROJECT_DIR environment variable, when set,
# overrides the machine-specific default so the notebook can run on another
# checkout without editing this cell.
PROJECT_DIR = os.environ.get(
    "BMI_PROJECT_DIR",
    "/Users/deeprobanerjee/Documents/bmi_project/BMI_monogenic",
)
# Tab-separated table loaded into sig_meta_res_df; later cells read its
# "gene", "gene_mask", "p_value" and "beta" columns.
filename = os.path.join(PROJECT_DIR, "data/meta/results/all_ancestry/ivw_fixed/bmi_rint_monogenic_meta.tsv")
sig_meta_res_df = pd.read_csv(filename, sep="\t")


In [4]:
# Keep one row per gene: the row carrying that gene's most deleterious mask,
# as chosen by get_most_deleterious_idx. The result is ordered by ascending
# p_value and given a fresh 0..n-1 index.
gene_masks_by_gene = sig_meta_res_df.groupby("gene")["gene_mask"]
most_del_row_labels = gene_masks_by_gene.apply(get_most_deleterious_idx)
most_del_sig_meta_df = (
    sig_meta_res_df
    .loc[most_del_row_labels]
    .sort_values("p_value")
    .reset_index(drop=True)
)


In [5]:
def get_metal_stats(filename, markers):
    """Load effect statistics for selected markers from a METAL results table.

    Parameters
    ----------
    filename : str or path-like
        Tab-separated table with at least the columns ``MarkerName``,
        ``Effect``, ``StdErr`` and ``P-value``.
    markers : list-like
        Marker names to keep; rows whose ``MarkerName`` is not in this
        collection are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``MarkerName``, ``beta``, ``se``, ``p_value`` (in that order)
        for the matching rows. Row labels are those of the file's rows. The
        frame is an independent copy, so callers may add columns to it.
    """
    source_cols = ["MarkerName", "Effect", "StdErr", "P-value"]
    new_names = {"Effect": "beta", "StdErr": "se", "P-value": "p_value"}
    # Only parse the columns that are returned; the table may carry many more.
    metal_df = pd.read_csv(filename, sep="\t", usecols=source_cols)
    wanted = metal_df["MarkerName"].isin(markers)
    # usecols keeps file order, so select by name to fix the output order.
    selected = metal_df.loc[wanted, source_cols].copy()
    return selected.rename(columns=new_names)


In [6]:
# Each analysis has its own sub-directory holding a METAANALYSIS1.TBL file.
metal_dir = os.path.join(PROJECT_DIR, "data/metal/")
analyses = ["all_ancestry", "eur", "noneur"]
# Marker names are built as "<gene>::<gene_mask>" from the selected rows.
markers = most_del_sig_meta_df.gene + "::" + most_del_sig_meta_df.gene_mask

# Collect the per-analysis frames and concatenate once: growing a DataFrame
# with pd.concat inside the loop re-copies every earlier row on each pass and
# starts from an empty frame, which recent pandas versions warn about.
per_analysis_dfs = []
for analysis in analyses:
    metal_filename = os.path.join(metal_dir, analysis, "METAANALYSIS1.TBL")
    # Tag each row with the analysis it came from.
    metal_df = get_metal_stats(metal_filename, markers).assign(analysis=analysis)
    per_analysis_dfs.append(metal_df)
# Row labels are left as returned by get_metal_stats (not renumbered).
metal_dfs = pd.concat(per_analysis_dfs)


In [7]:
# Split "gene::mask" marker names back into their two parts; this adds the
# "gene" and "gene_mask" columns to metal_dfs in place.
marker_parts = metal_dfs.MarkerName.str.split("::", expand=True)
metal_dfs[["gene", "gene_mask"]] = marker_parts
stats_cols = ["beta", "se", "p_value"]

# One row per analysis; columns keyed by (statistic, gene, gene_mask).
metal_pivot = metal_dfs.pivot(index="analysis", columns=["gene", "gene_mask"], values=stats_cols)

# Re-key the columns as (gene, mask, statistic) and name the three levels.
relabelled_columns = [(gene, mask, stat) for stat, gene, mask in metal_pivot.columns]
metal_pivot.columns = pd.MultiIndex.from_tuples(
    relabelled_columns,
    names=["Gene", "Gene Mask", "Statistic"],
)

# Rows in a fixed order; genes ordered by descending beta from
# most_del_sig_meta_df, each followed by its statistics in stats_cols order.
row_order = ["eur", "noneur", "all_ancestry"]
genes_by_beta = most_del_sig_meta_df.sort_values("beta", ascending=False)
column_order = [
    (gene, mask, stat)
    for gene, mask in genes_by_beta.loc[:, ["gene", "gene_mask"]].values
    for stat in stats_cols
]
output_path = os.path.join(PROJECT_DIR, "data/metal/bmi_rint_monogenic_meta_metal.xlsx")
metal_pivot.loc[row_order, column_order].to_excel(output_path)
