# Calculate statistics for phenome and SNPs

In [1]:
import os
import pathlib
import pandas as pd

# Project root: the current working directory with every "/synthetics"
# substring removed (presumably this notebook is run from a synthetics/
# sub-directory -- confirm). Experiment outputs live under mice_data_set/out.
base_path = pathlib.Path(
    os.getcwd().replace("/synthetics", "")
)
experiment_path = base_path.joinpath('mice_data_set', 'out')

In [2]:
# Derive the phenotype names from the pheno_<name>.csv files found in the
# experiment output directory, sorted alphabetically.

phenotype_list = sorted(
    csv_file.stem.replace('pheno_', '')
    for csv_file in experiment_path.glob('pheno_*.csv')
)

print(f"Found {len(phenotype_list)} phenotypes.")
print(f"Examples: {', '.join(phenotype_list[:10])}...")


Found 68 phenotypes.
Examples: AvAltContextD3, AvContextD2, AvToneD1, AvToneD3, BMD, D1TOTDIST10, D1TOTDIST15, D1TOTDIST20, D1TOTDIST25, D1TOTDIST30...


In [3]:
from collections import defaultdict

def _leading_snps(results: pd.DataFrame, significant: pd.Series) -> list:
    """Return the `snp` values for the unbroken run of significant rows at the
    top of `results`, stopping at the first row that is not significant.
    """
    flags = significant.to_numpy(dtype=bool)
    # argmin on a boolean array is the index of the first False. When every
    # row is significant (or there are no rows at all) keep everything.
    stop = len(flags) if flags.all() else int(flags.argmin())
    return results['snp'].iloc[:stop].tolist()


def find_phenotype_associations(phenotype_list: list, path: pathlib.Path,
                                lm_p_threshold: float = 1e-7,
                                gemma_log10p_threshold: float = 8,
                                file_suffix: str = "1_79646") -> defaultdict:
    """ Using GEMMA and LM results, determine which phenotypes have
        significant SNP associations.

        For every phenotype two result files are read from `path`:
        `lm_<phenotype>_<file_suffix>.csv` (columns `snp`, `p`) and
        `ge_<phenotype>_<file_suffix>.csv` (columns `snp`, `log10p`).
        Rows are consumed from the top of each file and reading stops at the
        first row that fails the test, so the files are assumed to be sorted
        with the most significant SNPs first.

        Args:
            phenotype_list: phenotype names to look up.
            path: directory containing the LM and GEMMA result files.
            lm_p_threshold: an LM row counts when `p` is strictly below this.
            gemma_log10p_threshold: a GEMMA row counts when `log10p` is at
                least this.
                NOTE(review): if `log10p` is -log10(p), the default of 8
                corresponds to p <= 1e-8, which is stricter than the LM
                default of 1e-7 -- confirm the two cut-offs are meant to
                differ.
            file_suffix: trailing part of the result file names.

        Returns:
            defaultdict(list) mapping phenotype -> list of SNP ids, LM hits
            first and GEMMA hits after them. A SNP found by both methods
            appears twice. Phenotypes without any hit have no key.

        Raises:
            FileNotFoundError: if a result file for a phenotype is missing.
    """
    matches = defaultdict(list)

    for phenotype in phenotype_list:

        # Test for LM associations. Read from `path` (the argument) rather
        # than a notebook-level global so the function works on any directory.
        lm_results = pd.read_csv(path / f"lm_{phenotype}_{file_suffix}.csv")
        lm_hits = _leading_snps(lm_results, lm_results['p'] < lm_p_threshold)

        # Test for GEMMA associations
        gemma_results = pd.read_csv(path / f"ge_{phenotype}_{file_suffix}.csv")
        gemma_hits = _leading_snps(
            gemma_results, gemma_results['log10p'] >= gemma_log10p_threshold
        )

        # Only create a key when there is something to record, so callers
        # iterating over the result see associated phenotypes only.
        hits = lm_hits + gemma_hits
        if hits:
            matches[phenotype].extend(hits)

    return matches
        
phenotype_snps = find_phenotype_associations(phenotype_list, experiment_path)

# Report "<phenotype>:<number of collected SNP entries>" for each phenotype
# present in the result.
for phenotype_name, snp_hits in phenotype_snps.items():
    print(f"{phenotype_name}:{len(snp_hits)}")

D1TOTDIST10:1
D1ctrtime0to30:5
D2TOTDIST25:1
abBMD:467
soleus:1
testisweight:123
tibia:7


In [4]:
def count_phenotype_snps(phenotype_list: list, phenotype_snps: dict) -> dict:
    """Count the distinct SNPs associated with each phenotype.

    Args:
        phenotype_list: phenotype names to report on, in output order.
        phenotype_snps: mapping phenotype -> list of SNP ids (duplicates
            allowed). Phenotypes missing from the mapping are treated as
            having no SNPs; the mapping itself is never modified.

    Returns:
        dict mapping phenotype -> number of unique SNPs, containing only the
        phenotypes that have at least one SNP. One summary line is printed
        per reported phenotype.
    """
    snp_counts = {}
    for phenotype in phenotype_list:
        # .get() avoids inserting empty entries into a defaultdict and keeps
        # plain dicts usable. A set removes SNPs listed more than once.
        unique_snps = set(phenotype_snps.get(phenotype, ()))
        if unique_snps:
            # Count the phenotype's own SNPs. Comparing against every other
            # phenotype would overwrite this with a pairwise overlap size
            # whenever two phenotypes share a SNP.
            snp_counts[phenotype] = len(unique_snps)
            print(f"Phenotype {phenotype} has associations with {len(unique_snps)} SNPs")

    return snp_counts

# Build the phenotype -> SNP-count mapping used by the following cell
# (count_phenotype_snps also prints its findings as it goes).
snp_counts = count_phenotype_snps(phenotype_list, phenotype_snps)

Phenotype D1TOTDIST10 has associations with 1 SNPs
Phenotype D1ctrtime0to30 has associations with 5 SNPs
Phenotype D2TOTDIST25 has associations with 1 SNPs
Phenotype abBMD has associations with 261 SNPs
Phenotype soleus has associations with 1 SNPs
Phenotype testisweight has associations with 78 SNPs
Phenotype tibia has associations with 7 SNPs


In [17]:
# For the phenotypes that passed the test above, gather the SNPs from the
# GEMMA results whose `log10p` is at least 8 (presumably -log10 of the
# p-value, i.e. p <= 1e-8 -- confirm). Reading stops at the first row below
# the cut-off, so the files are assumed to be sorted most-significant first.

matches = defaultdict(list)
for phenotype in snp_counts:

    gemma_results = pd.read_csv(experiment_path / f'ge_{phenotype}_1_79646.csv')
    for snp, log10p in zip(gemma_results['snp'], gemma_results['log10p']):
        if not log10p >= 8:
            break
        matches[phenotype].append(snp)

for phenotype_name, snps in matches.items():
    print(f"{phenotype_name} has {len(snps)} associated SNPs")

abBMD has 206 associated SNPs
testisweight has 46 associated SNPs


In [None]:
# TODO: For each associated SNP, get the pearson's correlation values
# Possible bug: Need to make sure map.ipynb X_ genotypes are being generated correctly.