# ANOVA test with genotype -> phenotype data

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the data

In [None]:
# Location of the BXD mice dataset on the S3-compatible object store
s3_path = 's3://lts2-graphnex/BXDmice/'

# Options forwarded to pandas when reading from that store:
# anonymous access, with a custom endpoint instead of the default one
storage_options = {
    'anon': True,
    'client_kwargs': {
        'endpoint_url': 'https://os.unil.cloud.switch.ch',
    },
}

In [None]:
# Load the data: genotype table, phenotype table and phenotype descriptions.
# Alternative genotype file: 'genotype_BXD.csv.gz'
# (presumably the non-reduced version of the same table -- verify before swapping).
genotype_path, phenotype_path, phenotypeinfo_path = (
    os.path.join(s3_path, filename)
    for filename in (
        'geno_reduced.csv.gz',
        'Phenotype.txt.gz',
        'phenotypes_id_aligner.txt.gz',
    )
)


def _open_table(path, **csv_options):
    """Read the CSV at `path` from the remote store and report that it was opened.

    Extra keyword arguments (e.g. `sep`) are forwarded to `pd.read_csv`.
    Returns the loaded DataFrame.
    """
    table = pd.read_csv(path, storage_options=storage_options, **csv_options)
    print('File {} Opened.'.format(path))
    return table


genotype = _open_table(genotype_path)
phenotype = _open_table(phenotype_path, sep='\t')
# Phenotype description
phenotypeinfo = _open_table(phenotypeinfo_path, sep='\t')

## Example on one phenotype
We choose the phenotype with ID 'X122'. This phenotype depends strongly on a small set of SNPs, and this dependence is clearly visible in the results of an ANOVA test.

In [None]:
pheno_id = 'X122'

# Look up the textual description of the chosen phenotype
print('Phenotype description:')
is_selected_info = phenotypeinfo['PhenoID'] == pheno_id
description = phenotypeinfo.loc[is_selected_info, 'Phenotype'].values
print(description)
print('----------')

# Keep only the row(s) of the chosen phenotype, discard every column that
# contains a missing value, and drop the identifier column so that only
# measurement columns remain.
is_selected_pheno = phenotype['PhenoID'] == pheno_id
pheno_BXD = (
    phenotype.loc[is_selected_pheno]
    .dropna(axis='columns')
    .drop(columns='PhenoID')
)
# The remaining column labels are reused to select matching genotype columns
mouse_list = pheno_BXD.columns.tolist()
print('Phenotype values:')
pheno_BXD

In [None]:
# One-way ANOVA per SNP.
# Every row of the genotype table is one SNP. For each of them the mice are
# split into two groups -- genotype value -1 and genotype value +1 -- and the
# phenotype values of the two groups are compared. Mice whose genotype value
# is neither -1 nor +1 are left out of both groups.
geno_BXD = genotype[mouse_list]
fvalues = []
pvalues = []
for _, snp_genotypes in geno_BXD.iterrows():
    mice_minus = snp_genotypes[snp_genotypes == -1].index
    mice_plus = snp_genotypes[snp_genotypes == 1].index
    # Transpose so that the mice run along axis 0, the axis f_oneway tests over
    group_minus = pheno_BXD[mice_minus].to_numpy().T
    group_plus = pheno_BXD[mice_plus].to_numpy().T
    f_stat, p_val = stats.f_oneway(group_minus, group_plus)
    # The inputs are 2-D, so the results are arrays; keep the first entry
    fvalues.append(f_stat[0])
    pvalues.append(p_val[0])

In [None]:
# Gather the ANOVA results and the location (chromosome, position) of each SNP
# into a single dataframe, one row per SNP.
df = (
    pd.DataFrame({
        'fvalues': fvalues,
        'pvalues': pvalues,
        'Chr': genotype['Chr'].values,
        'Pos': genotype['Pos'].values,
    })
    # Give the positional index a name and turn it into a regular column
    .rename_axis('SNP index')
    .reset_index()
)
df.head()

In [None]:
# Plot the results of the ANOVA test: one point per SNP, coloured by chromosome
f, ax = plt.subplots(figsize=(10, 10))
# Logarithmic y axis so that very small p-values remain distinguishable
ax.set(yscale="log")
# `df` already contains the "SNP index" column, so it is passed as-is
# (resetting the index again would only copy the frame and add an unused
# "index" column). The plot is bound explicitly to `ax` instead of relying
# on the implicit current axes.
sns.scatterplot(x="SNP index", y="pvalues", data=df, hue="Chr", ax=ax)
# Flip the axis so that the smallest p-values appear at the top
ax.invert_yaxis()
# Reference line at p = 0.05. NOTE: this is the plain per-test level; it is
# not adjusted for the number of SNPs tested.
ax.axhline(0.05, ls='--', c='red')