In [None]:
import itertools

import pandas as pd
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Load the genotype table (tab-separated, first column used as the index),
# keep only the columns after the first ten, and turn the "N" code into NaN.
hmp = (
    pd.read_csv("test_hapmap.txt", sep="\t", index_col=0)
    .iloc[:, 10:]  # HapMap files come with 10 unused columns
    .replace("N", np.nan)
)

# Load the trait table the same way and display its column names.
trait = pd.read_csv("test_trait.txt", sep="\t", index_col=0)
trait.columns

# 1. OLS GWAS

Currently only implemented for a single SNP, but the results match the TASSEL GLM output for the tested SNP.\
This is also likely to be too slow to calculate the test statistics for thousands of SNPs.

In [None]:
# Pair the calls of one SNP with one trait column in a single dataframe.
# "N" (missing call) is mapped to NaN first so that dropna() below removes
# every row lacking either a genotype or a trait value.
snp_calls = hmp.loc["SNP_Name"].replace("N", np.nan)
data = pd.DataFrame(snp_calls).join(trait["Trait_Name"])
data.columns = ["SNP", "trait"]
data = data.dropna()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the genotype strings as ordinal numbers so they can be used as a
# regressor; the encoder expects a 2-D input, hence the double brackets.
ordinalenc = OrdinalEncoder()
data["SNP"] = ordinalenc.fit_transform(data[["SNP"]])
data = data.dropna()

# add_constant returns a two-column design matrix (intercept + SNP). It must be
# kept as its own object: assigning it back to the single "SNP" column either
# fails or loses one of the two columns, so the model would not be
# "trait ~ intercept + SNP".
exog = sm.add_constant(data["SNP"])
model = sm.OLS(data["trait"], exog).fit()
model.summary()

# 2. Calculate Distance Matrix

https://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/DistanceMatrix/DistanceMatrix \
https://davetang.org/muse/2015/07/24/dna-sequencing-data/ \
TASSEL calculates distance as 1 - IBS (identity by state) similarity, with IBS defined as the probability that alleles drawn at random from two individuals at the same locus are the same. For clustering, the distance of an individual from itself is set to 0.

The calculation is based on the definition. For a bi-allelic locus with alleles A and B, probabilityIBS(AA,AA) = 1, pIBS(AA,BB) = 0, pIBS(AB, xx) = 0.5, where xx is any other genotype. For two taxa, pIBS is averaged over all non-missing loci. Distance is 1 - pIBS. The kinship calculation is related but different and is described in Endelman and Jannink (2012) Shrinkage Estimation of the Realized Relationship Matrix. G3 2:1405-1413, using the non-shrunk version under the assumption that generally, number of markers > number of individuals.

Below is a Python implementation of the IBS calculation using numpy vectorization. It assumes the hapmap file has only mono-allelic sites and only works for standard homozygous ("G", "C", "A", "T") and missing ("N") alleles.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# https://towardsdatascience.com/how-to-vectorize-pairwise-dis-similarity-metrics-5d522715fb4e
hmp = pd.read_csv("test_hapmap.txt", sep="\t", index_col=0)
hmp = hmp.iloc[:,10:]

# One row per column of hmp (a taxon), one column per locus.
X = np.array(hmp.T)

# For each allele build a 0/1 indicator matrix (taxon x locus). The product
# indicator @ indicator.T counts, for every pair of taxa, the loci where both
# carry that allele. Summing over the four alleles gives the number of matching
# loci without ever materialising a taxon x taxon x locus array.
# Any call other than "G", "C", "A" or "T" (e.g. "N") matches no indicator and
# is therefore treated as missing.
count_match = np.zeros((len(X), len(X)))
called = np.zeros(X.shape)
for allele in ("G", "C", "A", "T"):
    has_allele = (X == allele).astype(float)
    called += has_allele
    count_match += has_allele @ has_allele.T

# Number of loci called in both taxa of each pair.
count_loc = called @ called.T

# 1 - IBS (IBS = matching / non-missing loci). Pairs that share no called locus
# give 0/0; they are reported as NaN instead of raising or warning.
with np.errstate(divide="ignore", invalid="ignore"):
    distance = 1 - count_match / count_loc
IBS = pd.DataFrame(distance, index=hmp.columns, columns=hmp.columns)
IBS

In [None]:
# This alternative way of counting non-missing loci technically works, but
# adding the two broadcast numpy arrays is very memory-inefficient
# (it required about 80 GB of RAM with just 13k SNPs in the hapmap file).
# Helper used to count non-missing loci.
def N_in(x):
    """Return how many items of ``x`` do not contain the character 'N'."""
    return sum(1 for s in x if 'N' not in s)

# Build X from hmp itself (one row per taxon, missing calls spelled "N") so the
# cell does not depend on whatever X an earlier cell left behind: re-transposing
# an already transposed array, or concatenating NaN with a string, would break
# the computation below.
X = np.array(hmp.fillna("N").T)
# Concatenate the calls of every pair of taxa locus by locus (e.g. "A" + "N"
# -> "AN"); this is the taxon x taxon x locus intermediate that makes the
# approach so memory hungry.
count_loc = X[:, None, :] + X[None, :, :]
# A locus is non-missing for a pair when its concatenated call contains no "N".
count_loc = np.apply_along_axis(N_in, -1, count_loc)

In [None]:
# Reference implementation of 1 - IBS with explicit loops: it works, but it is
# slow because every pair of taxa is compared in Python.
hmp = pd.read_csv("test_hapmap.txt", sep="\t", index_col=0)
hmp = hmp.iloc[:,10:]
hmp = hmp.replace("N", np.nan)
IBS  = pd.DataFrame(columns = hmp.columns, index=hmp.columns)
# Transposing is loop-invariant, so do it once; each entry is (taxon, calls).
taxa_rows = list(hmp.T.iterrows())
for pos, (ix1, row1) in enumerate(taxa_rows):
    # The distance is symmetric: compute each unordered pair once and mirror it.
    for ix2, row2 in taxa_rows[pos:]:
        # The element-wise sum is NaN wherever either call is missing, so
        # count() is the number of loci called in both taxa.
        count_loc = (row1+row2).count()
        # NaN never compares equal, so missing calls are not counted as matches.
        count_match = (row1==row2).sum()
        distance = 1-count_match/count_loc
        IBS.loc[ix1,ix2] = distance
        IBS.loc[ix2,ix1] = distance