Author: Dan Shea  
Date: 2019.10.03
#### Re-analyze N17, N18, and N09
This time instead of performing $\chi^{2}$ testing, we are only looking for cases where pairs of loci are distributed in either coupling or repulsion, with nothing in between. By definition, they will be able to reject the $H_{0}$ of a $\chi^{2}$ test, so we forgo the test to save on computation time.

In [1]:
import pandas as pd
import numpy as np
import os.path
import os
import re
from collections import OrderedDict
import gzip
from IPython.display import display

In [2]:
# Sample identifiers; one gzipped, filtered VCF exists per sample.
samples = [
    'N01', 'N03', 'N04', 'N05', 'N06', 'N07', 'N08', 'N09', 'N10', 'N11',
    'N12', 'N13', 'N14', 'N16', 'N17', 'N18', 'N19', 'N20', 'N21', 'N22',
]
# Every file name follows the same pattern, so build the list from the IDs.
filenames = ['{}_imputed_SNP_filtered_skip10000.vcf.gz'.format(sample) for sample in samples]

# Load each VCF into a DataFrame keyed by sample ID, preserving sample order.
dfs = OrderedDict()
for sample, vcf_path in zip(samples, filenames):
    with gzip.open(vcf_path, 'rt') as handle:
        # Lines starting with '#' are skipped and no header row is read, so the
        # resulting columns are labelled by integer position.
        dfs[sample] = pd.read_csv(handle, sep='\t', header=None, comment='#')

In [3]:
# Positional columns 2, 5, 6, 7 and 8 are not needed downstream (in the
# standard VCF column order these are ID, QUAL, FILTER, INFO and FORMAT).
unused_columns = [2, 5, 6, 7, 8]
for key in samples:
    dfs[key] = dfs[key].drop(columns=unused_columns)

In [4]:
# Give the remaining positional columns readable names; the same mapping is
# applied to every sample's table.
vcf_column_names = {
    0: 'CHROM',
    1: 'POS',
    3: 'REF',
    4: 'ALT',
    9: 'HITOMEBORE',
    10: 'FOUNDER',
}
for key in samples:
    dfs[key].rename(columns=vcf_column_names, inplace=True)

In [5]:
# Every column after the first six is relabelled RIL_0, RIL_1, ... in order.
for key in samples:
    ril_columns = dfs[key].columns[6:]
    ril_names = {old: 'RIL_{}'.format(idx) for idx, old in enumerate(ril_columns)}
    dfs[key].rename(columns=ril_names, inplace=True)

In [6]:
def parse_gt(x):
    """Extract the leading unphased biallelic genotype call from a sample field.

    Parameters
    ----------
    x : str
        Sample field whose text begins with the genotype, e.g. ``'0/1:...'``.

    Returns
    -------
    str or float
        The leading ``'0/0'``, ``'0/1'``, ``'1/0'`` or ``'1/1'`` call, or
        ``np.nan`` when the field does not start with such a call (for
        example ``'./.'``) or is not a string at all.
    """
    # Non-string cells (e.g. NaN from an empty field) cannot be pattern-matched;
    # treat them as "no call" instead of letting re.match raise TypeError.
    if not isinstance(x, str):
        return np.nan
    match = re.match('([01]/[01])', x)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return match.group() if match is not None else np.nan

In [7]:
# Reduce every genotype field (columns from position 4 onward) to its bare
# call via parse_gt, leaving the first four columns untouched.
for key in samples:
    site_info = dfs[key].iloc[:, 0:4]
    # Take an explicit copy: assigning columns on a bare .iloc slice triggers
    # pandas' SettingWithCopyWarning because the slice may alias the parent.
    genotypes = dfs[key].iloc[:, 4:].copy()
    for c in genotypes.columns:
        genotypes[c] = genotypes[c].apply(parse_gt)
    dfs[key] = pd.concat([site_info, genotypes], axis=1)

In [8]:
# Numeric codes for homozygous calls. Anything else (heterozygous calls,
# missing calls) is coded 0.5 through the .get() default below.
# Used when HITOMEBORE is '0/0' and FOUNDER is '1/1'.
CODES_REF_FIRST = {'0/0': 0, '1/1': 1}
# Used for every other combination of the two parental calls.
CODES_ALT_FIRST = {'0/0': 1, '1/1': 0}

recoded = OrderedDict()
for key in samples:
    coded_rows = []
    for site in dfs[key].itertuples(index=False, name=None):
        # Positions 4 and 5 of each row are the HITOMEBORE and FOUNDER calls;
        # they choose the table applied to the whole row (parents included).
        if site[4] == '0/0' and site[5] == '1/1':
            codes = CODES_REF_FIRST
        else:
            codes = CODES_ALT_FIRST
        coded_rows.append([codes.get(call, 0.5) for call in site[4:]])
    recoded[key] = pd.DataFrame(coded_rows, columns=dfs[key].columns[4:])

In [9]:
# Re-attach the first four site columns to the numeric genotype codes,
# matching rows by index.
output_dfs = OrderedDict(
    (
        key,
        pd.merge(
            dfs[key].iloc[:, 0:4],
            recoded[key],
            how='inner',
            left_index=True,
            right_index=True,
        ),
    )
    for key in samples
)

#### Application of Fuzzy Logic to indeterminate matching
We want to match when the genotype at each locus is the same homozygous call (_i.e._ – $AA$ or $BB$).  
But we also want to ignore comparisons where one of the loci had no call (_e.g._ – $A-$) and cases where one of the loci is called to be heterozygous (_e.g._ – $AH$).  
To accomplish this, we can vectorize the genotypes as float encodings where:

| Genotype | Value |
|----------|-------|
| A | 0.0 |
| H | 0.5 |
| – | 0.5 |
| B | 1.0 |

We now have two vectors of the genotype calls of each sample at two positions $\vec{I}$ and $\vec{J}$.  
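To test equality, compute $\vec{K}=\vec{I}-\vec{J}$ and check that every component is zero, _i.e._ $\sum^{n}_{i=1}|\vec{K}_{i}|=0$ (absolute values are needed because a plain sum would let a $+1$ cancel a $-1$).  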
However, since we wish to ignore instances where heterozygous calls or no calls occur, we instead require, given $\vec{K}=\vec{I}-\vec{J}$, that $\min(\vec{K})\neq-1.0$ and $\max(\vec{K})\neq1.0$, because the values $-1.0$ and $1.0$ only occur where an $AB$ or $BA$ pairing is present between $\vec{I}$ and $\vec{J}$.

In [None]:
# Collect every pair of loci on different chromosomes whose genotype codes
# never disagree as opposite homozygotes (0 vs 1) in any column from
# position 6 onward. Each result row is locus i's full row followed by
# locus j's full row.
# NOTE(review): only the "no opposite-homozygote" (coupling) condition is
# tested here; pairs in pure repulsion are not collected - confirm intent.
results_df = OrderedDict()
for key in ['N01']:
    print('Started processing {}'.format(key))
    table = output_dfs[key]
    # Pull the data out of pandas once. Per-cell .iloc look-ups inside the
    # pairwise loop are loop-invariant work and dominated the run time.
    chroms = table.iloc[:, 0].to_numpy()
    calls = table.iloc[:, 6:].to_numpy(dtype=float)
    rows = table.to_numpy(dtype=object).tolist()
    matches = list()
    for i in range(0, len(rows) - 1):
        # Compare locus i against all later loci in one broadcast subtraction;
        # row k of `diff` corresponds to locus j = i + 1 + k.
        diff = calls[i] - calls[i + 1:]
        # A difference of exactly +/-1.0 can only come from a 0-vs-1 pairing;
        # 0.5 codes yield at most +/-0.5 and so never disqualify a pair.
        conflict = (np.abs(diff) == 1.0).any(axis=1)
        keep = (chroms[i + 1:] != chroms[i]) & ~conflict
        # flatnonzero yields ascending offsets, so pairs keep (i, j) order.
        for offset in np.flatnonzero(keep):
            matches.append(rows[i] + rows[i + 1 + offset])
    results_df[key] = pd.DataFrame(matches)
    print('Finished processing {}'.format(key))

Started processing N01


In [None]:
# NOTE(review): `candidates` is not defined in the visible part of this
# notebook; unless it is defined elsewhere this cell raises NameError.
for key in candidates:
    # The expression below does not depend on `key` and its value is discarded
    # inside the loop, so nothing is displayed or stored - presumably unfinished.
    results_df['N01']