Author: Dan Shea  
Date: 2019.10.03
#### Re-analyze N17, N18, and N09
This time, instead of performing $\chi^{2}$ testing, we look only for cases where pairs of loci are distributed entirely in coupling or entirely in repulsion, with nothing in between. Such pairs reject the $H_{0}$ of a $\chi^{2}$ test by construction, so we forgo the test to save computation time.

In [1]:
import pandas as pd
import numpy as np
import os.path
import os
import re
from collections import OrderedDict
import gzip
from IPython.display import display

In [2]:
# Sample identifiers; each sample has one imputed, filtered VCF in the working directory.
samples = ['N01', 'N03', 'N04', 'N05', 'N06', 'N07', 'N08', 'N09', 'N10', 'N11',
           'N12', 'N13', 'N14', 'N16', 'N17', 'N18', 'N19', 'N20', 'N21', 'N22']
# Derive the file names from the sample IDs so the two lists cannot drift out of
# sync (zip() below would otherwise silently truncate to the shorter list).
filenames = ['{}_imputed_SNP_filtered_skip10000.vcf.gz'.format(s) for s in samples]

# Load each VCF into a DataFrame keyed by sample ID, preserving sample order.
# comment='#' skips the VCF header lines; header=None leaves the columns
# labelled with integer positions, which the following cells rely on.
dfs = OrderedDict()
for key, f in zip(samples, filenames):
    with gzip.open(f, 'rt') as fh:
        dfs[key] = pd.read_csv(fh, sep='\t', header=None, comment='#')

In [3]:
# Positional columns that the analysis never reads. In the standard VCF layout
# these would be ID, QUAL, FILTER, INFO and FORMAT -- TODO confirm against the files.
unused_columns = [2, 5, 6, 7, 8]
for key in samples:
    frame = dfs[key]
    # Mutate in place so the object stored in dfs is the one that changes.
    frame.drop(columns=unused_columns, inplace=True)

In [4]:
# Human-readable labels for the remaining fixed columns: four site fields
# followed by the two columns labelled HITOMEBORE and FOUNDER.
fixed_column_names = {
    0: 'CHROM',
    1: 'POS',
    3: 'REF',
    4: 'ALT',
    9: 'HITOMEBORE',
    10: 'FOUNDER',
}
for key in samples:
    frame = dfs[key]
    frame.rename(columns=fixed_column_names, inplace=True)

In [5]:
# Every column after the first six is relabelled RIL_0, RIL_1, ... in
# left-to-right order.
for key in samples:
    ril_columns = dfs[key].columns[6:]
    ril_names = {
        old_label: 'RIL_{}'.format(number)
        for number, old_label in enumerate(ril_columns)
    }
    dfs[key].rename(columns=ril_names, inplace=True)

In [6]:
# Unphased biallelic genotype ("0/0", "0/1", "1/0" or "1/1") anchored at the
# start of the field; compiled once because it is applied to every cell.
_GT_PATTERN = re.compile(r'([01]/[01])')


def parse_gt(x):
    """Extract the leading genotype call from a VCF sample field.

    Parameters
    ----------
    x : object
        Raw sample field, e.g. ``'0/1:12,3:15'``.

    Returns
    -------
    str or float
        The matched ``a/b`` genotype string, or ``np.nan`` when the field does
        not start with one (for example ``'./.'``) or is not a string at all
        (for example a missing value read as a float NaN).
    """
    # re.match raises TypeError on non-strings; treat those as missing calls.
    if not isinstance(x, str):
        return np.nan
    match = _GT_PATTERN.match(x)
    if match is None:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return match.group()

In [7]:
# Reduce every genotype field (column 4 onwards) to its bare genotype call.
for key in samples:
    # The first four columns (CHROM, POS, REF, ALT) are kept untouched.
    site_info = dfs[key].iloc[:, 0:4]
    # Build a fresh frame instead of assigning column-by-column into a slice of
    # dfs[key]; writing into such a slice triggers SettingWithCopyWarning.
    genotypes = dfs[key].iloc[:, 4:].apply(lambda col: col.map(parse_gt))
    dfs[key] = pd.concat([site_info, genotypes], axis=1)

In [17]:
# Genotype-call -> letter tables. Which homozygote is written as 'A' depends on
# the two parental columns of the locus; heterozygotes are always 'H' and
# anything not listed here (missing calls) becomes '-'.
codes_when_ref_is_a = {'0/0': 'A', '1/1': 'B', '0/1': 'H', '1/0': 'H'}
codes_when_alt_is_a = {'0/0': 'B', '1/1': 'A', '0/1': 'H', '1/0': 'H'}

recoded = OrderedDict()
for key in samples:
    coded_rows = []
    for site in dfs[key].itertuples(index=False, name=None):
        # Positions 4 and 5 of the row are the HITOMEBORE and FOUNDER columns.
        # NOTE(review): every locus that is not exactly HITOMEBORE 0/0 and
        # FOUNDER 1/1 takes the second table, including loci with missing or
        # heterozygous parental calls -- confirm this is intended.
        if site[4] == '0/0' and site[5] == '1/1':
            table = codes_when_ref_is_a
        else:
            table = codes_when_alt_is_a
        # The parental columns themselves are recoded along with the RILs.
        coded_rows.append([table.get(call, '-') for call in site[4:]])
    recoded[key] = pd.DataFrame(coded_rows, columns=dfs[key].columns[4:])

In [18]:
def is_nohetero(r):
    """Return True when no element of ``r`` equals 'H', otherwise False.

    ``r`` is any iterable of genotype codes; an empty iterable yields True.
    """
    return not any(code == 'H' for code in r)

In [19]:
# Report, per sample, how many loci have no 'H' code in any column.
for key in samples:
    hetero_free = recoded[key].apply(is_nohetero, axis=1)
    n_loci = sum(hetero_free)
    print('{} contains {} loci with 0 heterozygous calls.'.format(key, n_loci))

N01 contains 43 loci with 0 heterozygous calls.
N03 contains 1892 loci with 0 heterozygous calls.
N04 contains 281 loci with 0 heterozygous calls.
N05 contains 123 loci with 0 heterozygous calls.
N06 contains 808 loci with 0 heterozygous calls.
N07 contains 4904 loci with 0 heterozygous calls.
N08 contains 1470 loci with 0 heterozygous calls.
N09 contains 3 loci with 0 heterozygous calls.
N10 contains 0 loci with 0 heterozygous calls.
N11 contains 3333 loci with 0 heterozygous calls.
N12 contains 1069 loci with 0 heterozygous calls.
N13 contains 4330 loci with 0 heterozygous calls.
N14 contains 13719 loci with 0 heterozygous calls.
N16 contains 15692 loci with 0 heterozygous calls.
N17 contains 205 loci with 0 heterozygous calls.
N18 contains 0 loci with 0 heterozygous calls.
N19 contains 17 loci with 0 heterozygous calls.
N20 contains 6 loci with 0 heterozygous calls.
N21 contains 0 loci with 0 heterozygous calls.
N22 contains 0 loci with 0 heterozygous calls.


In [20]:
# For each sample, keep only the loci without any 'H' code, re-attach their
# site columns by row index, and write the result as a tab-separated file.
output_dfs = OrderedDict()
for key in samples:
    site_info = dfs[key].iloc[:, 0:4]
    coded = recoded[key]
    hetero_free = coded[coded.apply(is_nohetero, axis=1)]
    # Inner join on the index drops the site rows whose locus was filtered out.
    output_dfs[key] = pd.merge(
        site_info,
        hetero_free,
        how='inner',
        left_index=True,
        right_index=True,
    )
    out_path = '{}_homozygous_loci.tsv.gz'.format(key)
    output_dfs[key].to_csv(out_path, sep='\t', index=False)

In [24]:
# List the chromosomes carrying retained loci, but only for samples whose
# retained loci span more than one chromosome.
for key in samples:
    chromosomes = sorted(set(output_dfs[key]['CHROM']))
    if len(chromosomes) <= 1:
        continue
    print('{} contains purely homozygous loci on {}.'.format(key, chromosomes))

N03 contains purely homozygous loci on ['chr01', 'chr03', 'chr04', 'chr05', 'chr06', 'chr07', 'chr08', 'chr09', 'chr10', 'chr11', 'chr12'].
N04 contains purely homozygous loci on ['chr04', 'chr05', 'chr08', 'chr10', 'chr12'].
N05 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr07', 'chr08', 'chr09', 'chr10'].
N06 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr06', 'chr07', 'chr08', 'chr09', 'chr11'].
N07 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr05', 'chr06', 'chr07', 'chr08', 'chr09', 'chr10', 'chr11', 'chr12'].
N08 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr05', 'chr06', 'chr07', 'chr08', 'chr09', 'chr10', 'chr11', 'chr12'].
N11 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr05', 'chr06', 'chr07', 'chr08', 'chr09', 'chr10', 'chr11', 'chr12'].
N12 contains purely homozygous loci on ['chr01', 'chr02', 'chr03', 'chr04', 'chr05', 'chr06'

In [59]:
# Samples whose retained loci span more than one chromosome.
candidates = ['N03', 'N04', 'N05', 'N06', 'N07', 'N08', 'N11', 'N12', 'N13', 'N14', 'N16', 'N17', 'N19']
# Genotype-code pairs that disqualify a pair of loci.
# NOTE(review): only AB/BA pairs are rejected, so this keeps pairs with no
# A-with-B combination; a pair made up solely of AB/BA is not detected --
# confirm this matches the intended "coupling or repulsion" definition.
discordant = {('A', 'B'), ('B', 'A')}

results_df = OrderedDict()
for key in candidates:
    print('Started processing {}'.format(key))
    # Materialise the rows once as plain tuples: doing scalar/row .iloc lookups
    # inside the O(n^2) pair loop is far slower than tuple indexing.
    rows = list(output_dfs[key].itertuples(index=False, name=None))
    n_rows = len(rows)
    kept_pairs = list()
    for i in range(n_rows - 1):
        row_i = rows[i]
        chrom_i = row_i[0]
        # Column 6 onwards holds the RIL codes (columns 0-5 are site and parents).
        calls_i = row_i[6:]
        for j in range(i + 1, n_rows):
            row_j = rows[j]
            # Only loci on different chromosomes are compared.
            if chrom_i == row_j[0]:
                continue
            # Skip the pair as soon as one RIL shows an AB or BA combination.
            if any(pair in discordant for pair in zip(calls_i, row_j[6:])):
                continue
            # Output row: all columns of locus i followed by all columns of locus j.
            kept_pairs.append(list(row_i) + list(row_j))
    results_df[key] = pd.DataFrame(kept_pairs)
    print('Finished processing {}'.format(key))

Started processing N03
Finished processing N03
Started processing N04
Finished processing N04
Started processing N05
Finished processing N05
Started processing N06
Finished processing N06
Started processing N07
Finished processing N07
Started processing N08
Finished processing N08
Started processing N11
Finished processing N11
Started processing N12
Finished processing N12
Started processing N13
Finished processing N13
Started processing N14
Finished processing N14
Started processing N16
Finished processing N16
Started processing N17
Finished processing N17
Started processing N19
Finished processing N19


In [60]:
# Write each candidate's locus pairs as a headerless, index-free TSV.
for key in candidates:
    out_path = '{}_nohetero_CouplingAnalyzer.tsv.gz'.format(key)
    pair_table = results_df[key]
    pair_table.to_csv(out_path, sep='\t', header=False, index=False)