Author: Dan Shea  
Date: 2019.08.13
# $\chi^{2}$ goodness of fit for multiple categories using a contingency table
Here, we will make use of the scipy.stats function `chi2_contingency`. This is a $\chi^{2}$ test of independence of variables in a contingency table. Previously, we performed a $\chi^{2}$ goodness-of-fit test on allele frequences observed in the $F_{7}$ (RIL) populations of crosses derived from a common parent Hitomebore (ひとめぼれ) and one of our 20 founder lines.

| Sample | Founder  | Sample | Founder       |
|--------|----------|--------|---------------|
| N01 | Kasalath    | N12 | Urasan           |
| N03 | Keiboba     | N13 | Tupa 729         |
| N04 | Shoni       | N14 | Dee Jiao Hua Luo |
| N05 | Tupa 121-3  | N16 | Nerica 1         |
| N06 | Surjamukhi  | N17 | Takanari         |
| N07 | Ratul       | N18 | C8005            |
| N08 | Badari Dhan | N19 | Moukotou         |
| N09 | Kaluheenati | N20 | Nortai           |
| N10 | Jaguary     | N21 | Sesia            |
| N11 | Rexmont     | N22 | Hayayuki         |

This test allows us to perform analysis between the founder crosses, examining SNP loci that are shared between all of them.
We may then calculate the allele frequencies and construct a contingency table which will be used by `chi2_contingency` to test for independence of founders with respect to allele frequency.

In [1]:
import pandas as pd
from scipy import stats
import numpy as np
from statsmodels.stats import multitest
import os
import os.path
from collections import OrderedDict

In [2]:
samples = ['N01','N03','N04','N05','N06','N07','N08','N09','N10','N11',
           'N12','N13','N14','N16','N17','N18','N19','N20','N21','N22',]
founders = ['KASALATH','KEIBOBA','SHONI','TUPA_121-3','SURJAMUKHI','RATUL','BADARI_DHAN','KALUHEENATI','JAGUARY','REXMONT',
            'URASAN','TUPA_729','DEE_JIAO_HUA_LUO','NERICA_1','TAKANARI','C8005','MOUKOTOU','NORTAI','SESIA','HAYAYUKI',]
datadirs = ['_'.join([x, y]) for x, y in zip(samples, founders)]

In [3]:
datadirs

['N01_KASALATH',
 'N03_KEIBOBA',
 'N04_SHONI',
 'N05_TUPA_121-3',
 'N06_SURJAMUKHI',
 'N07_RATUL',
 'N08_BADARI_DHAN',
 'N09_KALUHEENATI',
 'N10_JAGUARY',
 'N11_REXMONT',
 'N12_URASAN',
 'N13_TUPA_729',
 'N14_DEE_JIAO_HUA_LUO',
 'N16_NERICA_1',
 'N17_TAKANARI',
 'N18_C8005',
 'N19_MOUKOTOU',
 'N20_NORTAI',
 'N21_SESIA',
 'N22_HAYAYUKI']

In [4]:
allele_frequency_files = [os.path.join('beagle_output', x, y+'_allele_frequencies.tsv') for x, y in zip(datadirs, samples)]
genotype_files = [os.path.join('beagle_output', x, y+'_genotypes.tsv') for x, y in zip(datadirs, samples)]

In [5]:
allele_frequency_files

['beagle_output/N01_KASALATH/N01_allele_frequencies.tsv',
 'beagle_output/N03_KEIBOBA/N03_allele_frequencies.tsv',
 'beagle_output/N04_SHONI/N04_allele_frequencies.tsv',
 'beagle_output/N05_TUPA_121-3/N05_allele_frequencies.tsv',
 'beagle_output/N06_SURJAMUKHI/N06_allele_frequencies.tsv',
 'beagle_output/N07_RATUL/N07_allele_frequencies.tsv',
 'beagle_output/N08_BADARI_DHAN/N08_allele_frequencies.tsv',
 'beagle_output/N09_KALUHEENATI/N09_allele_frequencies.tsv',
 'beagle_output/N10_JAGUARY/N10_allele_frequencies.tsv',
 'beagle_output/N11_REXMONT/N11_allele_frequencies.tsv',
 'beagle_output/N12_URASAN/N12_allele_frequencies.tsv',
 'beagle_output/N13_TUPA_729/N13_allele_frequencies.tsv',
 'beagle_output/N14_DEE_JIAO_HUA_LUO/N14_allele_frequencies.tsv',
 'beagle_output/N16_NERICA_1/N16_allele_frequencies.tsv',
 'beagle_output/N17_TAKANARI/N17_allele_frequencies.tsv',
 'beagle_output/N18_C8005/N18_allele_frequencies.tsv',
 'beagle_output/N19_MOUKOTOU/N19_allele_frequencies.tsv',
 'beagle_o

In [6]:
genotype_files

['beagle_output/N01_KASALATH/N01_genotypes.tsv',
 'beagle_output/N03_KEIBOBA/N03_genotypes.tsv',
 'beagle_output/N04_SHONI/N04_genotypes.tsv',
 'beagle_output/N05_TUPA_121-3/N05_genotypes.tsv',
 'beagle_output/N06_SURJAMUKHI/N06_genotypes.tsv',
 'beagle_output/N07_RATUL/N07_genotypes.tsv',
 'beagle_output/N08_BADARI_DHAN/N08_genotypes.tsv',
 'beagle_output/N09_KALUHEENATI/N09_genotypes.tsv',
 'beagle_output/N10_JAGUARY/N10_genotypes.tsv',
 'beagle_output/N11_REXMONT/N11_genotypes.tsv',
 'beagle_output/N12_URASAN/N12_genotypes.tsv',
 'beagle_output/N13_TUPA_729/N13_genotypes.tsv',
 'beagle_output/N14_DEE_JIAO_HUA_LUO/N14_genotypes.tsv',
 'beagle_output/N16_NERICA_1/N16_genotypes.tsv',
 'beagle_output/N17_TAKANARI/N17_genotypes.tsv',
 'beagle_output/N18_C8005/N18_genotypes.tsv',
 'beagle_output/N19_MOUKOTOU/N19_genotypes.tsv',
 'beagle_output/N20_NORTAI/N20_genotypes.tsv',
 'beagle_output/N21_SESIA/N21_genotypes.tsv',
 'beagle_output/N22_HAYAYUKI/N22_genotypes.tsv']

In [7]:
frequency_dfs = OrderedDict()
for key, value in zip(samples, allele_frequency_files):
    frequency_dfs[key] = pd.read_csv(value, sep='\t', index_col=0)
    
genotype_dfs = OrderedDict()
for key, value in zip(samples, genotype_files):
    genotype_dfs[key] = pd.read_csv(value, sep='\t', index_col=0)

  mask |= (ar1 == a)


In [8]:
data_dfs = OrderedDict()
for key in samples:
    data_dfs[key] = pd.concat([genotype_dfs[key], frequency_dfs[key]], axis=1)

In [9]:
data_dfs['N01']

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,N00_HITOMEBORE,...,N01_170,N01_171,N01_172,N01_173,Homozygous A,Homozygous B,statistic,pvalue,qvalue,significant
0,chr01,1411,.,A,G,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
1,chr01,1465,.,G,A,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
2,chr01,1573,.,C,T,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
3,chr01,1708,.,C,T,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
4,chr01,1729,.,A,G,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
5,chr01,1733,.,C,A,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
6,chr01,1751,.,T,A,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
7,chr01,1948,.,C,T,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
8,chr01,1953,.,G,A,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True
9,chr01,2004,.,G,A,.,PASS,.,GT,A,...,H,A,A,A,62,96,7.316456,0.006833,0.035956,True


In [10]:
# We no longer require these dfs, so let's delete them to free up some memory
del(frequency_dfs)
del(genotype_dfs)

In [11]:
# Perform initial merge on the first two samples, then every other sample is merged to those results iteratively
# This gives us the SNPs present amongst all founders.
merged_data = pd.merge(data_dfs['N01'], data_dfs['N03'], how='inner', on=['CHROM', 'POS'])
for key in samples[2:]:
    merged_data = pd.merge(merged_data, data_dfs[key], how='inner', on=['CHROM', 'POS'])

In [21]:
merged_data

Unnamed: 0,CHROM,POS,ID_x,REF_x,ALT_x,QUAL_x,FILTER_x,INFO_x,FORMAT_x,N00_HITOMEBORE_x,...,N22_245,N22_248,N22_249,N22_250,Homozygous A_y,Homozygous B_y,statistic_y,pvalue_y,qvalue_y,significant_y
0,chr01,6110684,.,G,A,.,PASS,.,GT,A,...,B,A,H,B,111,116,0.110132,0.739994,0.976983,False
1,chr01,6604804,.,G,A,.,PASS,.,GT,A,...,B,A,H,B,114,115,0.004367,0.947313,0.976983,False
2,chr01,6824277,.,A,C,.,PASS,.,GT,A,...,B,A,H,B,115,114,0.004367,0.947313,0.976983,False
3,chr01,6951924,.,A,G,.,PASS,.,GT,A,...,B,A,H,B,114,116,0.017391,0.895082,0.976983,False
4,chr01,6963610,.,A,G,.,PASS,.,GT,A,...,B,A,H,B,114,116,0.017391,0.895082,0.976983,False
5,chr01,7006014,.,A,T,.,PASS,.,GT,A,...,B,A,H,B,111,117,0.157895,0.691102,0.976983,False
6,chr01,8036400,.,T,C,.,PASS,.,GT,A,...,A,A,H,B,115,121,0.152542,0.696118,0.976983,False
7,chr01,8496054,.,T,C,.,PASS,.,GT,A,...,A,A,B,B,110,119,0.353712,0.552019,0.970019,False
8,chr01,9407717,.,T,G,.,PASS,.,GT,A,...,A,A,B,B,116,115,0.004329,0.947541,0.976983,False
9,chr01,9585370,.,T,C,.,PASS,.,GT,A,...,A,A,B,B,118,116,0.017094,0.895978,0.976983,False


In [22]:
# Dump the data to a tsv file before we further prune the data
merged_data.to_csv('beagle_output/Common_SNPS_all_founders.tsv', sep='\t')    

In [25]:
# We only need the common loci, we extract the other information from our previously constructed data_dfs
merged_data = merged_data.loc[:, ['CHROM', 'POS']]

In [31]:
filtered_data_dfs = OrderedDict()
for key in samples:
    filtered_data_dfs[key] = pd.merge(merged_data, data_dfs[key], how='inner', on=['CHROM', 'POS'])

In [34]:
# We quickly sanity check this by ensuring the shapes are all 3,745 rows long
for key in filtered_data_dfs.keys():
    print('{} is {}'.format(key, filtered_data_dfs[key].shape))

N01 is (3745, 188)
N03 is (3745, 94)
N04 is (3745, 142)
N05 is (3745, 219)
N06 is (3745, 95)
N07 is (3745, 81)
N08 is (3745, 96)
N09 is (3745, 264)
N10 is (3745, 163)
N11 is (3745, 204)
N12 is (3745, 158)
N13 is (3745, 104)
N14 is (3745, 44)
N16 is (3745, 55)
N17 is (3745, 156)
N18 is (3745, 267)
N19 is (3745, 266)
N20 is (3745, 261)
N21 is (3745, 267)
N22 is (3745, 260)


A brief word about `chi2_contingency`. The function accepts a contingency table where a contingency table is defined as having a different number of observations for each population (row), but a similar proportion across each group (column). Given the similar proportions, we would expect the test to find that the groups are similar and that the variables are independent (fail to reject the null hypothesis, or $H_{0}$).

In our case, the countingency will looks something like this:

| Founder  | Homozygous A | Homozygous B |
|----------|--------------|--------------|
| Kasalath | $\sum_{i=0}^{n}A_{i}$ | $\sum_{i=0}^{n}B_{i}$ |
| $\vdots$ | $\vdots$              | $\vdots$              |

Which will simply be a list of lists in python of the counts of Homozygous for A and Homozygous for B for each of the 20 RILs:

```
[[49, 60],
 [32, 24],]
```



In [51]:
result_rows = list()
for idx in range(0, merged_data.shape[0]):
    chi2, p, dof, ex = stats.chi2_contingency([filtered_data_dfs[key].loc[idx, ['Homozygous A', 'Homozygous B']].to_numpy()
                                               for key in samples])
    result_rows.append([chi2, p])
chisquare_df = pd.DataFrame(data=result_rows, columns=['chisquare', 'pvalue'])

In [52]:
chisquare_df

Unnamed: 0,chisquare,pvalue
0,76.979980,6.129777e-09
1,68.003703,1.968493e-07
2,74.698083,1.497472e-08
3,84.105970,3.608249e-10
4,83.755060,4.154245e-10
5,83.574459,4.466461e-10
6,104.052330,9.822481e-14
7,126.765290,5.953355e-18
8,100.246453,4.832456e-13
9,100.295639,4.734312e-13


In [56]:
_tmp = multitest.fdrcorrection(chisquare_df.pvalue)
chisquare_df = pd.concat([chisquare_df, pd.DataFrame(zip(_tmp[1], _tmp[0]), columns=['qvalue', 'significant'])], axis=1)
del(_tmp)

These are the Benjamini-Hochberg corrected p-values (i.e. - qvalues) that result from testing contingency tables derived from the observed allele frequencies for all 20 RIL groups (i.e. - founders). Out qvalues indicate that at least one of the founder's is not independent of the allele frequency. Thereofre, we conduct a _post-hoc_ test by examining the frequencies of the indivudal founder groups, in relation to the sum total of the remaining 19 founder groups. We do this for each founder, and for each SNP locus.

In [57]:
chisquare_df

Unnamed: 0,chisquare,pvalue,qvalue,significant
0,76.979980,6.129777e-09,1.740411e-08,True
1,68.003703,1.968493e-07,4.578886e-07,True
2,74.698083,1.497472e-08,4.111460e-08,True
3,84.105970,3.608249e-10,1.190563e-09,True
4,83.755060,4.154245e-10,1.365904e-09,True
5,83.574459,4.466461e-10,1.462141e-09,True
6,104.052330,9.822481e-14,4.491476e-13,True
7,126.765290,5.953355e-18,3.798180e-17,True
8,100.246453,4.832456e-13,1.941797e-12,True
9,100.295639,4.734312e-13,1.912213e-12,True


In [65]:
# First obtain the sum total observed frequencies at each SNP locus by summing the observed frequencies of all founders.
_tmp_sum_A = np.zeros(merged_data.shape[0])
_tmp_sum_B = np.zeros(merged_data.shape[0])
for key in samples:
    _tmp_sum_A += filtered_data_dfs[key].loc[:, 'Homozygous A'].to_numpy()
    _tmp_sum_B += filtered_data_dfs[key].loc[:, 'Homozygous B'].to_numpy()

sum_totals = [[a, b] for a, b in zip(_tmp_sum_A, _tmp_sum_B)]
del(_tmp_sum_A)
del(_tmp_sum_B)

In [67]:
# Now perform post-hoc tests for each founder, and each SNP locus
post_hoc = OrderedDict()
for key in samples:
    _tmp_results = list()
    _tmp_obs = filtered_data_dfs[key].loc[:, ['Homozygous A', 'Homozygous B']].to_numpy()
    _tmp_obs_others = sum_totals - _tmp_obs
    _ctables = [[x, y] for x, y in zip(_tmp_obs, _tmp_obs_others)]
    for ctable in _ctables:
        chi2, p, dof, ex = stats.chi2_contingency(ctable)
        _tmp_results.append([chi2, p])
    post_hoc[key] = pd.DataFrame(_tmp_results, columns=['chisquare', 'pvalue'])

In [76]:
# Now I want to do the Benjamini-Hochberg FDR correction, but I need to correct for all loci tests in all founders.
# So we concatenate all the pvalues into a giant list and run the correction, then split it bac up into the founder lists
# And add the FDR corrected qvalues to the post-hoc dataframe for each founder
all_pvals = list()
for key in samples:
    all_pvals.extend(post_hoc[key].loc[:, 'pvalue'])
significance, qvalues = multitest.fdrcorrection(all_pvals)
for offset, key in enumerate(samples):
    start = offset*merged_data.shape[0]
    stop = start + merged_data.shape[0]
    post_hoc[key] = pd.concat([post_hoc[key],
                               pd.DataFrame(zip(qvalues[start:stop], significance[start:stop]),
                                            columns=['qvalues', 'significant'])], axis=1)
    

In [81]:
# Now, we can combine the shared loci information, with the results of the post-hoc analyses for each founder and dump the
# resulting dataframe to a tsv file.
for key, founder in zip(samples, founders):
    _tmp = '_'.join([key, founder])
    _df = pd.concat([merged_data, post_hoc[key]], axis=1)
    _df.to_csv(os.path.join('beagle_output',
                            _tmp,
                            '{}_chisquare_contigency_post_hoc_results.tsv'.format(_tmp)),
               sep='\t')
del(_tmp)
del(_df)

## Here, we examine the complete set of SNPs for each founder and determine the recombination frequency at each SNP locus.

In [103]:
high_density_by_chrom = OrderedDict()
chroms = [''.join(['chr','{0:02d}'.format(n)]) for n in range(1, 13)]
for key in samples:
    if key not in high_density_by_chrom.keys():
        high_density_by_chrom[key] = OrderedDict()
    for c in chroms:
        high_density_by_chrom[key][c] = pd.concat([data_dfs[key][data_dfs[key].CHROM==c].iloc[:,0:2],
                                                   data_dfs[key][data_dfs[key].CHROM==c].iloc[:,11:-6]], axis=1)


In [126]:
breakpoints = OrderedDict()
# for every sample
for key in samples:
    # for every chromosome in the genome
    breakpoints[key] = OrderedDict()
    for c in chroms:
        breakpoints[key][c] = list()
        for col in range(2, high_density_by_chrom[key][c].shape[1]):
            # treat the sample plant as a numpy array
            plant = high_density_by_chrom[key][c].iloc[:, col].to_numpy()
            prev='H'
            prev_idx = -1
            current_idx = 0
            for current in plant:
                if current != prev:
                    if (current == 'A' and prev == 'B') or (current == 'B' and prev == 'A'):
                        prev_c, prev_pos = high_density_by_chrom[key][c].iloc[prev_idx, 0:2].to_numpy()
                        current_c, current_pos = high_density_by_chrom[key][c].iloc[current_idx, 0:2].to_numpy()
                        breakpoints[key][c].append([c, prev_pos+(current_pos-prev_pos)/2, 
                                                    '{}{}'.format(prev,current),
                                                    high_density_by_chrom[key][c].columns[col]])
                prev = current
                prev_idx += 1
                current_idx +=1
        
        breakpoints[key][c] = pd.DataFrame(data=breakpoints[key][c], columns=['CHROM', 'POS', 'TRANSITION', 'SOURCE'])
                    

In [130]:
# Sort the breakpoint data inplace by the POS column
# Do this for all samples and all chromosomes in each sample
for key in samples:
    for c in chroms:
        breakpoints[key][c].sort_values(by=['POS'], inplace=True)

The size of rice genome breaks down as follows:

| Chromosome | Size (bp) |
|------------|-----------|
| chr01      | 43,270,923|
| chr02      | 35,937,250|
| chr03      | 36,413,819|
| chr04      | 35,502,694|
| chr05      | 29,958,434|
| chr06      | 31,248,787|
| chr07      | 29,697,621|
| chr08      | 28,443,022|
| chr09      | 23,012,720|
| chr10      | 23,207,287|
| chr11      | 29,021,106|
| chr12      | 27,531,856|

This gives us a mean chromosome length of 31,103,793.25 bp. Therefore, I have selected a `window_size` of 100 kbp for sliding window analysis, since that gives us, on average, 311 windows per chromosome, which should be sufficient resolution for plotting recombination hotspots.

In [133]:
genome_size = OrderedDict({'chr01': 43270923,
                           'chr02': 35937250,
                           'chr03': 36413819,
                           'chr04': 35502694,
                           'chr05': 29958434,
                           'chr06': 31248787,
                           'chr07': 29697621,
                           'chr08': 28443022,
                           'chr09': 23012720,
                           'chr10': 23207287,
                           'chr11': 29021106,
                           'chr12': 27531856,})

In [184]:
recombination_frequencies = OrderedDict()
window_size = 100000
for key in samples:
    recombination_frequencies[key] = OrderedDict()
    for c in chroms:
        recombination_frequencies[key][c] = list()
        num_windows = (genome_size[c]//window_size) + 1
        for offset in range(0, num_windows):
            win_start = offset * window_size
            win_stop  = win_start + window_size
            count = sum((breakpoints[key][c].POS >= win_start) & (breakpoints[key][c].POS < win_stop))
            recombination_frequencies[key][c].append([offset, win_start, win_stop, count])
        recombination_frequencies[key][c] = pd.DataFrame(data=recombination_frequencies[key][c],
                                                         columns=['Window', 'Start', 'Stop', 'Count'])

In [185]:
# Add in the size of the population for each sample
for key in samples:
    N = len(data_dfs[key].iloc[:,11:-6].columns)
    for c in chroms:
        recombination_frequencies[key][c]['N'] = N

In [186]:
# Calculate the recombination frequencies and add them to the dataframes as 'Frequency'
for key in samples:
    for c in chroms:
        recombination_frequencies[key][c]['Frequency'] = (recombination_frequencies[key][c].Count / recombination_frequencies[key][c].N)


In [187]:
# Dump the recombination frequencies for all samples as tsv files
for key, fkey in zip(samples, founders):
    output_df = pd.DataFrame()
    for c in chroms:
        # Use of .copy() method here ensures we don't alter recombination_frequencies' dataframes
        rf_df = recombination_frequencies[key][c].copy() 
        rf_df['CHROM'] = c
        rf_df = rf_df.reindex(columns=['CHROM', 'Window', 'Start', 'Stop', 'Count', 'N', 'Frequency'])
        output_df = pd.concat([output_df, rf_df], axis=0)
    output_df.to_csv('recombination_frequency/{}_{}_recombination_frequency.tsv'.format(key, fkey), sep='\t', index=False)