__Author:__ Dan Shea  
__Date:__ 2020.01.06  
__Description:__ Cross-species marker analysis between _Lilium longiflorum_ and _Lilium ???_  
The first set of GRAS-Di (Genotyping by Random Amplicon Sequencing-Direct) markers are dominant (presence / absence).  
The second set of GRAS-Di markers are co-dominant (A / B / H).  

To combine the dominant markers and the co-dominant markers for analysis, we need to re-code the dominant markers as follows:

| Original coding | Meaning | New coding | Reason |
|-----------------|---------|------------|--------|
| H | Presence | H | Heterozygous |
| A | Absence | A for C2 marker | Homozygous for C1 (Parent A) |
| A | Absence | B for C1 marker | Homozygous for C2 (Parent B) |


In [1]:
import pandas as pd

In [2]:
# Read the dominant marker table into a pandas dataframe
dominant_infile = 'GRAS-Di_RESULT.csv'
dominant_df = pd.read_csv(dominant_infile)
# Read the co-dominant marker table into a pandas dataframe
codominant_infile = 'codominant.csv'
codominant_df = pd.read_csv(codominant_infile)

In [3]:
# Take a look at the first 10 rows of the dominant marker results
dominant_df[0:10]

Unnamed: 0,PARENT,MARKER,SEQ1,SEQ2,LEN,QUALITY,AK1P1,AK2P1,PU1P2,PU2P2,...,L136.1,L137.1,L138.1,L139.1,L140.1,Unnamed: 267,GD,H,A,Avr
0,C2,AMP0000011,AAAAAAACACAGTATTTCAAACAGAAAGTTCATTTTGTTTTCAGAT...,AAGCAAAAGAGTACAAAATTCACCCTTCATAAAGATTACTATCTGA...,86,D,0.0,0.0,43.6,39.9,...,H,H,H,H,A,,HHHAAHAHAHHHHHHHHHHHHHAHHAHHHHHHAHAHHHHHHHHHHH...,99,27,57.989389
1,C2,AMP0000018,AAAAAAACTATCCTCAAATAGCAGAAATGTGCGCGTATTCTAAAAT...,ATATTGCGCGCATTCATTACCCAGTATATGCGTATTACAGTAAAAA...,104,D,0.0,0.0,44.7,8.2,...,A,H,A,A,H,,HHHHHAHHAHHAHHHHAAAHHHHHHHHAHAHHHHHHHHHAHHHHHH...,98,28,36.543579
2,C1,AMP0000020,AAAAAAAGAACTGAAAAAATCTAGCACAAAAACCTACATCCTCCAT...,TGGAGAGCAAGGTGATCCAACCAAATGAGTTTTGTATGGGAAGACC...,114,D,54.4,17.5,0.0,0.0,...,H,H,H,H,H,,HHHHHAHHHAHHHHHAHHAHHHAHHAHHAAHHHAHHHAHHAAHHHA...,96,30,84.466562
3,C2,AMP0000037,AAAAAAATGAGACATACATGGTACTCTGAATTCTCCGACATCCTAT...,GTACGGTAGGGCTATAGGATGTCGGAGAATTCAGAGTACCATGTAT...,59,D,0.0,0.0,26.8,19.9,...,H,H,H,H,H,,HAHAHHHHHHAHHHHHHHHHHAHAHAHHHHAAHHHHAHHHHHAHHH...,96,30,39.390339
4,C1,AMP0000052,AAAAAACCATTTAATAAGTAAATCTAAGTACCAAACTCAGGGATAG...,GAAGGAGGATATAAATCTAAGGGTCTTTTCAATTACTACATAGGAG...,111,D,97.4,135.3,0.0,0.0,...,H,H,H,H,H,,HHHHHHHHHHHHHHHHHAHAHAHAHAHHAHHHHHAAHHHHHAHAHH...,100,26,82.165053
5,C2,AMP0000104,AAAAAAGTTATTGCTAGTTTGCCTATTAAAAGTTTTTATTTGGCTG...,AGAAGCAATAACTTTCAGTAAGATCAAAACCTTCAGCAAAAACATA...,114,E,0.0,0.0,157.5,275.7,...,H,H,A,H,A,,HHHHHHHHAHHAHAHAHAHHHHHHAAHAHHAHHHHAHAHAAAHHHH...,90,36,161.584213
6,C2,AMP0000111,AAAAAATAGCTATCATGTTACAAAGGATGAGATGTATTAAAGAGAA...,TGAGCATTCTCTTTAATACATCTCATCCTTTGTAACATGATAGCTA...,52,E,0.0,0.0,92.7,117.3,...,A,A,A,A,A,,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,0,126,0.0
7,C1,AMP0000116,AAAAAATCACAAAATAGGTAAATCTAAGTACCTAACTCAAGAATAA...,TGGTGAGAAGGAGGATGCCGATGTGAGGGTCCTTCTGGCTATCGTA...,117,D,12.5,7.5,0.0,0.0,...,H,H,H,H,H,,HAAHAHHHHHHHHHAHHAHHHHAHAAAHAAHAHHHAHHHAHAHAAH...,87,39,18.908637
8,C1,AMP0000125,AAAAAATTACTACATAATATTAAGAATTAAAAATAATAGTGAATAT...,TGCATGTGCAGTTTATTTTAGTTATTAAAGGTGGTTTGACACCCAT...,90,D,145.0,259.2,0.0,0.0,...,H,H,H,H,A,,HAHHHHHHHHAHHHHAHHHHAHAHHAHHAHHAHAAHHAAHHHHHAH...,95,31,162.363581
9,C2,AMP0000139,AAAAACACAAAGACAAACATGCCTAGAAACATTCAACTCACCTCAA...,GGATGCCCATATAAGAGGGTCGGTGTGTGTATTCGTCAACCATAGT...,129,D,0.0,0.0,359.8,1018.4,...,A,H,H,H,H,,HHHHHHHHHHAHHHHHHHHHHAHHHHAHHHAHHHHHHHHHHAHHHA...,104,22,659.504893


In [4]:
# Take a look at the first 10 rows of the codominant marker results
codominant_df[0:10]

Unnamed: 0,PARENT,ID,seq1,seq2,length,quality,pair ID,GD,posi,nega,seq dist,GD dist
0,C2,AMP0000912,AAAAGTACAAAGGTAAATATGCCTAGAAATATTTAACTCTCCCCGT...,GGATACCCACATGAGAGGGTCGATGTATGTATTTACTAACCACAAT...,129,D,AMP0000913,HHBHBAHHHHHBHHBHAHABHBHAHBBBHBHBBHHAHHBAHHABHB...,106,20,1,78
1,C1,AMP0000913,AAAAGTACAAAGGTAAATATGCCTAGAAATATTTAACTCTCCCCGT...,GGATACCCACATGAGAGGGTCGATGTATGTATTTACTAACCACAAT...,129,D,AMP0000912,HHBHBAHHHHHBHHBHAHABHBHAHBBBHBHBBHHAHHBAHHABHB...,98,28,1,78
2,C1,AMP0001705,AAACCATGATGGTTTATATTTTAGTGACATATCAATATTTTCAATG...,CAAAATTGTAGACATTGAAAATATTGATATGTCACTAAAATATAAA...,58,D,AMP0001706,HHHHHBHAHABHBHHHHBBHAHHHBBHABHABBHBHBHHHBBHBHH...,82,44,1,63
3,C2,AMP0001706,AAACCATGATGGTTTATATTTTAGTGACATATCTATATTTTCAATG...,CAAAATTGTAGACATTGAAAATATAGATATGTCACTAAAATATAAA...,58,D,AMP0001705,HHHHHBHAHABHBHHHHBBHAHHHBBHABHABBHBHBHHHBBHBHH...,107,19,1,63
4,C2,AMP0001737,AAACCCCTACAAGCTCATAGAACAAACATGATGGGAAGCCCATGCA...,GCATGAGGGAAATATGGTATATGGGGGTATGAGCGATAATTGCATG...,86,D,AMP0001739,HAAHBBHAHHHHHABHBBAHHBHHHAAABHHAABBHBHAAHBHABB...,93,33,2,50
5,C1,AMP0001739,AAACCCCTACCAGCTCATAGAACAAACATGATGGGAAGCCCATGTA...,GCATGAGGGAAATATGGTATATGGGGGTATGAGCGATAATTACATG...,86,D,AMP0001737,HAAHBBHAHHHHHABHBBAHHBHHHAAABHHAABBHBHAAHBHABB...,83,43,2,50
6,C1,AMP0002145,AAACTTAACTTTCATTTCAAATTTTAAATATTTAGTACATATTACC...,GAGTAGGAGTAGGAGTAGAGTCACCTTGTAAGTTTAGCATAGCATG...,102,D,AMP0002146,HBBHBHAAHHHHHABHHBHHHABHBBBHBBHHAAHBAHHHABABHH...,91,35,3,62
7,C2,AMP0002146,AAACTTAACTTTCATTTCAAATTTTAAATATTTAGTACATATTACT...,GAGTAGGAGTAGGAGTAGAGTCACCTTGTAAGTTTAACATAACATG...,102,D,AMP0002145,HBBHBHAAHHHHHABHHBHHHABHBBBHBBHHAAHBAHHHABABHH...,97,29,3,62
8,C1,AMP0002174,AAACTTATTCGAAATCTGAAGAGGAAAACCCGAAGGATCTAAGATC...,GGTAGACGTGATCTTAGATCCTTCGGGTTTTCCTCTTCAGATTTCG...,55,D,AMP0002175,HAAHBBHAHHHHHABABBHHHBBHHHAABHBAABBHBAAAHBHABB...,81,45,2,49
9,C2,AMP0002175,AAACTTATTTGAAATCTGAAGAGGAAAACCCGAAGGATCTGAGATC...,GGTAGACGTGATCTCAGATCCTTCGGGTTTTCCTCTTCAGATTTCA...,55,D,AMP0002174,HAAHBBHAHHHHHABABBHHHBBHHHAABHBAABBHBAAAHBHABB...,94,32,2,49


In [5]:
def recode_genotype(df):
    '''
    recode_genotype re-codes dominant presence / absence (H/A) calls so that they are consistent with a codominant (A/B/H) interpretation

    df is a dataframe whose first column, named PARENT, holds the marker's parent code and whose remaining columns hold the H/A calls
    returns a new dataframe with the same columns and the same index as df (df itself is not modified), in which
        - an 'A' call of a 'C1' marker is replaced by 'B'
        - every other value ('H', an 'A' call of a 'C2' marker, or anything else) is copied unchanged
    '''
    data = []
    # Read the parent codes straight from the PARENT column and walk the rows as plain tuples
    for marker_code, row in zip(df['PARENT'], df.itertuples(index=False, name=None)):
        genotype_calls = row[1:]
        if marker_code == 'C1':
            # Here, C1 (Cluster 1) is Parent A and C2 (Cluster 2) is Parent B.
            # A C1 marker that did not amplify ('A') means homozygous for the opposing parent, i.e. 'B'
            new_calls = ['B' if genotype_call == 'A' else genotype_call for genotype_call in genotype_calls]
        else:
            # For a C2 marker an absence is homozygous for Parent A, which is already coded 'A',
            # so nothing needs to change; rows with any other parent code are passed through as well
            new_calls = list(genotype_calls)
        data.append([marker_code] + new_calls)
    # Re-use the input's index so the result stays row-aligned with the frame it was derived from
    return pd.DataFrame(data=data, columns=df.columns, index=df.index)

In [6]:
# Column positions to pull out of the dominant marker data:
#   0         -> the parent column, which contains the marker_code for that locus
#   137 - 266 -> the genotype calls for each sample in the data set
indices = [0, *range(137, 267)]
# Positional subset: the parent code first, followed by the per-sample calls
dominant_genotypes = dominant_df.iloc[:, indices]

In [7]:
dominant_genotypes[0:10]

Unnamed: 0,PARENT,AK1P1.1,AK2P1.1,PU1P2.1,PU2P2.1,F1.1,L6.1,L7.1,L8.1,L9.1,...,L131.1,L132.1,L133.1,L134.1,L135.1,L136.1,L137.1,L138.1,L139.1,L140.1
0,C2,A,A,H,H,H,H,H,A,A,...,H,H,H,A,H,H,H,H,H,A
1,C2,A,A,H,H,H,H,H,H,H,...,H,H,H,H,H,A,H,A,A,H
2,C1,H,H,A,A,H,H,H,H,H,...,H,H,A,H,H,H,H,H,H,H
3,C2,A,A,H,H,H,A,H,A,H,...,H,A,H,H,H,H,H,H,H,H
4,C1,H,H,A,A,H,H,H,H,H,...,A,H,H,H,H,H,H,H,H,H
5,C2,A,A,H,H,H,H,H,H,H,...,A,H,H,H,H,H,H,A,H,A
6,C2,A,A,H,H,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
7,C1,H,H,A,A,H,A,A,H,A,...,A,H,H,H,H,H,H,H,H,H
8,C1,H,H,A,A,H,A,H,H,H,...,H,H,H,H,A,H,H,H,H,A
9,C2,A,A,H,H,H,H,H,H,H,...,A,H,H,A,H,A,H,H,H,H


In [8]:
# Re-code the presence / absence calls so that they can be combined with the co-dominant markers
dominant_genotypes_recoded = recode_genotype(dominant_genotypes)

In [9]:
dominant_genotypes_recoded[0:10]

Unnamed: 0,PARENT,AK1P1.1,AK2P1.1,PU1P2.1,PU2P2.1,F1.1,L6.1,L7.1,L8.1,L9.1,...,L131.1,L132.1,L133.1,L134.1,L135.1,L136.1,L137.1,L138.1,L139.1,L140.1
0,C2,A,A,H,H,H,H,H,A,A,...,H,H,H,A,H,H,H,H,H,A
1,C2,A,A,H,H,H,H,H,H,H,...,H,H,H,H,H,A,H,A,A,H
2,C1,H,H,B,B,H,H,H,H,H,...,H,H,B,H,H,H,H,H,H,H
3,C2,A,A,H,H,H,A,H,A,H,...,H,A,H,H,H,H,H,H,H,H
4,C1,H,H,B,B,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
5,C2,A,A,H,H,H,H,H,H,H,...,A,H,H,H,H,H,H,A,H,A
6,C2,A,A,H,H,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
7,C1,H,H,B,B,H,B,B,H,B,...,B,H,H,H,H,H,H,H,H,H
8,C1,H,H,B,B,H,B,H,H,H,...,H,H,H,H,B,H,H,H,H,B
9,C2,A,A,H,H,H,H,H,H,H,...,A,H,H,A,H,A,H,H,H,H


In [10]:
# Put the marker name next to the re-coded calls; the PARENT column (position 0) is left out.
# The two pieces are matched up row by row through their index.
dominant_marker_results = pd.concat(
    [dominant_df['MARKER'], dominant_genotypes_recoded.iloc[:, 1:]],
    axis='columns',
)

In [11]:
dominant_marker_results[0:10]

Unnamed: 0,MARKER,AK1P1.1,AK2P1.1,PU1P2.1,PU2P2.1,F1.1,L6.1,L7.1,L8.1,L9.1,...,L131.1,L132.1,L133.1,L134.1,L135.1,L136.1,L137.1,L138.1,L139.1,L140.1
0,AMP0000011,A,A,H,H,H,H,H,A,A,...,H,H,H,A,H,H,H,H,H,A
1,AMP0000018,A,A,H,H,H,H,H,H,H,...,H,H,H,H,H,A,H,A,A,H
2,AMP0000020,H,H,B,B,H,H,H,H,H,...,H,H,B,H,H,H,H,H,H,H
3,AMP0000037,A,A,H,H,H,A,H,A,H,...,H,A,H,H,H,H,H,H,H,H
4,AMP0000052,H,H,B,B,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
5,AMP0000104,A,A,H,H,H,H,H,H,H,...,A,H,H,H,H,H,H,A,H,A
6,AMP0000111,A,A,H,H,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
7,AMP0000116,H,H,B,B,H,B,B,H,B,...,B,H,H,H,H,H,H,H,H,H
8,AMP0000125,H,H,B,B,H,B,H,H,H,...,H,H,H,H,B,H,H,H,H,B
9,AMP0000139,A,A,H,H,H,H,H,H,H,...,A,H,H,A,H,A,H,H,H,H


In [12]:
# The sample columns carry a '.1' suffix (AK1P1.1, L6.1, ...); remove it to get the plain sample names.
# Only a real '.1' suffix is removed, so a column without the suffix keeps its full name
# instead of losing its last two characters.
newcols = [name[:-2] if name.endswith('.1') else name for name in dominant_marker_results.columns]
# The first column (MARKER) is renamed to ID
newcols[0] = 'ID'
dominant_marker_results.columns = newcols

In [13]:
dominant_marker_results[0:10]

Unnamed: 0,ID,AK1P1,AK2P1,PU1P2,PU2P2,F1,L6,L7,L8,L9,...,L131,L132,L133,L134,L135,L136,L137,L138,L139,L140
0,AMP0000011,A,A,H,H,H,H,H,A,A,...,H,H,H,A,H,H,H,H,H,A
1,AMP0000018,A,A,H,H,H,H,H,H,H,...,H,H,H,H,H,A,H,A,A,H
2,AMP0000020,H,H,B,B,H,H,H,H,H,...,H,H,B,H,H,H,H,H,H,H
3,AMP0000037,A,A,H,H,H,A,H,A,H,...,H,A,H,H,H,H,H,H,H,H
4,AMP0000052,H,H,B,B,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
5,AMP0000104,A,A,H,H,H,H,H,H,H,...,A,H,H,H,H,H,H,A,H,A
6,AMP0000111,A,A,H,H,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
7,AMP0000116,H,H,B,B,H,B,B,H,B,...,B,H,H,H,H,H,H,H,H,H
8,AMP0000125,H,H,B,B,H,B,H,H,H,...,H,H,H,H,B,H,H,H,H,B
9,AMP0000139,A,A,H,H,H,H,H,H,H,...,A,H,H,A,H,A,H,H,H,H


In [14]:
# Each entry of column 7 (GD) is one string of calls, one character per sample.
# Split every string into single characters so that each sample gets its own column;
# the column names are taken from the dominant results, from position 5 (F1) onwards.
codominant_genotypes = pd.DataFrame(
    data=list(map(list, codominant_df.iloc[:, 7])),
    columns=dominant_marker_results.columns[5:],
)

In [15]:
# Marker ID followed by one genotype column per sample
codominant_marker_results = pd.concat(
    [codominant_df['ID'], codominant_genotypes],
    axis='columns',
)

In [16]:
codominant_marker_results[0:10]

Unnamed: 0,ID,F1,L6,L7,L8,L9,L10,L11,L12,L13,...,L131,L132,L133,L134,L135,L136,L137,L138,L139,L140
0,AMP0000912,H,H,B,H,B,A,H,H,H,...,H,H,B,H,H,A,A,A,B,H
1,AMP0000913,H,H,B,H,B,A,H,H,H,...,H,H,B,H,H,A,A,A,B,H
2,AMP0001705,H,H,H,H,H,B,H,A,H,...,B,H,H,H,H,H,H,H,A,B
3,AMP0001706,H,H,H,H,H,B,H,A,H,...,B,H,H,H,H,H,H,H,A,B
4,AMP0001737,H,A,A,H,B,B,H,A,H,...,B,H,B,H,A,A,B,H,B,B
5,AMP0001739,H,A,A,H,B,B,H,A,H,...,B,H,B,H,A,A,B,H,B,B
6,AMP0002145,H,B,B,H,B,H,A,A,H,...,H,A,A,H,A,B,H,H,A,H
7,AMP0002146,H,B,B,H,B,H,A,A,H,...,H,A,A,H,A,B,H,H,A,H
8,AMP0002174,H,A,A,H,B,B,H,A,H,...,B,H,B,H,A,H,B,H,B,B
9,AMP0002175,H,A,A,H,B,B,H,A,H,...,B,H,B,H,A,H,B,H,B,B


In [17]:
# Keep ID plus the columns from position 5 (F1) onwards, i.e. leave out the four columns
# in positions 1 - 4 (AK1P1, AK2P1, PU1P2, PU2P2)
dominant_marker_results = pd.concat(
    [dominant_marker_results['ID'], dominant_marker_results.iloc[:, 5:]],
    axis='columns',
)

In [18]:
dominant_marker_results

Unnamed: 0,ID,F1,L6,L7,L8,L9,L10,L11,L12,L13,...,L131,L132,L133,L134,L135,L136,L137,L138,L139,L140
0,AMP0000011,H,H,H,A,A,H,A,H,A,...,H,H,H,A,H,H,H,H,H,A
1,AMP0000018,H,H,H,H,H,A,H,H,A,...,H,H,H,H,H,A,H,A,A,H
2,AMP0000020,H,H,H,H,H,B,H,H,H,...,H,H,B,H,H,H,H,H,H,H
3,AMP0000037,H,A,H,A,H,H,H,H,H,...,H,A,H,H,H,H,H,H,H,H
4,AMP0000052,H,H,H,H,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7429,AMP0116629,H,B,H,H,H,H,B,H,H,...,H,H,H,H,B,H,H,B,H,H
7430,AMP0116637,H,A,H,H,H,H,H,A,H,...,H,H,H,H,A,H,H,A,H,H
7431,AMP0116642,H,H,H,H,H,H,H,H,H,...,A,H,H,A,H,A,H,H,H,H
7432,AMP0116652,H,B,H,H,B,H,H,H,H,...,H,H,H,H,H,B,H,H,H,B


In [23]:
# Keep only the dominant markers whose ID does not appear in the codominant results,
# then renumber the remaining rows starting from 0
dominant_marker_results = dominant_marker_results[
    ~dominant_marker_results['ID'].isin(codominant_marker_results['ID'])
].reset_index(drop=True)

In [24]:
# Stack the remaining dominant markers on top of the co-dominant markers
marker_results = pd.concat(
    [dominant_marker_results, codominant_marker_results],
    axis='index',
)

In [25]:
# Renumber the rows of the combined table starting from 0, discarding the old index
marker_results = marker_results.reset_index(drop=True)

In [26]:
marker_results

Unnamed: 0,ID,F1,L6,L7,L8,L9,L10,L11,L12,L13,...,L131,L132,L133,L134,L135,L136,L137,L138,L139,L140
0,AMP0000011,H,H,H,A,A,H,A,H,A,...,H,H,H,A,H,H,H,H,H,A
1,AMP0000018,H,H,H,H,H,A,H,H,A,...,H,H,H,H,H,A,H,A,A,H
2,AMP0000020,H,H,H,H,H,B,H,H,H,...,H,H,B,H,H,H,H,H,H,H
3,AMP0000037,H,A,H,A,H,H,H,H,H,...,H,A,H,H,H,H,H,H,H,H
4,AMP0000052,H,H,H,H,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7429,AMP0116396,H,A,H,H,A,H,H,H,H,...,A,A,H,H,A,H,H,H,H,A
7430,AMP0116606,H,H,H,H,H,B,H,H,H,...,H,H,H,H,A,A,B,H,B,H
7431,AMP0116608,H,H,H,H,H,B,H,H,H,...,H,H,H,H,A,A,B,H,B,H
7432,AMP0116624,H,B,A,A,A,A,B,H,H,...,A,A,H,H,B,H,A,B,H,H


In [27]:
# Keep ID plus everything from position 2 onwards, i.e. leave out the F1 column in position 1
marker_results = pd.concat(
    [marker_results['ID'], marker_results.iloc[:, 2:]],
    axis='columns',
)

In [28]:
marker_results

Unnamed: 0,ID,L6,L7,L8,L9,L10,L11,L12,L13,L14,...,L131,L132,L133,L134,L135,L136,L137,L138,L139,L140
0,AMP0000011,H,H,A,A,H,A,H,A,H,...,H,H,H,A,H,H,H,H,H,A
1,AMP0000018,H,H,H,H,A,H,H,A,H,...,H,H,H,H,H,A,H,A,A,H
2,AMP0000020,H,H,H,H,B,H,H,H,B,...,H,H,B,H,H,H,H,H,H,H
3,AMP0000037,A,H,A,H,H,H,H,H,H,...,H,A,H,H,H,H,H,H,H,H
4,AMP0000052,H,H,H,H,H,H,H,H,H,...,B,H,H,H,H,H,H,H,H,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7429,AMP0116396,A,H,H,A,H,H,H,H,A,...,A,A,H,H,A,H,H,H,H,A
7430,AMP0116606,H,H,H,H,B,H,H,H,H,...,H,H,H,H,A,A,B,H,B,H
7431,AMP0116608,H,H,H,H,B,H,H,H,H,...,H,H,H,H,A,A,B,H,B,H
7432,AMP0116624,B,A,A,A,A,B,H,H,A,...,A,A,H,H,B,H,A,B,H,H


In [29]:
# Write the combined marker table to a CSV file; index=False leaves the row index out of the file
outfile = 'GRAS-Di_codominant_COMBINED_RESULTS_201909-21-04-1.csv'
marker_results.to_csv(outfile, index=False)