# Prepare the Final Year Project

In [33]:
from Serotype_Data import * #Dataset of Serotypes
from Serotype_Functions import *

#Biopython ver.1.76
from Bio.Seq import Seq #Represent biological sequences with alphabets
from Bio.pairwise2 import format_alignment #Functions to get global and local alignments between two sequences
from Bio import pairwise2

from IPython.display import Image

import numpy as np
import pandas as pd
import tabulate

In [34]:
import json
import os

In [35]:
# Load the serotype records from serotypes.json (path resolved relative to the
# current working directory).
# The context manager closes the file handle as soon as parsing finishes; the
# plain open() call used before left it open for the lifetime of the kernel.
path = os.path.relpath('./serotypes.json')
with open(path, 'r') as f:
    serotypes = json.load(f)

### Pair X Pair

##### Build the rules in pairs
Previously, we built the rules one-to-one.

Now we build the rules from pairs of adjacent genes and the corresponding pairs of adjacent sugars.

In [36]:
# Per-serotype table:
#   {serotype key: {(gene, next gene): [[sugar, next sugar], ...]}}
wholePairs_data = {}

# Module-level placeholders, overwritten for every serotype handled below.
simply_genes = []  # simplified gene sequence of the current serotype
lin_sugars = []    # sugar sequence of the current serotype

for key in serotypes:
    record = serotypes[key]

    # Serotypes without any sugar structure contribute nothing.
    if len(record['sugars']) == 0:
        continue

    # Gene sequence after it has been passed through simplify_genes().
    simply_genes = simplify_genes(record['genes'])
    lin_sugars = record['sugars']

    # Every adjacent [sugar, next sugar] couple along the sugar sequence.
    sugars_pair = [
        [lin_sugars[i], lin_sugars[i + 1]]
        for i in range(len(lin_sugars) - 1)
    ]

    # Each adjacent gene couple is associated with the complete list of sugar
    # couples of this serotype (one list object shared by all the keys).
    genes_sugars_pairs = {
        (simply_genes[x], simply_genes[x + 1]): sugars_pair
        for x in range(len(simply_genes) - 1)
    }

    wholePairs_data[key] = genes_sugars_pairs
            

In [37]:
# wholePairs_data

In [38]:
# Merge the per-serotype tables into a single table keyed by gene pair.
# Each gene pair collects every distinct sugar pair recorded for it, in
# first-seen order.
wholePairs = {}

for gene_map in wholePairs_data.values():
    for pairgene, sugar_pairs in gene_map.items():
        for sugar_pair in sugar_pairs:
            # The entry is created only when there is a sugar pair to store,
            # so a gene pair whose sugar list is empty never becomes a key.
            collected = wholePairs.setdefault(pairgene, [])
            if sugar_pair not in collected:
                collected.append(sugar_pair)
                
# wholePairs  # key: tuple (pair of genes); value: list of lists (pairs of sugars)

In [39]:
# Column labels: every gene pair, in the iteration order of wholePairs.
pairgene_cols = list(wholePairs)

# Row labels: every distinct sugar pair (converted to a tuple), in first-seen order.
pairsugar_rows = []
for sugar_pairs in wholePairs.values():
    for pairsugar in sugar_pairs:
        row_label = tuple(pairsugar)
        if row_label not in pairsugar_rows:
            pairsugar_rows.append(row_label)

In [40]:
# Zero-filled int32 matrix: one row per sugar pair, one column per gene pair.
num_rows, num_cols = len(pairsugar_rows), len(pairgene_cols)
data = np.zeros((num_rows, num_cols), dtype=np.int32)

#show_compare(pairgene_cols, pairsugar_rows, data)

In [41]:
# Set to 1 every (sugar pair, gene pair) cell whose combination is recorded
# in wholePairs; all other cells keep their current value.
for pairgene, sugar_pairs in wholePairs.items():
    if pairgene not in pairgene_cols:
        continue  # no column exists for this gene pair
    store_colnum = pairgene_cols.index(pairgene)

    for pairsugar in sugar_pairs:
        row_label = tuple(pairsugar)  # row labels are stored as tuples
        if row_label in pairsugar_rows:
            store_rownum = pairsugar_rows.index(row_label)
            data[store_rownum, store_colnum] = 1

show_compare(pairgene_cols, pairsugar_rows, data)

Unnamed: 0,"('wchB', 'wchC')","('wchC', 'wchD')","('wchD', 'gla')","('gla', 'ugd')","('ugd', 'rmlA')","('rmlA', 'rmlC')","('rmlC', 'rmlB')","('rmlB', 'rmlD-')","('wchA', 'wchF')","('wchF', 'wchG')","('wchG', 'wchH')","('wchH', 'wchI')","('wchI', 'ugd')","('ugd', 'glf-')","('glf-', 'rmlA')","('rmlB', 'rmlD')","('ugd', 'wchE')","('wchE', 'galU')","('galU', 'pgm')","('HG261-', 'wciI')","('wciI', 'wciJ')","('wciJ', 'wciK')","('wciK', 'wciL')","('wciL', 'wciM')","('wciM', 'mnaA')","('mnaA', 'fnlA')","('fnlA', 'fnlB')","('fnlB', 'fnlC')","('wciJ', 'whaC')","('whaC', 'whaD')","('whaD', 'whaE')","('whaE', 'fnlA')","('fnlA', 'ugd')","('ugd', 'fnlB')","('wchA', 'wciN')","('wciN', 'HG262-')","('HG262-', 'wciO')","('wciO', 'wciP')","('wciP', 'rmlA')","('wciN', 'HG263-')","('HG263-', 'wciO')","('rmlD', 'glf-')","('wchF', 'wcwA')","('wcwA', 'wcwC')","('wcwC', 'wcwD')","('wcwD', 'HG140-')","('HG140-', 'wcwF')","('wcwF', 'wcwG')","('wcwG', 'wcwH')","('wcwH', 'rmlA')","('wcwC', 'wcwD-')","('wcwD-', 'HG140-')","('wchF', 'wcwI')","('wcwI', 'wcwL')","('wcwL', 'wcwK')","('wcwK', 'wcxU')","('wcxU', 'rbsF')","('rbsF', 'rmlA')","('wchA', 'wciQ')","('wciQ', 'wciR')","('wciR', 'wciS')","('wciS', 'wciT')","('wciT', 'ugd')","('ugd', 'HG265-')","('HG265-', 'HG266-')","('wchA', 'wchO')","('wchO', 'wcjA')","('wcjA', 'mnaA')","('mnaA', 'wcjB')","('wcjB', 'wcjC')","('wcjC', 'wcjD')","('wcjD', 'ugd')","('ugd', 'wcjE-')","('wcjC', 'ugd')","('ugd', 'wcjE')","('wcjG', 'wciB')","('wciB', 'wcrB')","('wcrB', 'wcrC')","('wcrC', 'wcrD-')","('wcrD-', 'wciF')","('wciF', 'wciG')","('wciG', 'glf')","('glf', 'wcrH')","('wcrC', 'wcrD')","('wcrD', 'wciF')","('wciF', 'wcrG')","('wcrG', 'glf')","('wchA', 'wchJ')","('wchJ', 'wchK')","('wchK', 'wcyK')","('wcyK', 'wcwC')","('wcwC', 'wcrL')","('wcrL', 'wcwT')","('wcwT', 'wcwU')","('wcwU', 'gct-')","('gct-', 'wcjE-')","('wcwU', 'gct')","('gct', 'wcjE')","('wcjE', 'aliA-')","('wcyK', 'wcwR')","('wcwR', 'wcrL')","('wciJ', 'wcxB')","('wcxB', 'wcxD')","('wcxD', 
'wcxE')","('wcxE', 'wcxF')","('wcxF', 'mnaB')","('mnaB', 'mnaA')","('wchK', 'whaG-')","('whaG-', 'abp1')","('abp1', 'abp2')","('abp2', 'wciF')","('wciF', 'wcrD')","('wcrD', 'wciG')","('wchK', 'wchL')","('wchL', 'wchM')","('wchM', 'wchN')","('wchN', 'wciY')","('wciY', 'lrp')","('wchN', 'wciZ')","('wciZ', 'wchX')","('wchX', 'gtp1')","('gtp1', 'gtp2')","('gtp2', 'gtp3')","('gtp3', 'rmlB')","('rmlD', 'glf')","('glf', 'wcjE')","('wchN', 'wciZ-')","('wciZ-', 'wchX')","('wchF', 'wcxG')","('wcxG', 'abp1')","('abp2', 'wciP')","('wciP', 'wcrT')","('wcrT', 'wcrU')","('wcrU', 'wcrV')","('wcrV', 'rmlA')","('wchA', 'wciB')","('wciB', 'wcrP')","('wcrP', 'wcrQ')","('wcrQ', 'wcrR')","('wcrR', 'wcrT')","('wcrV', 'ugd')","('ugd', 'glf')","('glf', 'rmlA')","('wchF', 'wciU')","('wciU', 'wcxM')","('wcxM', 'wciV')","('wciV', 'wciW')","('wciW', 'wciX')","('wciX', 'wciY')","('wciY', 'gct')","('gct', 'HG94-')","('HG94-', 'rmlA')","('wciU', 'wciV')","('wciW', 'wciY')","('wciW', 'wciX-')","('wciX-', 'wciY')","('wchO', 'wchP')","('wchP', 'wchQ')","('wchQ', 'mnaA')","('mnaA', 'rmlA')","('wchQ', 'wchR')","('wchR', 'wchS')","('wchS', 'rbsF')","('rbsF', 'mnaA')","('mnaA', 'wchU')","('wchU', 'HG264-')","('HG264-', 'rmlA')","('wciB', 'whaJ')","('whaJ', 'wciL')","('wciL', 'wcwK')","('wcwK', 'wciD')","('wciD', 'whaF')","('whaF', 'wciG')","('wcwC', 'ugd')","('ugd', 'wcwV')","('wcwV', 'whaB')","('whaB', 'wcwX')","('wcwX', 'glf')","('wchF', 'wchV')","('wchV', 'wchW')","('wchW', 'wchX')","('gtp3', 'rmlA')","('wchF', 'whaK')","('whaK', 'whaL')","('whaL', 'wcyS')","('wcyS', 'wcrN')","('wcrN', 'HG270-')","('HG270-', 'rmlA')","('wcjH', 'wciB')","('wciB', 'wcrJ')","('wcrJ', 'wcrM')","('wcrM', 'wcrH')","('wcrH', 'glf')","('wcrP', 'wcrR')","('wcrR', 'wcrW')","('wcrW', 'wcrX')","('wcrX', 'ugd')","('wchF', 'wcyH')","('wcyH', 'wcyI')","('wcyI', 'wchQ')","('wchQ', 'wcyS')","('wcrN', 'HG272-')","('HG272-', 'rmlA')","('wcrN', 'HG271-')","('HG271-', 'rmlA')","('wciB', 'wciC')","('wciC', 'wciD')","('wciD', 
'wciE')","('wciE', 'wciF')","('wciF', 'wciG*')","('wciG*', 'glf')","('glf', 'wcjE-')","('wciN', 'wciO')","('wciO', 'wcrC')","('wcrC', 'wciD')","('wciB', 'wcrO')","('wcrO', 'wcrC')","('wcrD', 'glf')","('glf', 'wcyO')","('wciB', 'wcrI')","('wcrI', 'wcrJ')","('wcrJ', 'wcrK-')","('wcrK-', 'mnp1')","('mnp1', 'wcrH')","('wcrH', 'mnp2')","('mnp2', 'wciG')","('wcrH', 'wciG')","('wciB', 'wciC-')","('wciC-', 'wciD')","('wciF', 'wciG-')","('wciG-', 'glf-')","('glf-', 'wcjE-')","('wcxB', 'wciL')","('wciL', 'wcyQ')","('wcyQ', 'wcxS')","('wcxS', 'wcyR')","('wcyR', 'gct')","('gct', 'fnlA')","('fnlC', 'HG273-')","('HG273-', 'rmlA')"
"('2-acetamido-4-amino-2,4,6-trideoxygalactose', 'galacturonic acid')",1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galacturonic acid', 'galacturonic acid')",1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'rhamnose')",0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'rhamnose')",0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'glucose')",0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'glucuronic acid')",0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'galactose')",0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galactose', 'ribitol')",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('ribitol', 'rhamnose')",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'galactose')",0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##### Check frequency
Count how often each gene pair in `wholePairs` occurs across the serotypes.

In [42]:
# Occurrence counter keyed by gene pair; updated through rules_frequency().
pairs_fre = {}


def rules_frequency(pairgene):
    """Record one more occurrence of ``pairgene`` in the module-level ``pairs_fre``.

    A gene pair seen for the first time is stored with a count of 1; every
    later call for the same pair raises its count by one. Returns nothing.
    """
    pairs_fre[pairgene] = pairs_fre.get(pairgene, 0) + 1


In [43]:
# Count, for every gene pair present in wholePairs, how many times it occurs
# among the adjacent gene pairs of the serotypes that have a sugar structure.

# Start from an empty counter so that re-running this cell does not pile a
# second round of counts on top of the first one.
pairs_fre.clear()

for key in serotypes:
    # ignore serotypes which have no structure of sugar
    if len(serotypes[key]['sugars']) == 0:
        continue

    # Gene sequence after it has been passed through simplify_genes().
    simply_genes = simplify_genes(serotypes[key]['genes'])

    # Adjacent gene pairs of this serotype, in sequence order.
    p_genes = [
        (simply_genes[i], simply_genes[i + 1])
        for i in range(len(simply_genes) - 1)
    ]

    for pairgene in p_genes:
        # Direct dictionary membership; equivalent to comparing against every
        # key of wholePairs, because dictionary keys are unique.
        if pairgene in wholePairs:
            rules_frequency(pairgene)

pairs_fre

{('wchB', 'wchC'): 1,
 ('wchC', 'wchD'): 1,
 ('wchD', 'gla'): 1,
 ('gla', 'ugd'): 1,
 ('ugd', 'rmlA'): 1,
 ('rmlA', 'rmlC'): 24,
 ('rmlC', 'rmlB'): 24,
 ('rmlB', 'rmlD-'): 1,
 ('wchA', 'wchF'): 14,
 ('wchF', 'wchG'): 1,
 ('wchG', 'wchH'): 1,
 ('wchH', 'wchI'): 1,
 ('wchI', 'ugd'): 1,
 ('ugd', 'glf-'): 1,
 ('glf-', 'rmlA'): 1,
 ('rmlB', 'rmlD'): 24,
 ('ugd', 'wchE'): 1,
 ('wchE', 'galU'): 1,
 ('galU', 'pgm'): 1,
 ('HG261-', 'wciI'): 1,
 ('wciI', 'wciJ'): 5,
 ('wciJ', 'wciK'): 1,
 ('wciK', 'wciL'): 1,
 ('wciL', 'wciM'): 1,
 ('wciM', 'mnaA'): 1,
 ('mnaA', 'fnlA'): 3,
 ('fnlA', 'fnlB'): 4,
 ('fnlB', 'fnlC'): 5,
 ('wciJ', 'whaC'): 1,
 ('whaC', 'whaD'): 1,
 ('whaD', 'whaE'): 1,
 ('whaE', 'fnlA'): 1,
 ('fnlA', 'ugd'): 1,
 ('ugd', 'fnlB'): 1,
 ('wchA', 'wciN'): 3,
 ('wciN', 'HG262-'): 1,
 ('HG262-', 'wciO'): 1,
 ('wciO', 'wciP'): 2,
 ('wciP', 'rmlA'): 2,
 ('wciN', 'HG263-'): 1,
 ('HG263-', 'wciO'): 1,
 ('rmlD', 'glf-'): 11,
 ('wchF', 'wcwA'): 3,
 ('wcwA', 'wcwC'): 3,
 ('wcwC', 'wcwD'): 1,
 ('w

Reset the data matrix to zeros, then count how often each sugar pair occurs together with each gene pair.

In [44]:
# Rebuild both axis label lists from wholePairs.
# Columns: the gene pairs, in the iteration order of wholePairs.
pairgene_cols = [pairgene for pairgene in wholePairs]

# Rows: each distinct sugar pair as a tuple, kept in first-seen order.
pairsugar_rows = []
for pairgene in wholePairs:
    for candidate in map(tuple, wholePairs[pairgene]):
        if candidate not in pairsugar_rows:
            pairsugar_rows.append(candidate)

In [45]:
# Fresh zero-filled int32 matrix sized (sugar pairs) x (gene pairs).
num_rows, num_cols = len(pairsugar_rows), len(pairgene_cols)
data = np.zeros((num_rows, num_cols), dtype=np.int32)

In [46]:
# Fill `data` with occurrence counts: for each serotype that has a sugar
# structure, every (sugar pair, gene pair) combination of that serotype which
# is recorded in wholePairs adds 1 to the matching cell.

# Reset the counts first so that re-running this cell does not accumulate on
# top of a previous run.
data.fill(0)

for key in serotypes:
    # ignore serotypes which have no structure of sugar
    if len(serotypes[key]['sugars']) == 0:
        continue

    # Gene sequence after it has been passed through simplify_genes().
    simply_genes = simplify_genes(serotypes[key]['genes'])
    lin_sugars = serotypes[key]['sugars']

    # Adjacent gene pairs and adjacent sugar pairs of this serotype (tuples).
    p_genes = [
        (simply_genes[i], simply_genes[i + 1])
        for i in range(len(simply_genes) - 1)
    ]
    p_sugars = [
        (lin_sugars[j], lin_sugars[j + 1])
        for j in range(len(lin_sugars) - 1)
    ]

    for pairgene in p_genes:
        # Direct dictionary membership; equivalent to comparing against every
        # key of wholePairs, because dictionary keys are unique.
        if pairgene not in wholePairs:
            continue

        # Both lookups depend only on the gene pair, so do them once per pair.
        store_colnum = pairgene_cols.index(pairgene)
        known_sugar_pairs = wholePairs[pairgene]

        for pairsugar in p_sugars:
            # wholePairs is tested against the list form of the pair, while
            # the row labels are looked up with the tuple form.
            if list(pairsugar) in known_sugar_pairs:
                store_rownum = pairsugar_rows.index(pairsugar)
                data[store_rownum, store_colnum] += 1

show_compare(pairgene_cols, pairsugar_rows, data)

Unnamed: 0,"('wchB', 'wchC')","('wchC', 'wchD')","('wchD', 'gla')","('gla', 'ugd')","('ugd', 'rmlA')","('rmlA', 'rmlC')","('rmlC', 'rmlB')","('rmlB', 'rmlD-')","('wchA', 'wchF')","('wchF', 'wchG')","('wchG', 'wchH')","('wchH', 'wchI')","('wchI', 'ugd')","('ugd', 'glf-')","('glf-', 'rmlA')","('rmlB', 'rmlD')","('ugd', 'wchE')","('wchE', 'galU')","('galU', 'pgm')","('HG261-', 'wciI')","('wciI', 'wciJ')","('wciJ', 'wciK')","('wciK', 'wciL')","('wciL', 'wciM')","('wciM', 'mnaA')","('mnaA', 'fnlA')","('fnlA', 'fnlB')","('fnlB', 'fnlC')","('wciJ', 'whaC')","('whaC', 'whaD')","('whaD', 'whaE')","('whaE', 'fnlA')","('fnlA', 'ugd')","('ugd', 'fnlB')","('wchA', 'wciN')","('wciN', 'HG262-')","('HG262-', 'wciO')","('wciO', 'wciP')","('wciP', 'rmlA')","('wciN', 'HG263-')","('HG263-', 'wciO')","('rmlD', 'glf-')","('wchF', 'wcwA')","('wcwA', 'wcwC')","('wcwC', 'wcwD')","('wcwD', 'HG140-')","('HG140-', 'wcwF')","('wcwF', 'wcwG')","('wcwG', 'wcwH')","('wcwH', 'rmlA')","('wcwC', 'wcwD-')","('wcwD-', 'HG140-')","('wchF', 'wcwI')","('wcwI', 'wcwL')","('wcwL', 'wcwK')","('wcwK', 'wcxU')","('wcxU', 'rbsF')","('rbsF', 'rmlA')","('wchA', 'wciQ')","('wciQ', 'wciR')","('wciR', 'wciS')","('wciS', 'wciT')","('wciT', 'ugd')","('ugd', 'HG265-')","('HG265-', 'HG266-')","('wchA', 'wchO')","('wchO', 'wcjA')","('wcjA', 'mnaA')","('mnaA', 'wcjB')","('wcjB', 'wcjC')","('wcjC', 'wcjD')","('wcjD', 'ugd')","('ugd', 'wcjE-')","('wcjC', 'ugd')","('ugd', 'wcjE')","('wcjG', 'wciB')","('wciB', 'wcrB')","('wcrB', 'wcrC')","('wcrC', 'wcrD-')","('wcrD-', 'wciF')","('wciF', 'wciG')","('wciG', 'glf')","('glf', 'wcrH')","('wcrC', 'wcrD')","('wcrD', 'wciF')","('wciF', 'wcrG')","('wcrG', 'glf')","('wchA', 'wchJ')","('wchJ', 'wchK')","('wchK', 'wcyK')","('wcyK', 'wcwC')","('wcwC', 'wcrL')","('wcrL', 'wcwT')","('wcwT', 'wcwU')","('wcwU', 'gct-')","('gct-', 'wcjE-')","('wcwU', 'gct')","('gct', 'wcjE')","('wcjE', 'aliA-')","('wcyK', 'wcwR')","('wcwR', 'wcrL')","('wciJ', 'wcxB')","('wcxB', 'wcxD')","('wcxD', 
'wcxE')","('wcxE', 'wcxF')","('wcxF', 'mnaB')","('mnaB', 'mnaA')","('wchK', 'whaG-')","('whaG-', 'abp1')","('abp1', 'abp2')","('abp2', 'wciF')","('wciF', 'wcrD')","('wcrD', 'wciG')","('wchK', 'wchL')","('wchL', 'wchM')","('wchM', 'wchN')","('wchN', 'wciY')","('wciY', 'lrp')","('wchN', 'wciZ')","('wciZ', 'wchX')","('wchX', 'gtp1')","('gtp1', 'gtp2')","('gtp2', 'gtp3')","('gtp3', 'rmlB')","('rmlD', 'glf')","('glf', 'wcjE')","('wchN', 'wciZ-')","('wciZ-', 'wchX')","('wchF', 'wcxG')","('wcxG', 'abp1')","('abp2', 'wciP')","('wciP', 'wcrT')","('wcrT', 'wcrU')","('wcrU', 'wcrV')","('wcrV', 'rmlA')","('wchA', 'wciB')","('wciB', 'wcrP')","('wcrP', 'wcrQ')","('wcrQ', 'wcrR')","('wcrR', 'wcrT')","('wcrV', 'ugd')","('ugd', 'glf')","('glf', 'rmlA')","('wchF', 'wciU')","('wciU', 'wcxM')","('wcxM', 'wciV')","('wciV', 'wciW')","('wciW', 'wciX')","('wciX', 'wciY')","('wciY', 'gct')","('gct', 'HG94-')","('HG94-', 'rmlA')","('wciU', 'wciV')","('wciW', 'wciY')","('wciW', 'wciX-')","('wciX-', 'wciY')","('wchO', 'wchP')","('wchP', 'wchQ')","('wchQ', 'mnaA')","('mnaA', 'rmlA')","('wchQ', 'wchR')","('wchR', 'wchS')","('wchS', 'rbsF')","('rbsF', 'mnaA')","('mnaA', 'wchU')","('wchU', 'HG264-')","('HG264-', 'rmlA')","('wciB', 'whaJ')","('whaJ', 'wciL')","('wciL', 'wcwK')","('wcwK', 'wciD')","('wciD', 'whaF')","('whaF', 'wciG')","('wcwC', 'ugd')","('ugd', 'wcwV')","('wcwV', 'whaB')","('whaB', 'wcwX')","('wcwX', 'glf')","('wchF', 'wchV')","('wchV', 'wchW')","('wchW', 'wchX')","('gtp3', 'rmlA')","('wchF', 'whaK')","('whaK', 'whaL')","('whaL', 'wcyS')","('wcyS', 'wcrN')","('wcrN', 'HG270-')","('HG270-', 'rmlA')","('wcjH', 'wciB')","('wciB', 'wcrJ')","('wcrJ', 'wcrM')","('wcrM', 'wcrH')","('wcrH', 'glf')","('wcrP', 'wcrR')","('wcrR', 'wcrW')","('wcrW', 'wcrX')","('wcrX', 'ugd')","('wchF', 'wcyH')","('wcyH', 'wcyI')","('wcyI', 'wchQ')","('wchQ', 'wcyS')","('wcrN', 'HG272-')","('HG272-', 'rmlA')","('wcrN', 'HG271-')","('HG271-', 'rmlA')","('wciB', 'wciC')","('wciC', 'wciD')","('wciD', 
'wciE')","('wciE', 'wciF')","('wciF', 'wciG*')","('wciG*', 'glf')","('glf', 'wcjE-')","('wciN', 'wciO')","('wciO', 'wcrC')","('wcrC', 'wciD')","('wciB', 'wcrO')","('wcrO', 'wcrC')","('wcrD', 'glf')","('glf', 'wcyO')","('wciB', 'wcrI')","('wcrI', 'wcrJ')","('wcrJ', 'wcrK-')","('wcrK-', 'mnp1')","('mnp1', 'wcrH')","('wcrH', 'mnp2')","('mnp2', 'wciG')","('wcrH', 'wciG')","('wciB', 'wciC-')","('wciC-', 'wciD')","('wciF', 'wciG-')","('wciG-', 'glf-')","('glf-', 'wcjE-')","('wcxB', 'wciL')","('wciL', 'wcyQ')","('wcyQ', 'wcxS')","('wcxS', 'wcyR')","('wcyR', 'gct')","('gct', 'fnlA')","('fnlC', 'HG273-')","('HG273-', 'rmlA')"
"('2-acetamido-4-amino-2,4,6-trideoxygalactose', 'galacturonic acid')",1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galacturonic acid', 'galacturonic acid')",1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'rhamnose')",0,0,0,0,0,17,17,0,17,1,1,1,1,1,1,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,3,3,1,1,2,2,2,2,1,1,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,4,1,1,4,2,2,4,4,4,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,5,1,1,0,0,0,0,0,0,0,0,0,4,4,4,4,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'rhamnose')",0,0,0,0,0,5,5,0,4,2,2,2,2,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'glucose')",0,0,0,0,0,6,6,0,6,1,1,1,1,1,1,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,3,2,2,3,3,3,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'glucuronic acid')",0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'galactose')",0,0,0,0,0,6,6,0,3,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,10,10,4,2,2,4,4,2,1,2,1,1,2,2,0,0,0,0,0,0,1,1,1,1,1,1,5,5,5,1,1,2,2,4,4,4,1,1,3,2,2,0,0,0,0,1,1,0,6,1,1,1,1,1,1,1,3,1,1,3,2,2,3,3,3,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galactose', 'ribitol')",0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,1,1,1,2,1,2,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('ribitol', 'rhamnose')",0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'galactose')",0,0,0,0,0,8,8,0,6,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,3,1,1,2,2,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,1,1,2,2,1,1,2,1,1,1,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
# Subtract one from every non-zero cell of `data` and leave zero cells as
# they are. A single masked in-place operation replaces the element-by-element
# double loop over rows and columns.
# NOTE: this mutates `data` in place, so each additional run of this cell
# subtracts one more from the cells that are still non-zero.
data[data != 0] -= 1

show_compare(pairgene_cols, pairsugar_rows, data)

Unnamed: 0,"('wchB', 'wchC')","('wchC', 'wchD')","('wchD', 'gla')","('gla', 'ugd')","('ugd', 'rmlA')","('rmlA', 'rmlC')","('rmlC', 'rmlB')","('rmlB', 'rmlD-')","('wchA', 'wchF')","('wchF', 'wchG')","('wchG', 'wchH')","('wchH', 'wchI')","('wchI', 'ugd')","('ugd', 'glf-')","('glf-', 'rmlA')","('rmlB', 'rmlD')","('ugd', 'wchE')","('wchE', 'galU')","('galU', 'pgm')","('HG261-', 'wciI')","('wciI', 'wciJ')","('wciJ', 'wciK')","('wciK', 'wciL')","('wciL', 'wciM')","('wciM', 'mnaA')","('mnaA', 'fnlA')","('fnlA', 'fnlB')","('fnlB', 'fnlC')","('wciJ', 'whaC')","('whaC', 'whaD')","('whaD', 'whaE')","('whaE', 'fnlA')","('fnlA', 'ugd')","('ugd', 'fnlB')","('wchA', 'wciN')","('wciN', 'HG262-')","('HG262-', 'wciO')","('wciO', 'wciP')","('wciP', 'rmlA')","('wciN', 'HG263-')","('HG263-', 'wciO')","('rmlD', 'glf-')","('wchF', 'wcwA')","('wcwA', 'wcwC')","('wcwC', 'wcwD')","('wcwD', 'HG140-')","('HG140-', 'wcwF')","('wcwF', 'wcwG')","('wcwG', 'wcwH')","('wcwH', 'rmlA')","('wcwC', 'wcwD-')","('wcwD-', 'HG140-')","('wchF', 'wcwI')","('wcwI', 'wcwL')","('wcwL', 'wcwK')","('wcwK', 'wcxU')","('wcxU', 'rbsF')","('rbsF', 'rmlA')","('wchA', 'wciQ')","('wciQ', 'wciR')","('wciR', 'wciS')","('wciS', 'wciT')","('wciT', 'ugd')","('ugd', 'HG265-')","('HG265-', 'HG266-')","('wchA', 'wchO')","('wchO', 'wcjA')","('wcjA', 'mnaA')","('mnaA', 'wcjB')","('wcjB', 'wcjC')","('wcjC', 'wcjD')","('wcjD', 'ugd')","('ugd', 'wcjE-')","('wcjC', 'ugd')","('ugd', 'wcjE')","('wcjG', 'wciB')","('wciB', 'wcrB')","('wcrB', 'wcrC')","('wcrC', 'wcrD-')","('wcrD-', 'wciF')","('wciF', 'wciG')","('wciG', 'glf')","('glf', 'wcrH')","('wcrC', 'wcrD')","('wcrD', 'wciF')","('wciF', 'wcrG')","('wcrG', 'glf')","('wchA', 'wchJ')","('wchJ', 'wchK')","('wchK', 'wcyK')","('wcyK', 'wcwC')","('wcwC', 'wcrL')","('wcrL', 'wcwT')","('wcwT', 'wcwU')","('wcwU', 'gct-')","('gct-', 'wcjE-')","('wcwU', 'gct')","('gct', 'wcjE')","('wcjE', 'aliA-')","('wcyK', 'wcwR')","('wcwR', 'wcrL')","('wciJ', 'wcxB')","('wcxB', 'wcxD')","('wcxD', 
'wcxE')","('wcxE', 'wcxF')","('wcxF', 'mnaB')","('mnaB', 'mnaA')","('wchK', 'whaG-')","('whaG-', 'abp1')","('abp1', 'abp2')","('abp2', 'wciF')","('wciF', 'wcrD')","('wcrD', 'wciG')","('wchK', 'wchL')","('wchL', 'wchM')","('wchM', 'wchN')","('wchN', 'wciY')","('wciY', 'lrp')","('wchN', 'wciZ')","('wciZ', 'wchX')","('wchX', 'gtp1')","('gtp1', 'gtp2')","('gtp2', 'gtp3')","('gtp3', 'rmlB')","('rmlD', 'glf')","('glf', 'wcjE')","('wchN', 'wciZ-')","('wciZ-', 'wchX')","('wchF', 'wcxG')","('wcxG', 'abp1')","('abp2', 'wciP')","('wciP', 'wcrT')","('wcrT', 'wcrU')","('wcrU', 'wcrV')","('wcrV', 'rmlA')","('wchA', 'wciB')","('wciB', 'wcrP')","('wcrP', 'wcrQ')","('wcrQ', 'wcrR')","('wcrR', 'wcrT')","('wcrV', 'ugd')","('ugd', 'glf')","('glf', 'rmlA')","('wchF', 'wciU')","('wciU', 'wcxM')","('wcxM', 'wciV')","('wciV', 'wciW')","('wciW', 'wciX')","('wciX', 'wciY')","('wciY', 'gct')","('gct', 'HG94-')","('HG94-', 'rmlA')","('wciU', 'wciV')","('wciW', 'wciY')","('wciW', 'wciX-')","('wciX-', 'wciY')","('wchO', 'wchP')","('wchP', 'wchQ')","('wchQ', 'mnaA')","('mnaA', 'rmlA')","('wchQ', 'wchR')","('wchR', 'wchS')","('wchS', 'rbsF')","('rbsF', 'mnaA')","('mnaA', 'wchU')","('wchU', 'HG264-')","('HG264-', 'rmlA')","('wciB', 'whaJ')","('whaJ', 'wciL')","('wciL', 'wcwK')","('wcwK', 'wciD')","('wciD', 'whaF')","('whaF', 'wciG')","('wcwC', 'ugd')","('ugd', 'wcwV')","('wcwV', 'whaB')","('whaB', 'wcwX')","('wcwX', 'glf')","('wchF', 'wchV')","('wchV', 'wchW')","('wchW', 'wchX')","('gtp3', 'rmlA')","('wchF', 'whaK')","('whaK', 'whaL')","('whaL', 'wcyS')","('wcyS', 'wcrN')","('wcrN', 'HG270-')","('HG270-', 'rmlA')","('wcjH', 'wciB')","('wciB', 'wcrJ')","('wcrJ', 'wcrM')","('wcrM', 'wcrH')","('wcrH', 'glf')","('wcrP', 'wcrR')","('wcrR', 'wcrW')","('wcrW', 'wcrX')","('wcrX', 'ugd')","('wchF', 'wcyH')","('wcyH', 'wcyI')","('wcyI', 'wchQ')","('wchQ', 'wcyS')","('wcrN', 'HG272-')","('HG272-', 'rmlA')","('wcrN', 'HG271-')","('HG271-', 'rmlA')","('wciB', 'wciC')","('wciC', 'wciD')","('wciD', 
'wciE')","('wciE', 'wciF')","('wciF', 'wciG*')","('wciG*', 'glf')","('glf', 'wcjE-')","('wciN', 'wciO')","('wciO', 'wcrC')","('wcrC', 'wciD')","('wciB', 'wcrO')","('wcrO', 'wcrC')","('wcrD', 'glf')","('glf', 'wcyO')","('wciB', 'wcrI')","('wcrI', 'wcrJ')","('wcrJ', 'wcrK-')","('wcrK-', 'mnp1')","('mnp1', 'wcrH')","('wcrH', 'mnp2')","('mnp2', 'wciG')","('wcrH', 'wciG')","('wciB', 'wciC-')","('wciC-', 'wciD')","('wciF', 'wciG-')","('wciG-', 'glf-')","('glf-', 'wcjE-')","('wcxB', 'wciL')","('wciL', 'wcyQ')","('wcyQ', 'wcxS')","('wcxS', 'wcyR')","('wcyR', 'gct')","('gct', 'fnlA')","('fnlC', 'HG273-')","('HG273-', 'rmlA')"
"('2-acetamido-4-amino-2,4,6-trideoxygalactose', 'galacturonic acid')",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galacturonic acid', 'galacturonic acid')",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'rhamnose')",0,0,0,0,0,16,16,0,16,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2,2,0,0,1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,3,1,1,3,3,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'rhamnose')",0,0,0,0,0,4,4,0,3,1,1,1,1,1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'glucose')",0,0,0,0,0,5,5,0,5,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,1,1,2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'glucuronic acid')",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('glucose', 'galactose')",0,0,0,0,0,5,5,0,2,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,9,9,3,1,1,3,3,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,0,0,1,1,3,3,3,0,0,2,1,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,2,0,0,2,1,1,2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('galactose', 'ribitol')",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('ribitol', 'rhamnose')",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"('rhamnose', 'galactose')",0,0,0,0,0,7,7,0,5,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,2,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
# All gene pairs that act as keys of wholePairs, in the dict's iteration order
li_pairs_genes = [*wholePairs]

In [49]:
# Count how many times each known pair of adjacent genes appears in the dataset.
# Every pair from li_pairs_genes starts at zero.
fre_pairgenes = {pairgenes: 0 for pairgenes in li_pairs_genes}

for key in serotypes:
    # Ignore serotypes which have no structure of sugar
    if len(serotypes[key]['sugars']) == 0:
        continue

    # Simplify the gene sequence of this serotype
    simply_genes = simplify_genes(serotypes[key]['genes'])

    # Adjacent genes of the simplified sequence, stored as tuples
    p_genes = [(simply_genes[i], simply_genes[i+1])
               for i in range(len(simply_genes)-1)]

    for gene_pair in p_genes:
        # Dict membership replaces a full rescan of li_pairs_genes for every
        # observed pair; pairs that are not tracked are skipped, as before.
        if gene_pair in fre_pairgenes:
            fre_pairgenes[gene_pair] += 1
                

In [50]:
# Occurrence counts only, in the same order as the keys of fre_pairgenes
li_total_pair_genes_vals = [*fre_pairgenes.values()]

In [51]:
# Flatten wholePairs into two parallel lists: position n of li_store_genes and
# li_store_sugars together describe one (pair of genes, pair of sugars) row.
li_store_genes = []
li_store_sugars = []

# Scratch names; after the loops they hold the last gene pair / sugar pair seen
store_genes = []
store_sugars = []

for store_genes in li_pairs_genes:
    for store_sugars in wholePairs[store_genes]:
        # Repeat the gene pair once for every sugar pair recorded under it
        li_store_genes.append(store_genes)
        li_store_sugars.append(store_sugars)

In [52]:
# One slot per row of the flattened (gene pair, sugar pair) table.
# Sized from li_store_genes instead of a hard-coded row count so the cell
# keeps working when the dataset (and therefore the table length) changes.
pair_occur = [0] * len(li_store_genes)

for genes, occurrences in fre_pairgenes.items():
    # list.index returns the FIRST row of this gene pair, so the count is
    # written only in front of each group of rows; the other rows stay 0.
    i = li_store_genes.index(genes)
    pair_occur[i] = occurrences

In [53]:
# Per-row counters for the flattened (gene pair, sugar pair) table.
# Sized from li_store_genes instead of a hard-coded row count so they always
# match the table length.
pair_processing = [0] * len(li_store_genes)
pair_not_processing = [0] * len(li_store_genes)

for key in serotypes:
    # Ignore serotypes which have no structure of sugar
    if len(serotypes[key]['sugars']) == 0:
        continue

    # With serotypes which have their own sugar structure
    simply_genes = simplify_genes(serotypes[key]['genes'])  # simplify the gene sequence
    lin_sugars = serotypes[key]['sugars']  # local reference to this serotype's sugars

    # Adjacent genes / adjacent sugars of this serotype, stored as tuples
    p_genes = [(simply_genes[i], simply_genes[i+1])
               for i in range(len(simply_genes)-1)]
    p_sugars = [(lin_sugars[j], lin_sugars[j+1])
                for j in range(len(lin_sugars)-1)]

    for gene_pair in p_genes:
        for d, stored_genes in enumerate(li_store_genes):
            if gene_pair != stored_genes:
                continue

            # Row d belongs to this gene pair: add one for every adjacent
            # sugar pair of the serotype that equals the row's sugar pair.
            for sugar_pair in p_sugars:
                # Stored sugar pairs are lists, so convert the tuple first
                if list(sugar_pair) == li_store_sugars[d]:
                    pair_processing[d] += 1

In [54]:
# Shallow copies of the per-row counters, used as table columns below
li_pair_occur = pair_occur[:]
li_pair_processing = pair_processing[:]

In [55]:
# Alias of pair_processing (same list object, not a copy)
pair_processing_without_one = pair_processing

# Every count reduced by one: a count of 1 becomes 0, any other value n
# becomes n - 1 (so a count of 0 would become -1).
li_pair_processing_without_one = [
    count - 1 for count in pair_processing_without_one
]


In [56]:
# For every row of the flattened (gene pair, sugar pair) table, collect the
# serotypes in which that gene pair and that sugar pair occur together.
# Rows start as the '' placeholder and become a list on their first match.
# Sized from li_store_genes instead of a hard-coded row count.
ser_pair_processing = [''] * len(li_store_genes)
ser_pair_not_processing = []

for key in serotypes:
    # Ignore serotypes which have no structure of sugar
    if len(serotypes[key]['sugars']) == 0:
        continue

    # With serotypes which have their own sugar structure
    simply_genes = simplify_genes(serotypes[key]['genes'])  # simplify the gene sequence
    lin_sugars = serotypes[key]['sugars']  # local reference to this serotype's sugars

    # Adjacent genes / adjacent sugars of this serotype, stored as tuples
    p_genes = [(simply_genes[i], simply_genes[i+1])
               for i in range(len(simply_genes)-1)]
    p_sugars = [(lin_sugars[j], lin_sugars[j+1])
                for j in range(len(lin_sugars)-1)]

    for gene_pair in p_genes:
        # All rows that belong to this gene pair (empty when it is not stored)
        pairgenes_index = [p for p, x in enumerate(li_store_genes) if x == gene_pair]

        for sugar_pair in p_sugars:
            for p in pairgenes_index:
                # Stored sugar pairs are lists, so convert the tuple first
                if list(sugar_pair) == li_store_sugars[p]:
                    # list('') is [], so the placeholder turns into [key] on
                    # the first match and the list grows on later matches.
                    # The previous `== ['']` branch could never be true for a
                    # string placeholder and has been removed.
                    ser_pair_processing[p] = list(ser_pair_processing[p]) + [key]

In [57]:
# Remove repeated serotypes from every row of ser_pair_processing while
# keeping first-seen order. Rows without any serotype keep the '' placeholder.
# Sized from ser_pair_processing instead of a hard-coded row count.
modify_ser_pair_processing = [''] * len(ser_pair_processing)

for i, sers in enumerate(ser_pair_processing):
    for ser in sers:
        if ser not in modify_ser_pair_processing[i]:
            # list('') is [], so the placeholder becomes a list on first use
            modify_ser_pair_processing[i] = list(modify_ser_pair_processing[i]) + [ser]

# Name used for the serotype column of the summary tables
li_ser_pair_processing = modify_ser_pair_processing

In [58]:
# Full summary table: one row per (pair of genes, pair of sugars), with the
# gene-pair occurrence count, both processing counts and the serotype list.
total_pairgenes_df_with_ser = pd.DataFrame(dict([
    ('Pair of genes', li_store_genes),
    ('Pair of sugars', li_store_sugars),
    ('Num. of occurrence of pair of gene (Written in front of each pair of genes)',
     li_pair_occur),
    ('Num. of occurrence where pair of sugars processing (included observation)',
     li_pair_processing),
    ('Num. of occurrence where pair of sugars processing (without observation)',
     li_pair_processing_without_one),
    ('Serotype(s) where processing pair of genes and sugars',
     li_ser_pair_processing),
]))

total_pairgenes_df_with_ser

Unnamed: 0,Pair of genes,Pair of sugars,Num. of occurrence of pair of gene (Written in front of each pair of genes),Num. of occurrence where pair of sugars processing (included observation),Num. of occurrence where pair of sugars processing (without observation),Serotype(s) where processing pair of genes and sugars
0,"(wchB, wchC)","[2-acetamido-4-amino-2,4,6-trideoxygalactose, ...",1,1,0,[ser_1]
1,"(wchB, wchC)","[galacturonic acid, galacturonic acid]",0,1,0,[ser_1]
2,"(wchC, wchD)","[2-acetamido-4-amino-2,4,6-trideoxygalactose, ...",1,1,0,[ser_1]
3,"(wchC, wchD)","[galacturonic acid, galacturonic acid]",0,1,0,[ser_1]
4,"(wchD, gla)","[2-acetamido-4-amino-2,4,6-trideoxygalactose, ...",1,1,0,[ser_1]
...,...,...,...,...,...,...
1189,"(fnlC, HG273-)","[N-acetylfucosamine, galactose]",0,1,0,[ser_45]
1190,"(fnlC, HG273-)","[galactose, rhamnose]",0,1,0,[ser_45]
1191,"(HG273-, rmlA)","[N-acetylgalactosamine, N-acetylfucosamine]",1,1,0,[ser_45]
1192,"(HG273-, rmlA)","[N-acetylfucosamine, galactose]",0,1,0,[ser_45]


In [59]:
# Same table as above but without the gene-pair occurrence column
total_pairgenes_df_remove_fre = pd.DataFrame(dict(zip(
    [
        'Pair of genes',
        'Pair of sugars',
        'Num. of occurrence where pair of sugars processing (included observation)',
        'Num. of occurrence where pair of sugars processing (without observation)',
        'Serotype(s) where processing pair of genes and sugars',
    ],
    [
        li_store_genes,
        li_store_sugars,
        li_pair_processing,
        li_pair_processing_without_one,
        li_ser_pair_processing,
    ],
)))

In [60]:
# Positions of rows whose serotype entry has length exactly 1
single_index = [
    i for i, sers in enumerate(li_ser_pair_processing) if len(sers) == 1
]

# Every other row position of the flattened table, in original order
li_indexs = [i for i in range(len(li_store_genes)) if i not in single_index]

# Parallel column lists restricted to the kept rows
eli_li_store_genes = [li_store_genes[i] for i in li_indexs]
eli_li_store_sugars = [li_store_sugars[i] for i in li_indexs]
eli_li_pair_processing = [li_pair_processing[i] for i in li_indexs]
eli_li_pair_processing_without_one = [
    li_pair_processing_without_one[i] for i in li_indexs
]
eli_li_ser_pair_processing = [li_ser_pair_processing[i] for i in li_indexs]


In [61]:
# Table built from the eli_* lists, i.e. without the rows whose serotype
# entry has length 1.
eli_total_pairgenes_df_remove_fre = pd.DataFrame(dict([
    ('Pair of genes', eli_li_store_genes),
    ('Pair of sugars', eli_li_store_sugars),
    ('Num. of occurrence where pair of sugars processing (included observation)',
     eli_li_pair_processing),
    ('Num. of occurrence where pair of sugars processing (without observation)',
     eli_li_pair_processing_without_one),
    ('Serotype(s) where processing pair of genes and sugars',
     eli_li_ser_pair_processing),
]))

eli_total_pairgenes_df_remove_fre

Unnamed: 0,Pair of genes,Pair of sugars,Num. of occurrence where pair of sugars processing (included observation),Num. of occurrence where pair of sugars processing (without observation),Serotype(s) where processing pair of genes and sugars
0,"(rmlA, rmlC)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
1,"(rmlA, rmlC)","[rhamnose, rhamnose]",5,4,"[ser_2, ser_7B, ser_17F, ser_17A]"
2,"(rmlA, rmlC)","[rhamnose, glucose]",6,5,"[ser_2, ser_18F, ser_18B, ser_18C, ser_32F, se..."
3,"(rmlA, rmlC)","[glucose, galactose]",6,5,"[ser_6A, ser_6B, ser_17A, ser_18F, ser_18B, se..."
4,"(rmlA, rmlC)","[galactose, ribitol]",2,1,"[ser_6A, ser_6B]"
...,...,...,...,...,...
393,"(wcyH, wcyI)","[rhamnose, glucose]",2,1,"[ser_32F, ser_32A]"
394,"(wcyI, wchQ)","[glucose, rhamnose]",4,3,"[ser_32F, ser_32A]"
395,"(wcyI, wchQ)","[rhamnose, glucose]",2,1,"[ser_32F, ser_32A]"
396,"(wchQ, wcyS)","[glucose, rhamnose]",4,3,"[ser_32F, ser_32A]"


In [62]:
# Show the table ordered by the "without observation" count, largest first
eli_total_pairgenes_df_remove_fre.sort_values(
    by='Num. of occurrence where pair of sugars processing (without observation)',
    ascending=False,
)

Unnamed: 0,Pair of genes,Pair of sugars,Num. of occurrence where pair of sugars processing (included observation),Num. of occurrence where pair of sugars processing (without observation),Serotype(s) where processing pair of genes and sugars
0,"(rmlA, rmlC)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
19,"(rmlC, rmlB)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
48,"(rmlB, rmlD)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
38,"(wchA, wchF)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
203,"(wchJ, wchK)","[glucose, galactose]",10,9,"[ser_11F, ser_11A, ser_11B, ser_11C, ser_13, s..."
...,...,...,...,...,...
169,"(wcjD, ugd)","[glucuronic acid, glucose]",2,1,"[ser_9A, ser_9V]"
168,"(wcjD, ugd)","[galactose, glucuronic acid]",2,1,"[ser_9A, ser_9V]"
167,"(wcjD, ugd)","[N-acetylmannosamine, galactose]",2,1,"[ser_9A, ser_9V]"
166,"(wcjD, ugd)","[glucose, N-acetylmannosamine]",2,1,"[ser_9A, ser_9V]"


In [63]:
# Keep only rows whose "included observation" count is greater than 4
condition = eli_total_pairgenes_df_remove_fre['Num. of occurrence where pair of sugars processing (included observation)'] > 4

# Sort by the "without observation" count (largest first), then apply the mask.
# The mask was built on the unsorted frame, so its index order differs from the
# sorted frame; .loc aligns a boolean Series on the index, which avoids the
# "Boolean Series key will be reindexed" UserWarning raised by plain [] masking.
eli_total_pairgenes_df = eli_total_pairgenes_df_remove_fre.sort_values(
    by=['Num. of occurrence where pair of sugars processing (without observation)'],
    ascending=False,
).loc[condition]

  eli_total_pairgenes_df = eli_total_pairgenes_df_remove_fre.sort_values(by=['Num. of occurrence where pair of sugars processing (without observation)'],


In [64]:
# Display the filtered table: rows with an "included observation" count above 4,
# ordered by the "without observation" count in descending order.
eli_total_pairgenes_df 

Unnamed: 0,Pair of genes,Pair of sugars,Num. of occurrence where pair of sugars processing (included observation),Num. of occurrence where pair of sugars processing (without observation),Serotype(s) where processing pair of genes and sugars
0,"(rmlA, rmlC)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
19,"(rmlC, rmlB)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
48,"(rmlB, rmlD)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
38,"(wchA, wchF)","[glucose, rhamnose]",17,16,"[ser_2, ser_7F, ser_7A, ser_7B, ser_17F, ser_1..."
203,"(wchJ, wchK)","[glucose, galactose]",10,9,"[ser_11F, ser_11A, ser_11B, ser_11C, ser_13, s..."
198,"(wchA, wchJ)","[glucose, galactose]",10,9,"[ser_11F, ser_11A, ser_11B, ser_11C, ser_13, s..."
95,"(rmlD, glf-)","[glucose, rhamnose]",9,8,"[ser_7F, ser_7A, ser_7B, ser_17F, ser_18F, ser..."
205,"(wchJ, wchK)","[galactose, N-acetylglucosamine]",8,7,"[ser_11F, ser_11B, ser_11C, ser_14, ser_15F, s..."
204,"(wchJ, wchK)","[galactose, galactose]",8,7,"[ser_11F, ser_11A, ser_11B, ser_11C, ser_15F, ..."
200,"(wchA, wchJ)","[galactose, N-acetylglucosamine]",8,7,"[ser_11F, ser_11B, ser_11C, ser_14, ser_15F, s..."
