In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

#### This notebook analyses the variants introduced into various reference genome assemblies and then called with Pilon from simulated reads

In [4]:
#import necessary packages
import vcf
import os
import pandas as pd
import numpy as np

from itertools import compress
import ast
import itertools

import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
from Bio import pairwise2
from shutil import copy

#import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib as mpl

# store fonts as Type 42 in both the PDF and PS backends (for exporting to Adobe Illustrator)
for fonttype_key in ('pdf.fonttype', 'ps.fonttype'):
    mpl.rcParams[fonttype_key] = 42

In [5]:
# Tags of the genome assemblies analysed in this notebook, grouped by tag prefix (the order is significant:
# it fixes the order in which assemblies are processed and in which their columns are added).
genome_assemblies = (
    ['RW-TB008']
    + ['N1274', 'N1272', 'N1202', 'N1177', 'N1176', 'N0155', 'N0153', 'N0145', 'N0091', 'N0072', 'N0054', 'N0004']
    + ['M0017522_5', 'M0016737_0', 'M0016395_7', 'M0014888_3', 'M0011368_9', 'M0010874_7', 'M0003941_3']
    + ['DNA120', 'DNA091', 'DNA086', 'DNA075', 'DNA044', 'DNA020', 'DNA019_Rose']
    + ['AZE_02_042']
    + ['02_R1896', '02_R1708', '02_R1179', '02_R0894']
    + ['01_R1430']
)

In [6]:
# Load in the repeat (HT/SSR) regions that need to be mapped, sorted by chromosomal position,
# re-indexed from 0 and restricted to the columns used below
repeat_region_columns = ['chromStart','chromEnd','gene_name','locus_tag','type','polyNT','repeats','INDEL_homoplasy','num_isolates_with_INDEL']
repeat_regions_to_map = (
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/HT_SSR_H37Rv_regions_to_map.pkl')
    .sort_values(by='chromStart')
    .reset_index(drop=True)
    .loc[:, repeat_region_columns]
)

# H37Rv positions to scan for the PPE13 region (first row whose gene_name is 'PPE13'):
# from 2bp before chromStart up to, but not including, chromEnd+2
PPE13_regions = repeat_regions_to_map[repeat_regions_to_map.gene_name == 'PPE13']
PPE13_HT_start = PPE13_regions.chromStart.values[0]
PPE13_HT_end = PPE13_regions.chromEnd.values[0]
PPE13_H37Rv_positions_scan = set(range(PPE13_HT_start-2, PPE13_HT_end+2))

# process the assemblies one at a time
for Mtb_genome_tag in genome_assemblies:

    ##############################################################################################
    # load the H37Rv-assembly mappings with the sequences to alter in each HT/SSR region
    mapping_pickle = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/H37Rv_to_assembly_mappings_3/HT_SSR_'+Mtb_genome_tag+'_mappings.pkl'
    repeat_region_mapping_df = pd.read_pickle(mapping_pickle)

    # keep the columns of interest, in this order
    mapping_columns = ['chromStart','chromEnd','gene_name','locus_tag','polyNT','INDEL_homoplasy','num_isolates_with_INDEL',
                       'repeats','type','assembly_chromStart','assembly_chromEnd','assembly_polyNT','seq_diff_wrt_H37Rv',
                       'add_seq_into_assembly','expected_variant_call']
    repeat_region_mapping_df = repeat_region_mapping_df.loc[:, mapping_columns]

    # H37Rv positions in which to look for indels: for every HT or SSR region, the positions from
    # 2bp before chromStart up to (but not including) chromEnd+2
    H37Rv_positions_scan = set()
    for repeat_region_start, repeat_region_end in zip(repeat_region_mapping_df.chromStart, repeat_region_mapping_df.chromEnd):
        H37Rv_positions_scan.update(range(repeat_region_start-2, repeat_region_end+2))
    ##############################################################################################

    ##############################################################################################
    # Parse the Pilon VCF of this assembly and build Variant_Call_DF with one row per retained call.
    # Two kinds of call are retained (both must be a PASS by Pilon with exactly 1 alternate allele):
    #   (a) PPE13 exception: a 1bp -> 'G' SNP inside the PPE13 window with Mean Base Quality > 20 and
    #       Mean Mapping Quality > 30, stored as the insertion '+G'
    #   (b) an INDEL <= 10bp inside any HT/SSR window that carries depth metrics, has Mean Mapping Quality > 30,
    #       >= 20 valid reads and an alternate allele frequency >= 75%
    VCF_file = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/process_sim_reads_and_call_variants/'+Mtb_genome_tag+'/SmPipe_output_ALT_assembly/pilon/'+Mtb_genome_tag+'.vcf'

    # columns of the resulting DataFrame (also the keys of every retained-call dict)
    variant_call_columns = ['ref_base', 'alt_base', 'ref_position', 'indel_type', 'indel_allele', 'INFO']

    # INFO fields an INDEL record must carry to be analysed (larger structural variants don't have these)
    INDEL_depth_metrics = ('MQ', 'DC', 'IC', 'TD', 'DP')

    # one dict per retained call, in VCF order
    retained_calls = []

    # open the VCF inside a context manager so the file handle is closed once every record has been read
    # (previously one handle per assembly was left open)
    with open(VCF_file, 'r') as VCF_handle:
        vcf_reader = vcf.Reader(VCF_handle)

        #iterate through each record from VCF file
        for record in vcf_reader:

            # every retained call is a PASS by Pilon and has exactly 1 alternate allele
            if (record.FILTER != []) or (len(record.ALT) != 1):
                continue

            ref_pos = int( record.POS )
            ref_allele = str( record.REF )  #allele on H37Rv
            alt_allele = str( record.ALT[0] ) #alternate allele supported by reads

            # (a) check to see if PPE13 insertion 'G' appears as a SNP due to naturally occurring deletion 'T' in adjacent HT
            ##################################################################################################################
            # variant is a SNP to 'G' (Reference Allele is 1 base, Alternate Allele is the single base 'G') within the PPE13 window;
            # a record without BQ / MQ fails the quality thresholds instead of raising KeyError
            if ((len(ref_allele) == 1) and (alt_allele == 'G') and (ref_pos in PPE13_H37Rv_positions_scan)
                and (record.INFO.get('BQ', 0) > 20) and (record.INFO.get('MQ', 0) > 30)): # Mean Base Quality > 20, Mean Mapping Quality > 30

                retained_calls.append({'ref_base': ref_allele,
                                       'alt_base': alt_allele,
                                       'ref_position': record.POS,
                                       'indel_type': 'insertion',
                                       'indel_allele': '+G',
                                       'INFO': record.INFO})
                continue
            ##################################################################################################################

            # (b) INDEL calls within the HT/SSR windows
            ##################################################################################################################
            # call must support an alternate allele and lie in one of the H37Rv positions of interest
            if (record.ALT == [None]) or (ref_pos not in H37Rv_positions_scan):
                continue

            # variant must be an INDEL (1 allele is 1bp, the other allele is > 1bp)
            # that is <= 10bp long (the longer allele is <= 11bp)
            shorter_allele_len, longer_allele_len = sorted([len(ref_allele), len(alt_allele)])
            if (shorter_allele_len != 1) or (longer_allele_len == 1) or (longer_allele_len > 11):
                continue

            #analyze only INDELs with Depth metrics
            if not all(metric in record.INFO for metric in INDEL_depth_metrics):
                continue

            ##### Retrieve Relevant information for filtering quality of Base Call #####
            MQ = record.INFO['MQ'] # Mean Mapping Quality @ locus
            DC = record.INFO['DC'] # Number of Reads w/ Deletion
            IC = record.INFO['IC'] # Number of Reads w/ Insertion
            VD = record.INFO['DP'] # Depth of Valid Reads in Pileup
            TD = record.INFO['TD'] # Total Depth in Pileup

            ### Filtering Criteria
            #---> Mean Mapping Quality > 30
            #---> Number of High Quality Reads >= 20
            if not ((MQ > 30) and (VD >= 20)):
                continue

            #calculate INDEL (Alternate Allele) Frequency from Deletion/Insertion Count & Total Depth (all metrics include ALL reads)
            alt_allele_frequency = float(max(DC , IC)) / float(TD)

            # use only consensus INDELs (where alternate allele frequency >= 75%)
            if alt_allele_frequency < 0.75:
                continue

            # the two alleles have different lengths here, so the call is either a deletion or an insertion;
            # the allele is reported without its leading anchor base
            if len(ref_allele) > len(alt_allele):
                indel_i_type = 'deletion'
                indel_i_allele = '-' + ref_allele[1:]
            else:
                indel_i_type = 'insertion'
                indel_i_allele = '+' + alt_allele[1:]

            # store all of the pertinent information about the Base Call
            retained_calls.append({'ref_base': ref_allele,
                                   'alt_base': alt_allele,
                                   'ref_position': record.POS,
                                   'indel_type': indel_i_type,
                                   'indel_allele': indel_i_allele,
                                   'INFO': record.INFO})
            ##################################################################################################################

    #create DataFrame to hold all base calls for a given sample (all columns are present even when no call was retained)
    Variant_Call_DF = pd.DataFrame(retained_calls, columns = variant_call_columns)
    ##############################################################################################

    ##############################################################################################
    # For every HT/SSR region record the position & allele of the variant call(s) made between
    # chromStart-2 and chromEnd+2 (inclusive): a single value for 1 call, a list for several calls, '' for none
    indel_call_pos = []
    indel_allele_list = []
    for repeat_region_start, repeat_region_end in zip(repeat_region_mapping_df.chromStart, repeat_region_mapping_df.chromEnd):

        # subset to variants called in this region
        called_in_region = (Variant_Call_DF.ref_position >= repeat_region_start-2) & (Variant_Call_DF.ref_position <= repeat_region_end+2)
        Variant_Call_repeat_region_DF = Variant_Call_DF[called_in_region].reset_index(drop = True)
        num_variants_called = len(Variant_Call_repeat_region_DF)

        # variant not called in this region
        if num_variants_called == 0:
            indel_call_pos.append('')
            indel_allele_list.append('')

        # variant called in this region (the only row has index 0 after re-indexing)
        elif num_variants_called == 1:
            indel_call_pos.append(Variant_Call_repeat_region_DF.loc[0,'ref_position'])
            indel_allele_list.append(Variant_Call_repeat_region_DF.loc[0,'indel_allele'])

        # multiple variants called in this region
        else:
            indel_call_pos.append([Variant_Call_repeat_region_DF.loc[variant_i,'ref_position']
                                   for variant_i in Variant_Call_repeat_region_DF.index])
            indel_allele_list.append([Variant_Call_repeat_region_DF.loc[variant_i,'indel_allele']
                                      for variant_i in Variant_Call_repeat_region_DF.index])

    # attach the calls to the regions (same row order as the loop above)
    repeat_region_mapping_df.loc[:,'vc_pos'] = indel_call_pos
    repeat_region_mapping_df.loc[:,'vc_allele'] = indel_allele_list

    # order by chromosomal position and re-index from 0
    repeat_region_mapping_df = repeat_region_mapping_df.sort_values(by='chromStart').reset_index(drop = True)
    ##############################################################################################

    ##############################################################################################
    # create col that categorizes each HT/SSR region & variant call:
    # (1) region didn't map 'n_a', (2) correct variant call 'correct_vc', (3) no variant call 'no_vc', (4) wrong variant call 'incorrect_vc'

    # exceptions for PPE13 adjacent HTs, as (expected call, actual call) pairs that count as correct
    # if "+TG" called and expected "+G" then 1bp insertion was inserted and another occurred naturally in adjacent HT, correctly called together
    # if "-TG" called and expected "-G" then naturally occurring deletions in adjacent HTs were correctly called together
    PPE13_adjacent_HT_calls = [('+G', '+TG'), ('-G', '-TG')]

    variant_recall_list = []

    for expected_variant_i, variant_called_i, gene_name_i in zip(repeat_region_mapping_df.expected_variant_call,
                                                                 repeat_region_mapping_df.vc_allele, repeat_region_mapping_df.gene_name):

        called_with_adjacent_HT = (gene_name_i == 'PPE13') and ((expected_variant_i, variant_called_i) in PPE13_adjacent_HT_calls)

        # no mapping between regions
        if expected_variant_i == 'no match':
            variant_recall = 'n_a'

        # variant called correctly (exact match or one of the PPE13 exceptions)
        elif called_with_adjacent_HT or (expected_variant_i == variant_called_i):
            variant_recall = 'correct_vc'

        # no variant was called
        elif variant_called_i == '':
            variant_recall = 'no_vc'

        # variant called incorrectly; this also covers PPE13 calls outside the exceptions above and regions
        # with several calls (vc_allele is then a list), so that exactly one category is stored per region
        else:
            variant_recall = 'incorrect_vc'

        variant_recall_list.append(variant_recall)

    repeat_region_mapping_df.loc[:,'variant_recall'] = variant_recall_list
    ##############################################################################################
    
    # the recall categories are assigned to repeat_regions_to_map by row position,
    # so both tables must list the regions in the same chromosomal order
    assert np.array_equal(repeat_region_mapping_df.chromStart.values, repeat_regions_to_map.chromStart.values), \
        'HT/SSR regions of ' + Mtb_genome_tag + ' are not in the same order as repeat_regions_to_map'

    # append column to indicate whether variant is called correctly/incorrectly
    repeat_regions_to_map.loc[:, Mtb_genome_tag] = variant_recall_list

    # report the expected vs. actual variant call for PPE13; the row is looked up by gene name (first match)
    # and the values are formatted, so a list of several calls is printed instead of raising TypeError
    PPE13_mapping = repeat_region_mapping_df[repeat_region_mapping_df.gene_name == 'PPE13'].iloc[0]
    print(Mtb_genome_tag)
    print('PPE13 {0}  {1}'.format(PPE13_mapping['expected_variant_call'], PPE13_mapping['vc_allele']))
    print('')

RW-TB008
PPE13 +G  +TG

N1274
PPE13 +G  +G

N1272
PPE13 -G  -TG

N1202
PPE13 +G  +G

N1177
PPE13 -G  -TG

N1176
PPE13 +G  +G

N0155
PPE13 +G  +G

N0153
PPE13 -G  -G

N0145
PPE13 +G  +G

N0091
PPE13 +G  +G

N0072
PPE13 -G  -TG

N0054
PPE13 -G  -G

N0004
PPE13 +G  +G

M0017522_5
PPE13 +G  +G

M0016737_0
PPE13 +G  +G

M0016395_7
PPE13 +G  +G

M0014888_3
PPE13 +G  +G

M0011368_9
PPE13 -G  -TG

M0010874_7
PPE13 +G  +G

M0003941_3
PPE13 +G  +G

DNA120
PPE13 +G  +G

DNA091
PPE13 +G  +G

DNA086
PPE13 +G  +G

DNA075
PPE13 no match  -GGGGG

DNA044
PPE13 no match  -GGGGGGGG

DNA020
PPE13 -G  -TG

DNA019_Rose
PPE13 no match  +GG

AZE_02_042
PPE13 no match  -GGGGG

02_R1896
PPE13 -G  -TG

02_R1708
PPE13 +G  +G

02_R1179
PPE13 no match  +GG

02_R0894
PPE13 +G  +G

01_R1430
PPE13 -G  -TG



In [7]:
# example of the per-assembly dataframe: first 5 rows for the assembly processed last by the loop
repeat_region_mapping_df.head(n=5)

Unnamed: 0,chromStart,chromEnd,gene_name,locus_tag,polyNT,INDEL_homoplasy,num_isolates_with_INDEL,repeats,type,assembly_chromStart,assembly_chromEnd,assembly_polyNT,seq_diff_wrt_H37Rv,add_seq_into_assembly,expected_variant_call,vc_pos,vc_allele,variant_recall
0,36470,36477,bioF2,Rv0032,CCCCCCC,140,4903,1,HT,36470.0,36478.0,CCCCCCC,same,C,+C,36470,+C,correct_vc
1,191391,191398,Rv0161,Rv0161,CCCCCCC,73,274,1,HT,192995.0,193003.0,CCCCCCC,same,C,+C,191391,+C,correct_vc
2,340617,340632,,,GCG,51,132,5,SSR,345095.0,345111.0,GCGGCGGCGGCGGCG,same,G,+G,340631,+G,correct_vc
3,364498,364505,inter_vapC2_Rv0302,inter_Rv0301_Rv0302,GGGGGGG,216,676,1,HT,368977.0,368985.0,GGGGGGG,same,G,+G,364498,+G,correct_vc
4,552495,552504,,,CTA,102,1657,3,SSR,557012.0,557022.0,CTACTACTA,same,C,+C,552504,+C,correct_vc


### (aggregate) count variant recall types for each repeat region across assemblies

In [8]:
# variant recall categories assigned to every (repeat region, assembly) pair
recall_categories = ['correct_vc', 'n_a', 'no_vc', 'incorrect_vc']

# for each repeat region (row) count how many assemblies (columns) fall into each category
# and append the counts to the repeat regions dataframe, one column per category
recall_by_assembly = repeat_regions_to_map.loc[:, genome_assemblies]
for recall_category in recall_categories:
    repeat_regions_to_map.loc[:, recall_category] = (recall_by_assembly == recall_category).sum(axis=1)

# calculate proportion of correct variant calls among the assemblies the region mapped to
# (the number of assemblies comes from genome_assemblies instead of a hard-coded 33)
num_assemblies_mapped = float(len(genome_assemblies)) - repeat_regions_to_map.n_a.values
repeat_regions_to_map.loc[:,'prop_correct_vc'] = repeat_regions_to_map.correct_vc / num_assemblies_mapped

# sort by proportion of correct variant calls (highest first), then by chromosomal position, and re-index from 0
repeat_regions_to_map = (
    repeat_regions_to_map
    .sort_values(by=['prop_correct_vc','chromStart'], ascending = [False, True])
    .reset_index(drop = True)
)

In [9]:
# display the repeat regions table, including the per-assembly recall categories and the aggregate counts
repeat_regions_to_map

Unnamed: 0,chromStart,chromEnd,gene_name,locus_tag,type,polyNT,repeats,INDEL_homoplasy,num_isolates_with_INDEL,RW-TB008,...,02_R1896,02_R1708,02_R1179,02_R0894,01_R1430,correct_vc,n_a,no_vc,incorrect_vc,prop_correct_vc
0,36470,36477,bioF2,Rv0032,HT,CCCCCCC,1,140,4903,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
1,191391,191398,Rv0161,Rv0161,HT,CCCCCCC,1,73,274,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
2,340617,340632,,,SSR,GCG,5,51,132,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
3,364498,364505,inter_vapC2_Rv0302,inter_Rv0301_Rv0302,HT,GGGGGGG,1,216,676,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,32,1,0,0,1.0
4,552495,552504,,,SSR,CTA,3,102,1657,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
5,613371,613386,,,SSR,TGA,5,15,28,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
6,691887,691894,mce2D,Rv0592,HT,CCCCCCC,1,198,5158,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
7,799136,799143,Rv0698,Rv0698,HT,CCCCCCC,1,83,4229,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0
8,854252,854261,inter_Rv0759c_Rv0760c,inter_Rv0759c_Rv0760c,HT,CCCCCCCCC,1,776,28077,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,29,4,0,0,1.0
9,868160,868167,Rv0774c,Rv0774c,HT,GGGGGGG,1,64,151,correct_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,33,0,0,0,1.0


In [11]:
# write the repeat regions table (row index included) to CSV
results_csv_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/HT_SSR_H37Rv_sims_results.csv'
repeat_regions_to_map.to_csv(results_csv_path)

In [10]:
# per-assembly recall categories for three selected rows of the repeat regions table
rows_to_inspect = [51, 52, 53]
repeat_regions_to_map.loc[rows_to_inspect, genome_assemblies]

Unnamed: 0,RW-TB008,N1274,N1272,N1202,N1177,N1176,N0155,N0153,N0145,N0091,...,DNA075,DNA044,DNA020,DNA019_Rose,AZE_02_042,02_R1896,02_R1708,02_R1179,02_R0894,01_R1430
51,no_vc,correct_vc,correct_vc,no_vc,no_vc,correct_vc,correct_vc,correct_vc,correct_vc,no_vc,...,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc
52,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,...,correct_vc,correct_vc,correct_vc,no_vc,correct_vc,correct_vc,no_vc,correct_vc,no_vc,correct_vc
53,correct_vc,no_vc,no_vc,correct_vc,correct_vc,no_vc,correct_vc,correct_vc,correct_vc,correct_vc,...,correct_vc,no_vc,correct_vc,no_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc,correct_vc


In [29]:
##############################################################################################
# load the H37Rv-assembly mappings (with the sequences to alter in each HT/SSR region) for a single assembly
Mtb_genome_tag = 'AZE_02_042'
mapping_pickle = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/H37Rv_to_assembly_mappings_3/HT_SSR_'+Mtb_genome_tag+'_mappings.pkl'

# columns to keep, in display order
mapping_columns = ['chromStart','chromEnd','gene_name','locus_tag','polyNT','INDEL_homoplasy','num_isolates_with_INDEL',
                   'repeats','type','assembly_chromStart','assembly_chromEnd','assembly_polyNT','seq_diff_wrt_H37Rv',
                   'add_seq_into_assembly','expected_variant_call']
repeat_region_mapping_df = pd.read_pickle(mapping_pickle).loc[:, mapping_columns]

In [30]:
# display the H37Rv-assembly mapping table currently held in repeat_region_mapping_df
repeat_region_mapping_df

Unnamed: 0,chromStart,chromEnd,gene_name,locus_tag,polyNT,INDEL_homoplasy,num_isolates_with_INDEL,repeats,type,assembly_chromStart,assembly_chromEnd,assembly_polyNT,seq_diff_wrt_H37Rv,add_seq_into_assembly,expected_variant_call
0,36470,36477,bioF2,Rv0032,CCCCCCC,140,4903,1,HT,36470.0,36478.0,CCCCCCCC,+C,,+C
1,191391,191398,Rv0161,Rv0161,CCCCCCC,73,274,1,HT,191228.0,191236.0,CCCCCCC,same,C,+C
2,340617,340632,,,GCG,51,132,5,SSR,343392.0,343408.0,GCGGCGGCGGCGGCG,same,G,+G
3,364498,364505,inter_vapC2_Rv0302,inter_Rv0301_Rv0302,GGGGGGG,216,676,1,HT,367028.0,367036.0,GGGGGGG,same,G,+G
4,552495,552504,,,CTA,102,1657,3,SSR,552398.0,552408.0,CTACTACTA,same,C,+C
5,613371,613386,,,TGA,15,28,5,SSR,613135.0,613151.0,TGATGATGATGATGA,same,T,+T
6,691887,691894,mce2D,Rv0592,CCCCCCC,198,5158,1,HT,691727.0,691735.0,CCCCCCC,same,C,+C
7,799136,799143,Rv0698,Rv0698,CCCCCCC,83,4229,1,HT,799509.0,799517.0,CCCCCCC,same,C,+C
8,854252,854261,inter_Rv0759c_Rv0760c,inter_Rv0759c_Rv0760c,CCCCCCCCC,776,28077,1,HT,854842.0,854850.0,CCCCCCCC,-C,,-C
9,868160,868167,Rv0774c,Rv0774c,GGGGGGG,64,151,1,HT,868750.0,868758.0,GGGGGGG,same,G,+G


In [3]:
# load the table of H37Rv simple sequence repeat (SSR) regions from its pickle
SSR_H37Rv_pickle = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/simple_sequence_repeat_regions/SSR_H37Rv_2-6bp_at_least_3_repeats.pkl'
SSR_H37Rv_df = pd.read_pickle(SSR_H37Rv_pickle)

In [7]:
# (number of rows, number of columns) of the SSR table
SSR_H37Rv_df.shape

(18316, 4)

In [12]:
# SSR regions whose repeated motif is 'GC'
has_GC_motif = SSR_H37Rv_df['motif'] == 'GC'
SSR_H37Rv_df[has_GC_motif]

Unnamed: 0,H37Rv_start,H37Rv_end,motif,repeats
3,1121,1126,GC,3
11,2932,2937,GC,3
16,4543,4550,GC,4
18,6204,6209,GC,3
19,6710,6715,GC,3
27,7417,7422,GC,3
28,9072,9077,GC,3
35,9747,9752,GC,3
45,13389,13394,GC,3
52,15093,15098,GC,3
