# Parsing the output of NucDiff (Across 36 clinical Mtb isolates)

### Maximillian Marin
### mgmarin@g.harvard.edu

### Date: 20/07/29

### Goal: Parsing the variant calling output of NucDiff from the dataset of 36 clinical Mtb isolates

#### 1) Parse NucDiff output regarding SVs in each isolate


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

%matplotlib inline

In [2]:
#import vcf

#import pickle
#from Bio import SeqIO
#import plotly.express as px

In [3]:
#!pip install bcbio-gff

In [4]:
#!conda install -c bioconda bcbio-gff -y 

In [5]:
#sys.path.append('/home/mm774/conda3/pkgs/bcbio-gff-0.6.6-pyh864c0ab_1/site-packages')
#from BCBio import GFF

#### Pandas Viewing Settings

In [6]:
# Raise pandas' display limits (rows, columns, line width) for notebook output
for _display_option, _display_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_display_option, _display_value)

## Parse metadata for the 36 total isolates processed (only 36 have circular assemblies)

In [7]:
# Locate the assembly summary table inside the repository's data directory
Repo_DataDir = "../../Data"
PMP_SM_ResultsSummary_Dir_210108 = f"{Repo_DataDir}/210108_PMP_SM_50CI_V7_ResultsSummary"
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = (
    f"{PMP_SM_ResultsSummary_Dir_210108}"
    "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"
)

# Load the tab-separated summary; the shorter name below is an alias for the same table
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(
    PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH,
    sep="\t",
)
PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary

# Sample IDs of interest, in the order they appear in the summary table
SampleIDs_36CI_SOI = list(PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].values)
print(",".join(SampleIDs_36CI_SOI))

# Lookup dictionaries from SampleID to one metadata column each
ID_To_IlluminaAvrgCov_Dict = dict(
    PMP_36CI_AnalysisSet_AssemblySummary[["SampleID", "IlluminaWGSToH37rv_AvrgCov"]].values
)
ID_To_Lineage_Dict = dict(
    PMP_36CI_AnalysisSet_AssemblySummary[["SampleID", "PrimaryLineage_PB"]].values
)
ID_To_Dataset_Dict = dict(
    PMP_36CI_AnalysisSet_AssemblySummary[["SampleID", "Dataset_Tag"]].values
)

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


## Define PacManPipe output directory - V1

In [8]:
### Directories of the PMP-SM (PacBio assembly and analysis pipeline) outputs,
### which include the variant calling pipeline results

PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"

PMP_SM_Outputs_Dir = f"{PacBio_ProjectDir}/PacmanPipe_SM_Outputs"

PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = f"{PMP_SM_Outputs_Dir}/201201_PMP_SM_TB_Portals_R1_Output_V2"


## Standard fields of the GFF3 format (https://uswest.ensembl.org/info/website/upload/gff3.html#:~:text=GFF3%20File%20Format%20%2D%20Definition%20and,on%20the%20Version%203%20specifications.)

1. seqid - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix. Important note: the seq ID must be one used within Ensembl, i.e. a standard chromosome name or an Ensembl identifier such as a scaffold ID, without any additional content such as species or assembly. See the example GFF output below.
2. source - name of the program that generated this feature, or the data source (database or project name)
3. type - type of feature. Must be a term or accession from the SOFA sequence ontology
4. start - Start position of the feature, with sequence numbering starting at 1.
5. end - End position of the feature, with sequence numbering starting at 1.
6. score - A floating point value.
7. strand - defined as + (forward) or - (reverse).
8. phase - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on.
9. attributes - A semicolon-separated list of tag-value pairs, providing additional information about each feature. Some of these tags are predefined, e.g. ID, Name, Alias, Parent - see the GFF documentation for more details.


### Notes regarding NucDiff GFF formatting for SVs

Types of Length attributes

1. ins_len
2. del_len
3. subst_len (substitution)
4. blk_len (Inversion)


In [9]:
def GFF_AttributesCol_To_Dict(input_GFF_AttributesCol_Str):
    """Convert a GFF3 attributes column (column 9) into a dictionary.

    The attributes column is a semicolon-separated list of ``tag=value``
    pairs, e.g. ``"ID=SV_1;Name=insertion;ins_len=35"``.

    Parameters
    ----------
    input_GFF_AttributesCol_Str : str
        Raw text of the attributes column of one GFF record.

    Returns
    -------
    dict
        Maps each tag to its value (both strings). Fields without an "="
        are skipped; if a tag occurs more than once, the last value wins.
    """
    dict_INFO = {}
    for field in input_GFF_AttributesCol_Str.split(";"):
        # Split on the FIRST "=" only, so a value that itself contains "="
        # (e.g. "Note=a=b") is kept whole instead of being truncated.
        key, separator, value = field.partition("=")
        if separator:
            dict_INFO[key] = value

    return dict_INFO
        

In [10]:
def parse_NucDiff_SV_GFF(input_NucDiff_Query_Ref_Struct_GFF_PATH):
    """Parse a NucDiff structural-difference GFF3 file into a DataFrame.

    Every non-comment record whose ``Name`` attribute is one of the
    recognised SV types becomes one output row. The SV length is read from
    the first of ``ins_len``, ``del_len``, ``subst_len`` or ``blk_len`` that
    is present in the record's attributes (0, with a printed notice, when
    none is present). Comment lines, blank lines and records without a
    recognised ``Name`` are skipped.

    Parameters
    ----------
    input_NucDiff_Query_Ref_Struct_GFF_PATH : str
        Path to the NucDiff GFF3 file.

    Returns
    -------
    pandas.DataFrame
        One row per retained record with five positional columns (0-4):
        sequence ID, 0-based start, end, SV type, SV length. The frame has
        these five columns even when no record is retained, so callers can
        always assign five column names.

    Raises
    ------
    ValueError
        If a start, end or length value cannot be converted to ``int``.
    IndexError
        If a record has fewer than nine tab-separated fields.
    """
    All_Valid_SV_Types = frozenset([
        'insertion', 'duplication', 'tandem_duplication',
        'deletion', 'collapsed_repeat', 'collapsed_tandem_repeat',
        'inversion', 'substitution',
    ])
    Duplication_SV_Types = ('duplication', 'tandem_duplication')

    # Length attributes, in the order in which they are looked up
    SV_Length_Attribute_Keys = ("ins_len", "del_len", "subst_len", "blk_len")

    listOf_InfoTuples = []

    with open(input_NucDiff_Query_Ref_Struct_GFF_PATH) as input_SV_GFF:

        for line in input_SV_GFF:
            # Skip header/comment lines and blank lines
            if line.startswith("#") or not line.strip():
                continue

            # Strip "\r" as well so CRLF files do not leave it on the last attribute
            NucDiff_GFF_Row_Line = line.rstrip("\r\n").split("\t")

            i_seqID_Chrom = NucDiff_GFF_Row_Line[0]
            i_start = int(NucDiff_GFF_Row_Line[3])
            i_end = int(NucDiff_GFF_Row_Line[4])
            i_attributes = NucDiff_GFF_Row_Line[8]

            i_Attributes_Dict = GFF_AttributesCol_To_Dict(i_attributes)

            # A record with no "Name" attribute cannot be a recognised SV type
            NucDiff_SV_Type = i_Attributes_Dict.get("Name")
            if NucDiff_SV_Type not in All_Valid_SV_Types:
                continue

            for length_key in SV_Length_Attribute_Keys:
                if length_key in i_Attributes_Dict:
                    GEN_SV_Len = int(i_Attributes_Dict[length_key])
                    break
            else:
                GEN_SV_Len = 0
                print(NucDiff_SV_Type, "has no length attribute")

            # Shift start of all duplications to the left by the length of
            # the duplication (For NucDiff Duplications)
            if NucDiff_SV_Type in Duplication_SV_Types:
                i_start = i_start - GEN_SV_Len

            # 1-based inclusive start -> 0-based start; the end is unchanged
            i_start_0based = i_start - 1
            i_end_0based = i_end

            listOf_InfoTuples.append(
                (i_seqID_Chrom, i_start_0based, i_end_0based, NucDiff_SV_Type, GEN_SV_Len)
            )

    # Fix the five positional columns explicitly so an input without any
    # retained SV still yields a 5-column (empty) frame.
    NucDiff_SV_Info_DF = pd.DataFrame(listOf_InfoTuples, columns=range(5))

    return NucDiff_SV_Info_DF

# Parsing NucDiff SVs

In [11]:
# Per-sample SV tables: as a list (in sample order) and keyed by sample ID
listOf_NucDiff_DFs = []
dictOf_NucDiff_DFs = {}

for sampleID in tqdm(SampleIDs_36CI_SOI):

    # Build the path to this sample's filtered NucDiff SV GFF
    Sample_Output_Dir = f"{PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir}/{sampleID}"
    i_NucDiff_OutputDir = f"{Sample_Output_Dir}/pacbio_VariantCallingVersusH37Rv/NucDiff_Analysis_{sampleID}"
    i_NucDiff_ResultsDir = i_NucDiff_OutputDir + "/results"
    i_NucDiff_Query_Ref_Struct_GFF_PATH = i_NucDiff_ResultsDir + f"/NucDiff_{sampleID}_ref_struct.Filtered.SVs.gff"

    # Parse the GFF, name the columns, tag rows with the sample and order by position
    i_NucDiff_SV_DF = parse_NucDiff_SV_GFF(i_NucDiff_Query_Ref_Struct_GFF_PATH)
    i_NucDiff_SV_DF.columns = ["Chrom", "start_0based", "end_0based", "SV_Type", "SV_Length"]
    i_NucDiff_SV_DF = i_NucDiff_SV_DF.assign(SampleID=sampleID).sort_values(["start_0based", "end_0based"])

    # Length-filtered views (SV_Length >= 50, then additionally without inversions);
    # these are computed here but are not what gets stored below
    i_NucDiff_SV_Wi50bp_LenFilter_DF = i_NucDiff_SV_DF.query("SV_Length >= 50")
    _not_inversion = i_NucDiff_SV_Wi50bp_LenFilter_DF["SV_Type"] != "inversion"
    i_NucDiff_SV_Wi50bp_LenFilter_NoInversions_DF = i_NucDiff_SV_Wi50bp_LenFilter_DF.loc[_not_inversion]

    # Keep the unfiltered, position-sorted table for this sample
    listOf_NucDiff_DFs.append(i_NucDiff_SV_DF)
    dictOf_NucDiff_DFs[sampleID] = i_NucDiff_SV_DF
    

100%|██████████| 36/36 [00:00<00:00, 98.06it/s]


# Concat all SVs (from all 36 CI)

In [12]:
# Stack the per-sample SV tables into one table (each table's own row labels are kept)
NucDiff_SVs_36CI_DF = pd.concat(listOf_NucDiff_DFs)

# Add each record's lineage, looked up from its SampleID
NucDiff_SVs_36CI_DF = NucDiff_SVs_36CI_DF.assign(
    PrimaryLineage=NucDiff_SVs_36CI_DF["SampleID"].map(ID_To_Lineage_Dict)
)


In [13]:
# Count how many SV records of each SV_Type are in the combined table
NucDiff_SVs_36CI_DF["SV_Type"].value_counts()

insertion                  1394
deletion                   1159
duplication                 933
collapsed_repeat            703
tandem_duplication          658
collapsed_tandem_repeat     323
inversion                   158
substitution                156
Name: SV_Type, dtype: int64

In [14]:
# Display every record whose SV_Type is not "inversion" (the result is not stored)
NucDiff_SVs_36CI_DF[ NucDiff_SVs_36CI_DF["SV_Type"] != "inversion" ] 

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID,PrimaryLineage
1,NC_000962.3,71583,71586,duplication,2,M0011368_9,lineage4
0,NC_000962.3,71585,71586,insertion,35,M0011368_9,lineage4
3,NC_000962.3,150889,150903,duplication,13,M0011368_9,lineage4
2,NC_000962.3,150902,150903,insertion,167,M0011368_9,lineage4
4,NC_000962.3,335050,337913,tandem_duplication,2862,M0011368_9,lineage4
...,...,...,...,...,...,...,...
122,NC_000962.3,3935681,3936338,deletion,657,DNA086,lineage4
123,NC_000962.3,3948003,3948196,substitution,193,DNA086,lineage4
124,NC_000962.3,3948196,3949522,deletion,1326,DNA086,lineage4
125,NC_000962.3,4053437,4053549,tandem_duplication,111,DNA086,lineage4


In [15]:
# (rows, columns) of the combined SV table
NucDiff_SVs_36CI_DF.shape

(5484, 7)

In [16]:
# Preview the first three rows of the combined SV table
NucDiff_SVs_36CI_DF.head(3)

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID,PrimaryLineage
1,NC_000962.3,71583,71586,duplication,2,M0011368_9,lineage4
0,NC_000962.3,71585,71586,insertion,35,M0011368_9,lineage4
3,NC_000962.3,150889,150903,duplication,13,M0011368_9,lineage4


In [17]:
# Pull out the per-sample SV table stored under sample ID "DNA086" and show its (rows, columns)
DNA086_SVs_DF = dictOf_NucDiff_DFs["DNA086"]
DNA086_SVs_DF.shape

(127, 6)

In [18]:
# First three DNA086 records with SV_Length of at least 50
DNA086_SVs_DF.query("SV_Length >= 50").head(3)

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID
2,NC_000962.3,150902,150903,insertion,167,DNA086
4,NC_000962.3,171457,171778,deletion,321,DNA086
7,NC_000962.3,334651,336877,duplication,2225,DNA086


In [19]:
# First two DNA086 records whose SV_Type is exactly "deletion"
DNA086_SVs_DF.query("SV_Type == 'deletion' ").head(2)

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID
4,NC_000962.3,171457,171778,deletion,321,DNA086
8,NC_000962.3,361764,361983,deletion,219,DNA086


In [20]:
# (rows, columns) of the DNA086 table after keeping only records with SV_Length >= 50
DNA086_SVs_DF.query("SV_Length >= 50").shape

(82, 6)

In [21]:
# Preview the first ten DNA086 records
DNA086_SVs_DF.head(10)

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID
1,NC_000962.3,71583,71586,duplication,2,DNA086
0,NC_000962.3,71585,71586,insertion,35,DNA086
3,NC_000962.3,150889,150903,duplication,13,DNA086
2,NC_000962.3,150902,150903,insertion,167,DNA086
4,NC_000962.3,171457,171778,deletion,321,DNA086
5,NC_000962.3,171778,171779,collapsed_repeat,1,DNA086
7,NC_000962.3,334651,336877,duplication,2225,DNA086
6,NC_000962.3,336876,336877,insertion,763,DNA086
8,NC_000962.3,361764,361983,deletion,219,DNA086
9,NC_000962.3,361983,362010,collapsed_repeat,27,DNA086


# Output all SVs detected across all 36 clinical isolates (with NucDiff)

In [22]:
import os

Repo_DataDir = "../../Data"

# Directory that receives the combined NucDiff SV table
PMP_36CI_NucDiff_SV_Analysis_Dir = Repo_DataDir + "/210126_PMP_36CI_NucDiff_SV_Analysis_Dir"

# Create the output directory if it is missing. exist_ok=True lets this cell be
# re-run without the "mkdir: cannot create directory ...: File exists" error
# that a plain `!mkdir` produces when the directory is already there.
os.makedirs(PMP_36CI_NucDiff_SV_Analysis_Dir, exist_ok=True)

# Path of the tab-separated file holding all detected SVs
PMP_36CI_NucDiff_AllSVs_Detected_TSV = PMP_36CI_NucDiff_SV_Analysis_Dir + "/210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.tsv"


mkdir: cannot create directory ‘../../Data/210126_PMP_36CI_NucDiff_SV_Analysis_Dir’: File exists


In [23]:
# Write the combined SV table as a tab-separated file; the row index is not written
NucDiff_SVs_36CI_DF.to_csv(PMP_36CI_NucDiff_AllSVs_Detected_TSV, sep = "\t", index= False)

In [24]:
# (rows, columns) of the combined SV table that was just written out
NucDiff_SVs_36CI_DF.shape

(5484, 7)

In [25]:
# List the contents of the SV analysis output directory (shell command)
!ls -lah $PMP_36CI_NucDiff_SV_Analysis_Dir

total 336K
drwxrwsr-x  2 mm774 farhat  197 Feb 19 21:30 .
drwxrwsr-x 45 mm774 farhat 2.4K Mar 24 19:41 ..
-rw-rw-r--  1 mm774 farhat 218K Feb 19 21:30 210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.50bp.bed
-rw-rw-r--  1 mm774 farhat 327K Feb 19 21:28 210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.bed
-rw-rw-r--  1 mm774 farhat 327K Mar 26 13:42 210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.tsv


### Read in dataframe of SVs (Testing)

In [26]:
# Rebuild the output paths so this cell can be run on its own
Repo_DataDir = "../../Data"
PMP_36CI_NucDiff_SV_Analysis_Dir = f"{Repo_DataDir}/210126_PMP_36CI_NucDiff_SV_Analysis_Dir"
PMP_36CI_NucDiff_AllSVs_Detected_TSV = f"{PMP_36CI_NucDiff_SV_Analysis_Dir}/210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.tsv"

# Read the tab-separated SV table back in, replacing the in-memory table
NucDiff_SVs_36CI_DF = pd.read_csv(
    PMP_36CI_NucDiff_AllSVs_Detected_TSV,
    sep="\t",
)

In [27]:
# (rows, columns) of the table read back from the TSV
NucDiff_SVs_36CI_DF.shape

(5484, 7)