### Current catalog

In [20]:
import pandas as pd
import pathlib
import seedir as sd

catalog_path = '../../data/previous-catalog/18strains.REL-1302-SV-GRCm38.sdp.tab'
catalog_data = pd.read_csv(catalog_path, sep='\t', low_memory=False)
catalog_data.head()

Unnamed: 0,CHROM,START,END,FORMAT,129P2,129S1,129S5,AJ,AKRJ,BALBcJ,...,CASTEiJ,CBAJ,DBA2J,FVBNJ,LPJ,NODShiLtJ,NZOHlLtJ,PWKPhJ,SPRETEiJ,WSBEiJ
0,X,3476745,3477456,POS:CL:BP:TY,X:3476761-3477345;DEL;RAW;DEL,X:3476745-3477456;DEL;RAW;DEL,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,X,3681483,3682211,POS:CL:BP:TY,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,X:3681483-3682211;DEL;RAW;DEL
2,X,3858775,3858777,POS:CL:BP:TY,0,0,0,0,0,0,...,0,0,0,X:3858775-3858777;INS;REF;H6INS,0,0,0,0,0,0
3,X,4274734,4277734,POS:CL:BP:TY,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,X:4274734-4277734;DEL;RAW;DEL,0
4,X,4381696,4387658,POS:CL:BP:TY,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,X:4381696-4387658;DEL;RAW;DEL,0


### Generate bed files per strain per type:

In [21]:
def bed_from_catalog(catalog_file, target_dir, target_strain="ALL", 
                     unified_file=False, naming_method=2, custom_mappings={}, min_len=-1):
    
    """
    bed_from_catalog(catalog_file, target_dir, target_strain, unified_file, naming_method)

    Generates BED Files from current SVs x Strain catalog.
    
    Parameters
    ----------
    catalog_file : str
        Path of the input catalog.
    target_dir : str
        Folder in which the BED files will be created.
    target_strain : str
        Strain (col names from catalog) for which the BED files will be created or "ALL".
    unified_file : bool
        If true, a single file is generated instead of one file per SV Type
    naming_method: int
        Naming for SV Types, currenly supports two options (1,2):
        1 => First 3 Chars, 2 => Any String until character "|" is found
    custom_mappings: dict
        Custom mapping for SV Type names
    min_len: int
        Minimum length
        
    Returns
    -------
    list
        Merged list of SV Types found
    """
    
    data_dicts = {}
    strain_data = []
    all_types = []

    for strain in catalog_data.columns[4:]:
        if(strain != target_strain and target_strain!="ALL"):
            continue

        strain_data=catalog_data[catalog_data[strain]!="0"][strain]
        strain_dicts=data_dicts.get(strain, {})

        for entry in strain_data:
            entry_data = entry.split(";")
            raw_pos = entry_data[0].split(":")

            if naming_method == 1:
                sv_type = entry_data[1][:3]
            else:
                sv_type = entry_data[1].split("|")[0].split("(")[0]
            
            sv_type = custom_mappings.get(sv_type, sv_type)

            chr1="chr"+raw_pos[0]
            start_end=raw_pos[1].split("-")
            start=start_end[0]
            end=start_end[1]
            length = int(end)-int(start)
            
            if start == end or length <= 40 and (sv_type == "INS") :
                start = int(start) - 20
                end = int (end) + 20
                
#             if min_len > 0 and length < min_len:
#                 continue
                
            if unified_file:
                sv_type =  "ALL"

            strain_type_dict = strain_dicts.get(sv_type, [])
            strain_type_dict.append({"chr": chr1, "start": start, "end": end, "len": str(length)}) 

            strain_dicts[sv_type]=strain_type_dict

        data_dicts[strain]=strain_dicts

        final_dir = target_dir + strain + "/"
        pathlib.Path(final_dir).mkdir(parents=True, exist_ok=True)

        for type_dict in data_dicts[strain].keys():
            type_df = pd.DataFrame(data_dicts[strain][type_dict])
            type_df.to_csv(final_dir + strain + "_" + type_dict + ".bed", sep='\t', index=False, header=False)
        
        all_types = list(set(all_types) | set([*data_dicts[strain]]))
        
    return all_types

In [22]:
target_dir="../../data/previous-catalog/50/"
target_strain="DBA2J"
unified_file=False
naming_method=2

custom_mappings={
    "DEL": "DEL",
    "DELINS": "DEL",
    "DELLINKED": "DEL",
    "INS": "INS",
    "INSLINKED": "INS",
    "INSLINKEDINS": "INS",
    "INV": "INV",
    "INVDELINS": "INV",
    "INVINS": "INV",
    "INVDUP": "INV",
}

bed_from_catalog(catalog_path, target_dir, target_strain, False, 2, custom_mappings)

['INV',
 'INVDEL',
 'INS',
 'TANDEMLOWDUP',
 'DEL',
 'TANDEMDUPINV',
 'TANDEMDUP',
 'GAIN']

### Generated files:

In [12]:
sd.seedir(target_dir, style='emoji', itemlimit=5, depthlimit=2, include_files='.*\.bed$', regex=True, sort=True)

📁 previous-catalog/
├─📁 129P2/
│ ├─📄 129P2_DEL.bed
│ ├─📄 129P2_GAIN.bed
│ ├─📄 129P2_INS.bed
│ ├─📄 129P2_INV.bed
│ └─📄 129P2_INVDEL.bed
├─📁 129S1/
│ ├─📄 129S1_DEL.bed
│ ├─📄 129S1_GAIN.bed
│ ├─📄 129S1_INS.bed
│ ├─📄 129S1_INV.bed
│ └─📄 129S1_INVDEL.bed
├─📁 129S5/
│ ├─📄 129S5_DEL.bed
│ ├─📄 129S5_GAIN.bed
│ ├─📄 129S5_INS.bed
│ ├─📄 129S5_INV.bed
│ └─📄 129S5_INVDEL.bed
├─📁 AJ/
│ ├─📄 AJ_DEL.bed
│ ├─📄 AJ_GAIN.bed
│ ├─📄 AJ_INS.bed
│ ├─📄 AJ_INV.bed
│ └─📄 AJ_INVDEL.bed
└─📁 AKRJ/
  ├─📄 AKRJ_DEL.bed
  ├─📄 AKRJ_GAIN.bed
  ├─📄 AKRJ_INS.bed
  ├─📄 AKRJ_INV.bed
  └─📄 AKRJ_INVDEL.bed
