In [2]:
import pandas as pd
from pyliftover import LiftOver
from cyvcf2 import VCF

In [6]:
###### Load blood data ######

# Keep only rows that have a chromosome and are flagged "confidently"
# fine-mapped, then drop columns that are empty for every remaining row.
# The filter and dropna are chained (rather than `inplace=True` on a filtered
# slice) so pandas does not emit SettingWithCopyWarning.
blood_traits = pd.read_excel("media-1.xlsx", sheet_name='st4_confidently_fine_mapped_STR')
blood_traits = (
    blood_traits[(blood_traits['chrom'].notna()) & (blood_traits['fine-mapping'] == "confidently")]
    .dropna(axis=1, how='all')
)


## Convert positions from hg19 to hg38

lo = LiftOver('hg19', 'hg38')
# Rows are accumulated in a plain list and turned into a DataFrame once;
# appending with `.loc[len(df)]` inside the loop is quadratic.
lifted_rows = []
for index, row in blood_traits.iterrows():
    chrom_name = 'chr' + str(row['chrom'])
    # NOTE(review): pyliftover documents 0-based input positions; confirm the
    # base of the spreadsheet coordinates (the 10 bp buffer below absorbs a
    # 1 bp shift either way).
    lifted_start = lo.convert_coordinate(chrom_name, row['start_pos (hg19)'])
    lifted_end = lo.convert_coordinate(chrom_name, row['end_pos (hg19)'])
    if lifted_start and lifted_end:
        # Use the first hit of each conversion: element [0] of a hit is the
        # chromosome name, element [1] the lifted position.
        lifted_rows.append([lifted_start[0][0],
                            lifted_start[0][1],
                            lifted_end[0][1], row['repeat_unit']])
    else:
        print(f"Could not lift {row['start_pos (hg19)']}")

hg38_blood_traits = pd.DataFrame(lifted_rows, columns = ['Chrom', 'Start', 'End', 'Motif'])
hg38_blood_traits = hg38_blood_traits.drop_duplicates()
hg38_blood_traits['Start'] = hg38_blood_traits['Start'] - 10 # add a 10bp buffer
hg38_blood_traits['End'] = hg38_blood_traits['End'] + 10 # add a 10bp buffer

# Write the buffered intervals as a header-less, tab-separated loci file,
# sorted by chromosome name and start position.
hg38_blood_traits[['Chrom', 'Start', 'End']].sort_values(['Chrom', 'Start']).to_csv("blood_loci.txt", index = False,
                                                                            header = False, sep = "\t")

In [7]:
#### checking output VCF files ####

# Gather one [chrom, position, repeat unit, END] row per record from the
# per-autosome VCFs (chr1..chr22). Rows are collected in a list and the
# DataFrame is built once at the end: growing a DataFrame one row at a time
# with `.loc[len(df)]` is quadratic in the number of records.
vcf_rows = []
for i in range(1,23):
    vcf_file = VCF(f'blood_vcf/LOO_chr{i}.vcf.gz')
    try:
        for record in vcf_file:
            vcf_rows.append([record.CHROM, record.POS, record.INFO['RU'], record.INFO['END']])
    finally:
        # Release the underlying file handle even if a record lacks RU/END.
        vcf_file.close()

extracted_loci = pd.DataFrame(vcf_rows, columns = ['Chrom', 'Start', 'Motif', 'END'])

In [8]:
nucToNumber={"A":0,"C":1,"G":2,"T":3}
def GetCanonicalOneStrand(repseq):
    r"""Get canonical STR sequence, considering one strand

    The canonical sequence is the first alphabetically
    out of all possible rotations. e.g. CAG -> AGC.

    If the motif contains "/", only the part before the first "/" is
    used. The motif is upper-cased before any comparison. The motif
    "GCN" is special-cased and mapped to "AGC".

    Parameters
    ----------
    repseq : str
          String giving a STR motif (repeat unit sequence)

    Returns
    -------
    canon : str
          The canonical sequence of the STR motif. An empty motif is
          returned unchanged.

    Raises
    ------
    KeyError
          If the (normalised) motif contains a character other than
          A, C, G or T and is not the special-cased "GCN".

    Examples
    --------
    >>> GetCanonicalOneStrand("CAG")
    'AGC'
    >>> GetCanonicalOneStrand("gcn")
    'AGC'
    """
    # Normalise BEFORE the special-case test so that "gcn" or "GCN/..." is
    # handled the same way as "GCN" instead of raising KeyError on "N".
    if "/" in repseq:
        repseq = repseq.split("/")[0]
    repseq = repseq.upper()

    if repseq == "GCN": #HTT
        return "AGC"
    if not repseq:
        # min() over zero rotations would raise; an empty motif has no rotations.
        return repseq

    # Rank every rotation by its per-base numeric code; the smallest is canonical.
    # The lookup raises KeyError for any base missing from nucToNumber.
    rotations = (repseq[k:] + repseq[:k] for k in range(len(repseq)))
    return min(rotations, key=lambda seq: [nucToNumber[base] for base in seq])

In [9]:
# For every lifted blood-trait locus (chrX skipped), look for extracted VCF
# records on the same chromosome whose motif equals the locus's canonical
# motif and whose start lies close to the locus start. Loci with exactly one
# such record are kept; loci with none or several are reported and dropped.
all_found = []
for index,row in hg38_blood_traits.iterrows():
    if row['Chrom'] == "chrX":
        continue
    extracted = extracted_loci[(extracted_loci['Chrom'] == row['Chrom']) &
                               (extracted_loci['Motif'] == GetCanonicalOneStrand(row['Motif']))]
    # A record matches when the locus start falls in [p - 10, p + 10), where p
    # is the record's start. One pass over the candidates replaces rebuilding
    # `list(extracted[...])` on every iteration and looping over the window.
    # NOTE(review): the window is half-open (p + 10 itself is excluded), kept
    # exactly as before; confirm whether a symmetric window was intended.
    found = [[row['Chrom'], row['Motif'], p, end]
             for p, end in zip(extracted['Start'], extracted['END'])
             if p - 10 <= row['Start'] < p + 10]

    if len(found) == 0:
        print(f'not found: {row}')
        print("----------------")
    elif len(found) > 1:
        print(f'duplicated: {row}')
        print("----------------")
    else:
        all_found.append(found[0])

all_found = pd.DataFrame(all_found, columns = ['Chrom', 'Motif', 'Start', 'End'])

not found: Chrom        chr3
Start    15661256
End      15661315
Motif           A
Name: 57, dtype: object
----------------
duplicated: Chrom        chr1
Start    23491429
End      23491464
Motif           A
Name: 104, dtype: object
----------------


In [5]:
#### Load Capillary loci ####

# Header-less table read with read_csv defaults; the file is expected to have
# exactly three columns (the assignment below raises otherwise).
capillary = pd.read_csv("capillary_loci.txt", header=None)
capillary.columns = ['Chrom', 'Start', 'End']

# Combine the capillary loci with the blood-trait loci matched in the VCFs.
# The cell that writes the per-chromosome loci files reads `capillary_blood`,
# so it must be defined here for the notebook to run top to bottom on a fresh
# kernel. Only the three columns shared by both tables are kept.
capillary_blood = pd.concat([capillary, all_found[['Chrom', 'Start', 'End']]], ignore_index=True)

In [14]:
# Emit one header-less, tab-separated loci file per autosome (chr1..chr22).
# A chromosome with no loci still gets an (empty) file.
for chrom in range(1, 23):
    on_this_chrom = capillary_blood['Chrom'] == f'chr{chrom}'
    chrom_df = capillary_blood[on_this_chrom]
    chrom_df.loc[:, ['Chrom', 'Start', 'End']].to_csv(
        f"pathogenic_loci/loci_chr{chrom}.txt",
        header = False,
        index = False,
        sep = "\t",
    )

In [3]:
# Read the header-less, tab-delimited table of missed loci and display it.
missed_loci = pd.read_table("missed_loci.txt", header=None)
missed_loci

Unnamed: 0,0,1,2
0,chr1,227289887,227289904
1,chr1,150579759,150579814
2,chr2,15646393,15646415
3,chr2,23870286,23870305
4,chr2,207588729,207588742
5,chr3,136449901,136449919
6,chr4,3074877,3074968
7,chr4,56858490,56858512
8,chr4,7039089,7039118
9,chr4,39921580,39921605
