In [24]:
import pysam
from pysam import VariantFile as vcf
import operator
from math import log2
import pandas as pd
from pandas import DataFrame as dataframe
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist, squareform
import scipy
import  os
import os.path
import matplotlib.colors as mcolors
from scipy import stats
import csv
import requests
import re
import json


In [25]:
def fetch_gene_info(chromosome, start, end):
    """Query Ensembl BioMart for human genes selected by chromosome/start/end filters.

    Parameters
    ----------
    chromosome : str or int
        Value for the BioMart ``chromosome_name`` filter (e.g. ``"6"``).
    start, end : int
        Values for the BioMart ``start`` and ``end`` filters.

    Returns
    -------
    list[list[str]]
        One list per returned TSV row, fields in attribute order:
        ``[ensembl_gene_id, external_gene_name, description,
        start_position, end_position, strand]``. Every field is a string,
        exactly as received. An empty response yields ``[]``.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or the body is a BioMart error message.
    requests.exceptions.RequestException
        On connection problems or when the request times out.
    """
    query_xml = f"""
    <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">
        <Dataset name="hsapiens_gene_ensembl" interface="default">
            <Filter name="chromosome_name" value="{chromosome}"/>
            <Filter name="start" value="{start}"/>
            <Filter name="end" value="{end}"/>
            <Attribute name="ensembl_gene_id"/>
            <Attribute name="external_gene_name"/>
            <Attribute name="description"/>
            <Attribute name="start_position"/>
            <Attribute name="end_position"/>
            <Attribute name="strand"/>
        </Dataset>
    </Query>
    """

    biomart_url = "http://www.ensembl.org/biomart/martservice"
    # Pass the XML as a query parameter so requests percent-encodes it, and
    # set a timeout: requests waits indefinitely by default.
    response = requests.get(
        biomart_url,
        params={"query": query_xml.strip()},
        timeout=60,
    )

    if response.status_code != 200:
        raise Exception(f"Error fetching data from BioMart: {response.text}")

    body = response.text.strip()
    # NOTE(review): BioMart is understood to report malformed queries as a
    # plain-text "Query ERROR: ..." body with HTTP 200 — confirm against the
    # service; without this check such a body would be parsed as a gene row.
    if body.startswith("Query ERROR"):
        raise Exception(f"Error fetching data from BioMart: {body}")

    # Skip blank lines so an empty result is [] rather than [['']], which
    # would break callers that index into each row.
    genes = [line.split("\t") for line in body.split("\n") if line.strip()]
    return genes


In [26]:
# Regions to annotate, keyed by population label (the pooled "All" entry plus
# one entry per population code). Each value is a list of (start, end) integer
# coordinate pairs; the lookup loop in this notebook queries every pair on
# chromosome 6.
# NOTE(review): the keys look like 1000 Genomes population codes and the ranges
# appear to fall in the chr6 MHC region — confirm, and confirm that the genome
# build of these coordinates matches the Ensembl release being queried.
mhc_race_positions_dict={'ACB': [(32589647, 32805573)],
 'All': [(32453110, 32577355), (32589647, 32716541)],
 'ASW': [(32627859, 32843772), (29939668, 30120966), (29753369, 29913914)],
 'CDX': [(31295439, 31432528)],
 'CEU': [(32627859, 32776791)],
 'CHS': [(32589647, 32716541)],
 'CLM': [(32589647, 32732048), (32453110, 32577355)],
 'ESN': [(29720403, 29896285), (32644320, 32843772)],
 'GBR': [(32453110, 32577355)],
 'GIH': [(32589647, 32698571)],
 'GWD': [(29939668, 30085606), (32627859, 32732048), (29720403, 29913914)],
 'IBS': [(32453110, 32577355), (32589647, 32716541)],
 'ITU': [(32423532, 32554290)],
 'JPT': [(32589647, 32732048)],
 'KHV': [(32453110, 32577355), (32589647, 32716541)],
 'LWK': [(32627859, 32805573)],
 'MSL': [(31317765, 31528792), (29720403, 29913914), (32554291, 32776791)],
 'MXL': [(32589647, 32716541)],
 'PEL': [(32473902, 32616414)],
 'PUR': [(32589647, 32683157)],
 'STU': [(32453110, 32577355), (32589647, 32716541)],
 'TSI': [(32589647, 32698571)],
 'YRI': [(29939668, 30120966), (29720403, 29913914), (32589647, 32882258)]}

In [27]:
def generate_gene_info(gene_list_one_record, chrnum):
    """Reorder one BioMart gene row into the gene columns of the output table.

    Parameters
    ----------
    gene_list_one_record : sequence
        A row with at least six fields, indexed as
        ``[gene id, gene name, description, start, end, strand]``.
    chrnum : str or int
        Chromosome label placed after the ``chr`` prefix (``"6"`` and ``6``
        both give ``chr6``).

    Returns
    -------
    list
        ``[gene id, gene name, "chr<chrnum>:<start>-<end>", strand, description]``.

    Raises
    ------
    IndexError
        If the row has fewer than six fields.
    """
    record = gene_list_one_record
    # Formatting (rather than "chr" + chrnum) lets an integer chromosome work too.
    position = f"chr{chrnum}:{record[3]}-{record[4]}"
    return [record[0], record[1], position, record[5], record[2]]


def extract_numbers(input_string):
    """Return every run of digits (``\\d+``) in ``input_string``, in order, as strings."""
    digit_runs = []
    for match in re.finditer(r'\d+', input_string):
        digit_runs.append(match.group(0))
    return digit_runs

def ifgeneinside_region(complementary_region,gene_region):
    """Tell whether a gene lies entirely inside a region (bounds inclusive).

    ``gene_region`` holds the gene's start and end, each converted with
    ``int()``; ``complementary_region`` holds the region's start and end and
    is compared as given. Returns ``True`` only when region start <= gene
    start and region end >= gene end.
    """
    gene_start = int(gene_region[0])
    gene_end = int(gene_region[1])

    # Guard clause: a gene starting before the region cannot be inside it.
    if not (complementary_region[0] <= gene_start):
        return False
    return bool(complementary_region[1] >= gene_end)

In [28]:
# `allrecords` holds the rows of the output table (one list per row); it is
# filled by the region/gene lookup loop in the next cell.
allrecords=[]

In [29]:
# Build the output rows: for every population label (including "All") and each
# of its regions, look up genes via BioMart and keep those lying entirely
# inside the region. Every label is handled the same way.
#
# The list is rebuilt from scratch so re-running this cell cannot duplicate rows.
allrecords = []
chrom = "6"  # all regions are queried on chromosome 6 (avoids shadowing builtin chr)
gene_cache = {}  # region tuple -> BioMart rows; several populations share a region

for race, arr in mhc_race_positions_dict.items():
    for a in arr:
        print(a)
        if a not in gene_cache:
            # NOTE(review): both bounds are shifted by -1 for the query while
            # the containment test below uses the unshifted region — confirm
            # this offset is intended.
            gene_cache[a] = fetch_gene_info(chromosome=chrom, start=a[0]-1, end=a[1]-1)
        gene_list = gene_cache[a]
        for gene_list_record in gene_list:
            # An empty BioMart response can parse to a single one-field row;
            # rows lacking the six expected fields cannot be indexed below.
            if len(gene_list_record) < 6:
                continue
            if ifgeneinside_region(complementary_region=a, gene_region=[gene_list_record[3], gene_list_record[4]]):
                # Row layout: population, isMHC flag (always 1 here), region
                # position, then the gene columns from generate_gene_info.
                onerecord = [race, 1, chrom + ":" + str(a[0]) + "-" + str(a[1])]
                onerecord += generate_gene_info(gene_list_one_record=gene_list_record, chrnum=chrom)
                allrecords.append(onerecord)




(32589647, 32805573)


(32453110, 32577355)
(32589647, 32716541)
(32627859, 32843772)
(29939668, 30120966)
(29753369, 29913914)
(31295439, 31432528)
(32627859, 32776791)
(32589647, 32716541)
(32589647, 32732048)
(32453110, 32577355)
(29720403, 29896285)
(32644320, 32843772)
(32453110, 32577355)
(32589647, 32698571)
(29939668, 30085606)
(32627859, 32732048)
(29720403, 29913914)
(32453110, 32577355)
(32589647, 32716541)
(32423532, 32554290)
(32589647, 32732048)
(32453110, 32577355)
(32589647, 32716541)
(32627859, 32805573)
(31317765, 31528792)
(29720403, 29913914)
(32554291, 32776791)
(32589647, 32716541)
(32473902, 32616414)
(32589647, 32683157)
(32453110, 32577355)
(32589647, 32716541)
(32589647, 32698571)
(29939668, 30120966)
(29720403, 29913914)
(32589647, 32882258)


In [30]:
#generatedf
def generatedf(columns,allrecords):
    """Build a DataFrame from row lists, one column per entry of ``columns``.

    Column ``columns[i]`` receives element ``i`` of every record in
    ``allrecords``; record elements beyond ``len(columns)`` are ignored.
    """
    column_values = {
        name: [row[position] for row in allrecords]
        for position, name in enumerate(columns)
    }
    return dataframe(column_values)

# Assemble the collected rows into the results table. Column order mirrors the
# row layout: population, isMHC flag, region position, then the gene columns.
dfnew = generatedf(
    columns=[
        "Race",
        "isMHC",
        "Position of disassortative mating region",
        "Gene ID",
        "Gene name",
        "Position of gene",
        "is_complement",
        "Gene description",
    ],
    allrecords=allrecords,
)

# dfnew['is_complement'] = dfnew['is_complement'].apply(lambda x:  "F" if x == "1" else "R")
# dfnew.to_csv("/data2/wangxuedong/mhc_test_data/similarity_region_pythonfiles/csvfiles/csv2_0_with_repeat.csv")

In [31]:
# Inspect the freshly built table; `is_complement` still holds the raw strand
# strings returned by BioMart at this point.
dfnew

Unnamed: 0,Race,isMHC,Position of disassortative mating region,Gene ID,Gene name,Position of gene,is_complement,Gene description
0,ACB,1,6:32589647-32805573,ENSG00000196735,HLA-DQA1,chr6:32628179-32647062,1,"major histocompatibility complex, class II, DQ..."
1,ACB,1,6:32589647-32805573,ENSG00000179344,HLA-DQB1,chr6:32659467-32668383,-1,"major histocompatibility complex, class II, DQ..."
2,ACB,1,6:32589647-32805573,ENSG00000223534,HLA-DQB1-AS1,chr6:32659880-32660729,1,HLA-DQB1 antisense RNA 1 [Source:HGNC Symbol;A...
3,ACB,1,6:32589647-32805573,ENSG00000235040,MTCO3P1,chr6:32706124-32706955,-1,MT-CO3 pseudogene 1 [Source:HGNC Symbol;Acc:HG...
4,ACB,1,6:32589647-32805573,ENSG00000232080,,chr6:32718005-32719170,1,novel transcript
...,...,...,...,...,...,...,...,...
377,YRI,1,6:32589647-32882258,ENSG00000204264,PSMB8,chr6:32840717-32844679,-1,proteasome 20S subunit beta 8 [Source:HGNC Sym...
378,YRI,1,6:32589647-32882258,ENSG00000204261,PSMB8-AS1,chr6:32844078-32846500,1,PSMB8 antisense RNA 1 (head to head) [Source:H...
379,YRI,1,6:32589647-32882258,ENSG00000240065,PSMB9,chr6:32844136-32859851,1,proteasome 20S subunit beta 9 [Source:HGNC Sym...
380,YRI,1,6:32589647-32882258,ENSG00000168394,TAP1,chr6:32845209-32853816,-1,"transporter 1, ATP binding cassette subfamily ..."


In [32]:
# Recode the strand strings: "1" -> "F", anything else -> "R"
# (presumably forward / reverse strand — confirm the intended labels).
# "F" is passed through unchanged so that re-running this cell does not flip
# already-recoded rows to "R".
dfnew['is_complement'] = dfnew['is_complement'].apply(lambda x: "F" if x in ("1", "F") else "R")
#dfnew.to_csv("/data2/wangxuedong/mhc_test_data/similarity_region_pythonfiles/csvfiles/csv2_0_with_repeat.csv")

In [33]:
# Re-display the table to check the strand recoding of `is_complement`.
dfnew

Unnamed: 0,Race,isMHC,Position of disassortative mating region,Gene ID,Gene name,Position of gene,is_complement,Gene description
0,ACB,1,6:32589647-32805573,ENSG00000196735,HLA-DQA1,chr6:32628179-32647062,F,"major histocompatibility complex, class II, DQ..."
1,ACB,1,6:32589647-32805573,ENSG00000179344,HLA-DQB1,chr6:32659467-32668383,R,"major histocompatibility complex, class II, DQ..."
2,ACB,1,6:32589647-32805573,ENSG00000223534,HLA-DQB1-AS1,chr6:32659880-32660729,F,HLA-DQB1 antisense RNA 1 [Source:HGNC Symbol;A...
3,ACB,1,6:32589647-32805573,ENSG00000235040,MTCO3P1,chr6:32706124-32706955,R,MT-CO3 pseudogene 1 [Source:HGNC Symbol;Acc:HG...
4,ACB,1,6:32589647-32805573,ENSG00000232080,,chr6:32718005-32719170,F,novel transcript
...,...,...,...,...,...,...,...,...
377,YRI,1,6:32589647-32882258,ENSG00000204264,PSMB8,chr6:32840717-32844679,R,proteasome 20S subunit beta 8 [Source:HGNC Sym...
378,YRI,1,6:32589647-32882258,ENSG00000204261,PSMB8-AS1,chr6:32844078-32846500,F,PSMB8 antisense RNA 1 (head to head) [Source:H...
379,YRI,1,6:32589647-32882258,ENSG00000240065,PSMB9,chr6:32844136-32859851,F,proteasome 20S subunit beta 9 [Source:HGNC Sym...
380,YRI,1,6:32589647-32882258,ENSG00000168394,TAP1,chr6:32845209-32853816,R,"transporter 1, ATP binding cassette subfamily ..."


In [34]:
# Prefix region positions with "chr" so they use the same format as the
# "Position of gene" column. Values that already start with "chr" are left
# alone, so re-running this cell does not produce "chrchr6:...".
dfnew['Position of disassortative mating region'] = (
    dfnew['Position of disassortative mating region']
    .astype(str)
    .apply(lambda pos: pos if pos.startswith('chr') else 'chr' + pos)
)


In [35]:
# Re-display the table to check that region positions now carry the "chr" prefix.
dfnew

Unnamed: 0,Race,isMHC,Position of disassortative mating region,Gene ID,Gene name,Position of gene,is_complement,Gene description
0,ACB,1,chr6:32589647-32805573,ENSG00000196735,HLA-DQA1,chr6:32628179-32647062,F,"major histocompatibility complex, class II, DQ..."
1,ACB,1,chr6:32589647-32805573,ENSG00000179344,HLA-DQB1,chr6:32659467-32668383,R,"major histocompatibility complex, class II, DQ..."
2,ACB,1,chr6:32589647-32805573,ENSG00000223534,HLA-DQB1-AS1,chr6:32659880-32660729,F,HLA-DQB1 antisense RNA 1 [Source:HGNC Symbol;A...
3,ACB,1,chr6:32589647-32805573,ENSG00000235040,MTCO3P1,chr6:32706124-32706955,R,MT-CO3 pseudogene 1 [Source:HGNC Symbol;Acc:HG...
4,ACB,1,chr6:32589647-32805573,ENSG00000232080,,chr6:32718005-32719170,F,novel transcript
...,...,...,...,...,...,...,...,...
377,YRI,1,chr6:32589647-32882258,ENSG00000204264,PSMB8,chr6:32840717-32844679,R,proteasome 20S subunit beta 8 [Source:HGNC Sym...
378,YRI,1,chr6:32589647-32882258,ENSG00000204261,PSMB8-AS1,chr6:32844078-32846500,F,PSMB8 antisense RNA 1 (head to head) [Source:H...
379,YRI,1,chr6:32589647-32882258,ENSG00000240065,PSMB9,chr6:32844136-32859851,F,proteasome 20S subunit beta 9 [Source:HGNC Sym...
380,YRI,1,chr6:32589647-32882258,ENSG00000168394,TAP1,chr6:32845209-32853816,R,"transporter 1, ATP binding cassette subfamily ..."


In [36]:
# Write the annotated table to the current working directory. No `index=`
# argument is given, so pandas also writes the row index as an unnamed first
# column.
dfnew.to_csv("mhc_0.csv")