In [3]:
import pysam
from pysam import VariantFile as vcf
import operator
from math import log2
import pandas as pd
from pandas import DataFrame as dataframe
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist, squareform
import scipy
import  os
import os.path
import matplotlib.colors as mcolors
from scipy import stats
import csv
from statsmodels.stats.multitest import multipletests

In [4]:
import requests
from xml.etree import ElementTree as ET

def fetch_gene_info(chromosome, start, end):
    query_xml = f"""
    <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">
        <Dataset name="hsapiens_gene_ensembl" interface="default">
            <Filter name="chromosome_name" value="{chromosome}"/>
            <Filter name="start" value="{start}"/>
            <Filter name="end" value="{end}"/>
            <Attribute name="ensembl_gene_id"/>
            <Attribute name="external_gene_name"/>
            <Attribute name="description"/>
            <Attribute name="start_position"/>
            <Attribute name="end_position"/>            
            <Attribute name="strand"/>
        </Dataset>
    </Query>
    """

    biomart_url = "http://www.ensembl.org/biomart/martservice?query="
    response = requests.get(biomart_url + query_xml.strip())

    if response.status_code != 200:
        raise Exception(f"Error fetching data from BioMart: {response.text}")

    genes = [line.split("\t") for line in response.text.strip().split("\n")]
    print(genes)
    return genes


In [5]:
split_race_region={'ACB': {'chr2': [[92299419, 92523478]],
  'chr3': [[89474731, 90288204]],
  'chr5': [[62144306, 62605777]],
  'chr14': [[105863438, 106259905]],
  'chr16': [[36332499, 46403568]],
  'chr18': [[19764009, 20828994]],
  'chr22': [[22691667, 22898645]]},'All':{'chr2': [[108377615, 108667023]],
 'chr3': [[87292011, 87555666]],
 'chr9': [[88434984, 88700396]],
 'chr11': [[39680311, 39930646]],
 'chr12': [[20836055, 21097074], [34328866, 34516639]],
 'chr14': [[31627550, 31888106]]},
 'ASW': {'chr2': [[92299419, 92507872],
   [92633551, 92902449],
   [93680283, 94160185],
   [94892770, 95546698],
   [202715454, 203549066]],
  'chr3': [[93559360, 94116184]],
  'chr8': [[49383975, 49798135], [103467484, 103900956]],
  'chr10': [[72960046, 73409340]],
  'chr14': [[105859287, 106198900]],
  'chr16': [[36260059, 46728534]]},
 'BEB': {'chr15': [[77138940, 77566277]]},
 'CDX': {'chr8': [[70075248, 70483016]],
  'chr11': [[50311095, 50728899]],
  'chr14': [[66379055, 66778338]],
  'chr20': [[21834643, 22233777]]},
 'CEU': {'chr8': [[84442692, 84821607]],
  'chr12': [[34328866, 34590462]],
  'chr14': [[105917600, 106259905]],
  'chr22': [[22661982, 22898645]]},
 'CHB': {'chr6_partC': [[85501992, 86104633]],
  'chr11': [[47086853, 47921826]]},
 'CHS': {'chr10': [[63107761, 63493252]],
  'chr13': [[47866045, 48251299]],
  'chr15': [[76471437, 76909646]]},
 'CLM': {'chr19': [[42860739, 43046093]]},
 'ESN': {'chr12': [[72479117, 72899345]]},
 'FIN': {'chr3': [[75380511, 75596024]],
  'chr5': [[131238500, 132001329]],
  'chr17': [[39204905, 39592604], [45610677, 46138242]]},
 'GBR': {'chr19': [[22622704, 23047137]], 'chr22': [[41329774, 41808259]]},
 'GIH': {'chr3': [[157950988, 158348010]],
  'chr6_partC': [[85696088, 86044297]],
  'chr12': [[132935977, 133264749]],
  'chr19': [[42820338, 43026385]]},
 'GWD': {'chr2': [[213229719, 213686213]],
  'chr3': [[47078886, 47548303]],
  'chr10': [[38164827, 38614638]]},
 'IBS': {'chr3': [[75380511, 75596024]]},
 'ITU': {'chr1': [[189001539, 189336299]],
  'chr8': [[50652103, 51059321]],
  'chr10': [[72260319, 72661979]],
  'chr12': [[32954541, 33301513]]},
 'JPT': {'chr10': [[73001439, 73409340]], 'chr14': [[66187582, 66778338]]},
 'KHV': {'chr2': [[186057591, 186516143]],
  'chr4': [[151301085, 151752781]],
  'chr10': [[38073145, 38635036], [73001439, 73409340]],
  'chr17': [[58394956, 59118140]]},
 'LWK': {'chr2': [[93680283, 94160185]],
  'chr9': [[64439637, 65054326]],
  'chr10': [[102847077, 103283225]],
  'chr14': [[105863438, 106385732]],
  'chr16': [[36332499, 46403568]],
  'chr22': [[22691667, 22898645]]},
 'MSL': {'chr2': [[93680283, 94143754]],
  'chr5': [[46140397, 46434522]],
  'chr10': [[38073145, 38635036]],
  'chr14': [[105863438, 106259905]],
  'chr18': [[19764009, 20828994]],
  'chr19': [[27564615, 27844019]],
  'chr20': [[18364231, 18744031]],
  'chr22': [[22691667, 22898645]]},
 'MXL': {'chr3': [[94780047, 95118014]], 'chr11': [[49588569, 50709669]]},
 'PEL': {'chr4': [[150649663, 151256009]],
  'chr5': [[46140397, 46434522]],
  'chr11': [[48330435, 49133873]]},
 'PJL': {'chr6_partC': [[118358152, 118798801]],
  'chr7': [[57291163, 57656843]]},
 'PUR': {'chr11': [[49133908, 50818091], [54704427, 55220212]]},
 'STU': {'chr2': [[135176374, 135572114]],
  'chr3': [[23111879, 23593852]],
  'chr14': [[66143522, 66818672]]},
 'TSI': {'chr2': [[193268274, 193813343]], 'chr13': [[55278837, 55753073]]},
 'YRI': {'chr10': [[63150067, 63541883]],
  'chr14': [[66379055, 66778338]],
  'chr22': [[22691667, 22898645]]}}

In [6]:
def generate_gene_info(gene_list_one_record,chrnum):
    result=[]
    result.append(gene_list_one_record[0])
    result.append(gene_list_one_record[1])
    s=gene_list_one_record[3]
    e=gene_list_one_record[4]
    
    pos="chr"+chrnum+":"+str(s)+"-"+str(e)
    result.append(pos)
    result.append(gene_list_one_record[5])
    result.append(gene_list_one_record[2])
    
    return result

In [7]:
import re

def extract_numbers(input_string):
    return re.findall(r'\d+', input_string)


In [8]:
def ifgeneinside_region(complementary_region,gene_region):
    
    s=int(gene_region[0])
    e=int(gene_region[1])
     
    if (complementary_region[0]<=s) and complementary_region[1]>=e:
        return True
    else:
        return False

In [9]:
allrecords=[]

#non_mhc
for race,split_ in split_race_region.items():
    if race!="All":
        for chr,arr in split_.items():
            # chr=chr.split("/")[0].split("_")[1]
            chr=extract_numbers(chr)[0]
            print(chr)
            for a in arr:
                gene_list=fetch_gene_info(chromosome=chr,start=a[0]-1,end=a[1]-1)
                if gene_list!=[['']]:
                    for gene_list_record in gene_list:
                        if ifgeneinside_region(complementary_region=a,gene_region=[gene_list_record[3],gene_list_record[4]]):
                            onerecord=[race,0]
                            onerecord+=["chr"+chr+":"+str(a[0])+"-"+str(a[1])]
                            
                            onerecord+=generate_gene_info(gene_list_one_record=gene_list_record,chrnum=chr)
                            allrecords.append(onerecord)
                        else:
                            continue
                
    else:
        for chr,arr in split_.items():
            #chr=chr.split("_")[1]
            chr=extract_numbers(chr)[0]
            for a in arr:
                gene_list=fetch_gene_info(chromosome=chr,start=a[0]-1,end=a[1]-1)
                if gene_list!=[['']]:
                    for gene_list_record in gene_list:

                        if ifgeneinside_region(complementary_region=a,gene_region=[gene_list_record[3],gene_list_record[4]]):
                            onerecord=["All",0]
                            onerecord+=["chr"+chr+":"+str(a[0])+"-"+str(a[1])]
                            
                            onerecord+=generate_gene_info(gene_list_one_record=gene_list_record,chrnum=chr)
                            allrecords.append(onerecord)
                        else:
                            continue

2
[['']]
3
[['ENSG00000044524', 'EPHA3', 'EPH receptor A3 [Source:HGNC Symbol;Acc:HGNC:3387]', '89107621', '89482134', '1'], ['ENSG00000240951', 'MTCO2P6', 'MT-CO2 pseudogene 6 [Source:HGNC Symbol;Acc:HGNC:52022]', '89587886', '89588570', '-1'], ['ENSG00000240309', 'MTCO1P6', 'MT-CO1 pseudogene 6 [Source:HGNC Symbol;Acc:HGNC:52008]', '89588714', '89590097', '-1'], ['ENSG00000212598', 'U3', 'Small nucleolar RNA U3 [Source:RFAM;Acc:RF00012]', '90030284', '90030495', '1'], ['ENSG00000222490', 'RNU6-712P', 'RNA, U6 small nuclear 712, pseudogene [Source:HGNC Symbol;Acc:HGNC:47675]', '90184634', '90184733', '1'], ['ENSG00000189002', 'PROS2P', 'protein S (beta) pseudogene [Source:HGNC Symbol;Acc:HGNC:9458]', '90202316', '90257415', '-1'], ['ENSG00000271024', 'HSPE1P19', 'heat shock protein family E (Hsp10) member 1 pseudogene 19 [Source:HGNC Symbol;Acc:HGNC:49338]', '90261414', '90261708', '-1']]
5
[['ENSG00000251983', 'RN7SKP157', 'RN7SK pseudogene 157 [Source:HGNC Symbol;Acc:HGNC:45881]', '

In [10]:
mhc_race_positions_dict={'All':{'mhc':[[32589647, 32751796]]}}

In [11]:

for race, split_ in mhc_race_positions_dict.items():
    if split_!={}:
        for chr,arr in split_.items():
            chr="6"
            for a in arr:
                gene_list=fetch_gene_info(chromosome=chr,start=a[0]-1,end=a[1]-1)
                if gene_list!=[['']]:
                    for gene_list_record in gene_list:
                        if ifgeneinside_region(complementary_region=a,gene_region=[gene_list_record[3],gene_list_record[4]]):
                            onerecord=[race,1]
                            onerecord+=["chr"+chr+":"+str(a[0])+"-"+str(a[1])]
                            
                            onerecord+=generate_gene_info(gene_list_one_record=gene_list_record,chrnum="6")
                            allrecords.append(onerecord)    
                        else:
                            continue

[['ENSG00000196126', 'HLA-DRB1', 'major histocompatibility complex, class II, DR beta 1 [Source:HGNC Symbol;Acc:HGNC:4948]', '32577902', '32589848', '-1'], ['ENSG00000196735', 'HLA-DQA1', 'major histocompatibility complex, class II, DQ alpha 1 [Source:HGNC Symbol;Acc:HGNC:4942]', '32628179', '32647062', '1'], ['ENSG00000179344', 'HLA-DQB1', 'major histocompatibility complex, class II, DQ beta 1 [Source:HGNC Symbol;Acc:HGNC:4944]', '32659467', '32668383', '-1'], ['ENSG00000223534', 'HLA-DQB1-AS1', 'HLA-DQB1 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:39762]', '32659880', '32660729', '1'], ['ENSG00000235040', 'MTCO3P1', 'MT-CO3 pseudogene 1 [Source:HGNC Symbol;Acc:HGNC:31342]', '32706124', '32706955', '-1'], ['ENSG00000232080', '', 'novel transcript', '32718005', '32719170', '1'], ['ENSG00000226030', 'HLA-DQB3', 'major histocompatibility complex, class II, DQ beta 3 [Source:HGNC Symbol;Acc:HGNC:4946]', '32730758', '32731695', '-1'], ['ENSG00000237541', 'HLA-DQA2', 'major histocompatibil

In [12]:
allrecords

[['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000240951',
  'MTCO2P6',
  'chr3:89587886-89588570',
  '-1',
  'MT-CO2 pseudogene 6 [Source:HGNC Symbol;Acc:HGNC:52022]'],
 ['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000240309',
  'MTCO1P6',
  'chr3:89588714-89590097',
  '-1',
  'MT-CO1 pseudogene 6 [Source:HGNC Symbol;Acc:HGNC:52008]'],
 ['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000212598',
  'U3',
  'chr3:90030284-90030495',
  '1',
  'Small nucleolar RNA U3 [Source:RFAM;Acc:RF00012]'],
 ['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000222490',
  'RNU6-712P',
  'chr3:90184634-90184733',
  '1',
  'RNA, U6 small nuclear 712, pseudogene [Source:HGNC Symbol;Acc:HGNC:47675]'],
 ['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000189002',
  'PROS2P',
  'chr3:90202316-90257415',
  '-1',
  'protein S (beta) pseudogene [Source:HGNC Symbol;Acc:HGNC:9458]'],
 ['ACB',
  0,
  'chr3:89474731-90288204',
  'ENSG00000271024',
  'HSPE1P19',
  'chr3:90261414-90261708',
  '-1',
  'he

In [13]:
#generatedf
def generatedf(columns,allrecords):
    dictforDF=dict()
    for i in range(len(columns)):
        midarr=[]
        for record in allrecords:           
            midarr.append(record[i])
        dictforDF[columns[i]]=midarr
    #dataframe(dictforDF).to_csv("csv2_05_09.csv")
    return dataframe(dictforDF)


In [14]:
dfnew=generatedf(columns=["Race","isMHC","Position of disassortative mating region","Gene ID","Gene name","Position of gene","is_complement","Gene description"],allrecords=allrecords)

In [15]:
dfnew['is_complement'] = dfnew['is_complement'].apply(lambda x:  "F" if x == "1" else "R")
# dfnew.to_csv("csv2_05_11new.csv")

In [16]:
dfnew

Unnamed: 0,Race,isMHC,Position of disassortative mating region,Gene ID,Gene name,Position of gene,is_complement,Gene description
0,ACB,0,chr3:89474731-90288204,ENSG00000240951,MTCO2P6,chr3:89587886-89588570,R,MT-CO2 pseudogene 6 [Source:HGNC Symbol;Acc:HG...
1,ACB,0,chr3:89474731-90288204,ENSG00000240309,MTCO1P6,chr3:89588714-89590097,R,MT-CO1 pseudogene 6 [Source:HGNC Symbol;Acc:HG...
2,ACB,0,chr3:89474731-90288204,ENSG00000212598,U3,chr3:90030284-90030495,F,Small nucleolar RNA U3 [Source:RFAM;Acc:RF00012]
3,ACB,0,chr3:89474731-90288204,ENSG00000222490,RNU6-712P,chr3:90184634-90184733,F,"RNA, U6 small nuclear 712, pseudogene [Source:..."
4,ACB,0,chr3:89474731-90288204,ENSG00000189002,PROS2P,chr3:90202316-90257415,R,protein S (beta) pseudogene [Source:HGNC Symbo...
...,...,...,...,...,...,...,...,...
1334,All,1,chr6:32589647-32751796,ENSG00000235040,MTCO3P1,chr6:32706124-32706955,R,MT-CO3 pseudogene 1 [Source:HGNC Symbol;Acc:HG...
1335,All,1,chr6:32589647-32751796,ENSG00000232080,,chr6:32718005-32719170,F,novel transcript
1336,All,1,chr6:32589647-32751796,ENSG00000226030,HLA-DQB3,chr6:32730758-32731695,R,"major histocompatibility complex, class II, DQ..."
1337,All,1,chr6:32589647-32751796,ENSG00000237541,HLA-DQA2,chr6:32741391-32747198,F,"major histocompatibility complex, class II, DQ..."


In [17]:
dfnew.to_csv("tables5_part0.csv")
