In [1]:
import pysam
from pysam import VariantFile as vcf
import operator
from math import log2
import pandas as pd
from pandas import DataFrame as dataframe
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist, squareform
import scipy
import  os
import os.path
import matplotlib.colors as mcolors
from scipy import stats
import csv
from statsmodels.stats.multitest import multipletests

In [9]:
import requests
from xml.etree import ElementTree as ET

def fetch_gene_info(chromosome, start, end):
    query_xml = f"""
    <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">
        <Dataset name="hsapiens_gene_ensembl" interface="default">
            <Filter name="chromosome_name" value="{chromosome}"/>
            <Filter name="start" value="{start}"/>
            <Filter name="end" value="{end}"/>
            <Attribute name="ensembl_gene_id"/>
            <Attribute name="external_gene_name"/>
            <Attribute name="description"/>
            <Attribute name="start_position"/>
            <Attribute name="end_position"/>            
            <Attribute name="strand"/>
        </Dataset>
    </Query>
    """

    biomart_url = "http://www.ensembl.org/biomart/martservice?query="
    response = requests.get(biomart_url + query_xml.strip())

    if response.status_code != 200:
        raise Exception(f"Error fetching data from BioMart: {response.text}")

    genes = [line.split("\t") for line in response.text.strip().split("\n")]
    print(genes)
    return genes


In [10]:
split_race_region={'ACB': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16000187, 18181463]],
  'split_chr2/xav': [[91720904, 94530414]],
  'split_chr3/xau': [[90103657, 93851087]]},
  'All':{'split_chr13_xaa': [[16000187, 18181463]],
 'split_chr1_xba': [[121579777, 123992934]],
 'split_chr1_xbb': [[123992935, 124938490]],
 'split_chr2_xav': [[91720904, 94530414]],
 'split_chr3_xau': [[90362505, 91477491]]},
 'ASW': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr13/xaa': [[16040220, 18181463]],
  'split_chr16/xaj': [[35510197, 46630361]],
  'split_chr2/xav': [[91720904, 94530414]],
  'split_chr3/xau': [[90103657, 93851087]]},
 'BEB': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16165134, 17417031]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr2/xav': [[91992563, 94195617]]},
 'CDX': {},
 'CEU': {'split_chr1/xbb': [[123992935, 124920289]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16446009, 18181463]],
  'split_chr2/xav': [[92076199, 94195617]]},
 'CHB': {},
 'CHS': {'split_chr1/xba': [[123102017, 123978193]]},
 'CLM': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16446009, 18181463]],
  'split_chr16/xaj': [[35510197, 46630361]],
  'split_chr16/xai': [[32030198, 35510168]],
  'split_chr2/xav': [[91992563, 94654240]]},
 'ESN': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr13/xaa': [[16000187, 18181463]],
  'split_chr2/xav': [[91720904, 94654240]],
  'split_chr3/xau': [[90103657, 93851087]]},
 'FIN': {'split_chr1/xbb': [[124048603, 124877211]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr2/xav': [[91992563, 94195617]]},
 'GBR': {'split_chr1/xbb': [[123992935, 124877211]],
  'split_chr1/xba': [[122678788, 123992934]]},
 'GIH': {'split_chr1/xbb': [[123992935, 124877211]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr2/xav': [[91951194, 94195617]]},
 'GWD': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16000187, 18181463]],
  'split_chr16/xaj': [[35510197, 46630361]],
  'split_chr2/xav': [[91720904, 94195617]],
  'split_chr3/xau': [[90103657, 93851087]]},
 'IBS': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16446009, 18024589]],
  'split_chr16/xaj': [[35510197, 46630361]],
  'split_chr2/xav': [[91992563, 94654240]]},
 'ITU': {'split_chr1/xbb': [[123992935, 124877211]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr2/xav': [[91992563, 94195617]]},
 'JPT': {},
 'KHV': {},
 'LWK': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16040220, 18181463]],
  'split_chr2/xav': [[91720904, 94654240]],
  'split_chr3/xau': [[90103657, 93851087]]},
 'MSL': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr13/xaa': [[16040220, 18181463]],
  'split_chr2/xav': [[91720904, 94195617]],
  'split_chr3/xau': [[90103657, 93851087]]},
 'MXL': {'split_chr1/xbb': [[123992935, 124920289]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr2/xav': [[91720904, 94735664]]},
 'PEL': {'split_chr1/xbb': [[123992935, 124782972]],
  'split_chr1/xba': [[122885884, 123992934]]},
 'PJL': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16165134, 18024589]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr2/xav': [[91768349, 94195617]]},
 'PUR': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr13/xaa': [[16152209, 18181463]],
  'split_chr16/xaj': [[35510197, 46630361]],
  'split_chr2/xav': [[91720904, 94809756]]},
 'STU': {'split_chr1/xbb': [[123992935, 124920289]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr16/xaj': [[35510197, 46692526]],
  'split_chr16/xai': [[33059491, 35510168]],
  'split_chr2/xav': [[91992563, 94195617]]},
 'TSI': {'split_chr1/xbb': [[123992935, 124920289]],
  'split_chr1/xba': [[121579777, 123992934]],
  'split_chr2/xav': [[91992563, 94195617]]},
 'YRI': {'split_chr1/xbb': [[123992935, 124938490]],
  'split_chr1/xba': [[121635735, 123992934]],
  'split_chr13/xaa': [[16000187, 18181463]],
  'split_chr2/xav': [[91720904, 94530414]],
  'split_chr3/xau': [[90103657, 93851087]]}}

In [11]:
def generate_gene_info(gene_list_one_record,chrnum):
    result=[]
    result.append(gene_list_one_record[0])
    result.append(gene_list_one_record[1])
    s=gene_list_one_record[3]
    e=gene_list_one_record[4]
    
    pos="chr"+chrnum+":"+str(s)+"-"+str(e)
    result.append(pos)
    result.append(gene_list_one_record[5])
    result.append(gene_list_one_record[2])
    
    return result

In [12]:
import re

def extract_numbers(input_string):
    return re.findall(r'\d+', input_string)

In [13]:
def ifgeneinside_region(complementary_region,gene_region):
    
    s=int(gene_region[0])
    e=int(gene_region[1])
     
    if (complementary_region[0]<=s) and complementary_region[1]>=e:
        return True
    else:
        return False

In [18]:
allrecords=[]

#non_mhc
for race,split_ in split_race_region.items():
    if race!="All":
        for chr,arr in split_.items():
            chr=chr.split("/")[0].split("_")[1]
            chr=extract_numbers(chr)[0]
            print(chr)
            for a in arr:
                gene_list=fetch_gene_info(chromosome=chr,start=a[0]-1,end=a[1]-1)
                if gene_list!=[['']]:
                    for gene_list_record in gene_list:
                        if ifgeneinside_region(complementary_region=a,gene_region=[gene_list_record[3],gene_list_record[4]]):
                            onerecord=[race,0]
                            onerecord+=["chr"+chr+":"+str(a[0])+"-"+str(a[1])]
                            
                            onerecord+=generate_gene_info(gene_list_one_record=gene_list_record,chrnum=chr)
                            allrecords.append(onerecord)
                        else:
                            continue
                
    else:
        for chr,arr in split_.items():
            chr=chr.split("_")[1]
            chr=extract_numbers(chr)[0]
            for a in arr:
                gene_list=fetch_gene_info(chromosome=chr,start=a[0]-1,end=a[1]-1)
                if gene_list!=[['']]:
                    for gene_list_record in gene_list:

                        if ifgeneinside_region(complementary_region=a,gene_region=[gene_list_record[3],gene_list_record[4]]):
                            onerecord=["All",0]
                            onerecord+=["chr"+chr+":"+str(a[0])+"-"+str(a[1])]
                            
                            onerecord+=generate_gene_info(gene_list_one_record=gene_list_record,chrnum=chr)
                            allrecords.append(onerecord)
                        else:
                            continue

1


[['']]
1
[['ENSG00000224857', 'LINC01691', 'long intergenic non-protein coding RNA 1691 [Source:HGNC Symbol;Acc:HGNC:52479]', '121573946', '121580524', '-1']]
13
[['ENSG00000279924', '', 'novel ankyrin repeat domain-containing protein pseudogene', '18174010', '18178465', '-1']]
2
[['ENSG00000271627', 'NKAIN1P2', 'NKAIN1 pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:54726]', '91723023', '91723605', '1'], ['ENSG00000223703', 'IGSF3P2', 'IGSF3 pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:55896]', '91736726', '91767439', '1'], ['ENSG00000232531', 'KMT5AP2', 'KMT5A pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:54727]', '91747940', '91748986', '-1'], ['ENSG00000286698', '', 'novel transcript', '91759462', '91767701', '-1'], ['ENSG00000236969', 'GGT8P', 'gamma-glutamyltransferase 8 pseudogene [Source:HGNC Symbol;Acc:HGNC:33438]', '91775944', '91781169', '1'], ['ENSG00000235235', 'IGKV1OR2-1', 'immunoglobulin kappa variable 1/OR2-1 (pseudogene) [Source:HGNC Symbol;Acc:HGNC:5760]', '91817771', '91818248',

In [19]:
allrecords

[['ACB',
  0,
  'chr13:16000187-18181463',
  'ENSG00000279924',
  '',
  'chr13:18174010-18178465',
  '-1',
  'novel ankyrin repeat domain-containing protein pseudogene'],
 ['ACB',
  0,
  'chr2:91720904-94530414',
  'ENSG00000271627',
  'NKAIN1P2',
  'chr2:91723023-91723605',
  '1',
  'NKAIN1 pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:54726]'],
 ['ACB',
  0,
  'chr2:91720904-94530414',
  'ENSG00000223703',
  'IGSF3P2',
  'chr2:91736726-91767439',
  '1',
  'IGSF3 pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:55896]'],
 ['ACB',
  0,
  'chr2:91720904-94530414',
  'ENSG00000232531',
  'KMT5AP2',
  'chr2:91747940-91748986',
  '-1',
  'KMT5A pseudogene 2 [Source:HGNC Symbol;Acc:HGNC:54727]'],
 ['ACB',
  0,
  'chr2:91720904-94530414',
  'ENSG00000286698',
  '',
  'chr2:91759462-91767701',
  '-1',
  'novel transcript'],
 ['ACB',
  0,
  'chr2:91720904-94530414',
  'ENSG00000236969',
  'GGT8P',
  'chr2:91775944-91781169',
  '1',
  'gamma-glutamyltransferase 8 pseudogene [Source:HGNC Symbol;Acc:HGNC:33

In [21]:
#generatedf
def generatedf(columns,allrecords):
    dictforDF=dict()
    for i in range(len(columns)):
        midarr=[]
        for record in allrecords:           
            midarr.append(record[i])
        dictforDF[columns[i]]=midarr
    #dataframe(dictforDF).to_csv("csv2_05_09.csv")
    return dataframe(dictforDF)


In [22]:
dfnew=generatedf(columns=["Race","isMHC","Position of disassortative mating region","Gene ID","Gene name","Position of gene","is_complement","Gene description"],allrecords=allrecords)

In [23]:
dfnew['is_complement'] = dfnew['is_complement'].apply(lambda x:  "F" if x == "1" else "R")
# dfnew.to_csv("csv2_05_11new.csv")

In [24]:
dfnew

Unnamed: 0,Race,isMHC,Position of disassortative mating region,Gene ID,Gene name,Position of gene,is_complement,Gene description
0,ACB,0,chr13:16000187-18181463,ENSG00000279924,,chr13:18174010-18178465,R,novel ankyrin repeat domain-containing protein...
1,ACB,0,chr2:91720904-94530414,ENSG00000271627,NKAIN1P2,chr2:91723023-91723605,F,NKAIN1 pseudogene 2 [Source:HGNC Symbol;Acc:HG...
2,ACB,0,chr2:91720904-94530414,ENSG00000223703,IGSF3P2,chr2:91736726-91767439,F,IGSF3 pseudogene 2 [Source:HGNC Symbol;Acc:HGN...
3,ACB,0,chr2:91720904-94530414,ENSG00000232531,KMT5AP2,chr2:91747940-91748986,R,KMT5A pseudogene 2 [Source:HGNC Symbol;Acc:HGN...
4,ACB,0,chr2:91720904-94530414,ENSG00000286698,,chr2:91759462-91767701,R,novel transcript
...,...,...,...,...,...,...,...,...
1043,YRI,0,chr3:90103657-93851087,ENSG00000189002,PROS2P,chr3:90202316-90257415,R,protein S (beta) pseudogene [Source:HGNC Symbo...
1044,YRI,0,chr3:90103657-93851087,ENSG00000271024,HSPE1P19,chr3:90261414-90261708,R,heat shock protein family E (Hsp10) member 1 p...
1045,YRI,0,chr3:90103657-93851087,ENSG00000283367,,chr3:91356755-91356914,F,
1046,YRI,0,chr3:90103657-93851087,ENSG00000283544,,chr3:91374236-91513775,R,"primase, DNA, polypeptide 2 (58kDa) (PRIM2) ps..."


In [None]:
dfnew.to_csv("csv2.csv")
