In [8]:
import pysam
from pysam import VariantFile as vcf
import operator
from math import log2
import pandas as pd
from pandas import DataFrame as dataframe
import matplotlib.pyplot as plt
import numpy as np

import  os
import os.path

In [9]:
class CalProb:
    """Per-position genotype statistics computed from a VCF.

    ``vcf_in`` must behave like an open ``pysam.VariantFile``: it has to expose
    ``header.samples`` and a ``fetch()`` method yielding records that provide
    ``pos`` and ``samples[name]['GT']``.

    After ``__runProb__()`` the ``result`` dict maps each record position to
    ``[PA, PB, PAB, P]`` where

    * ``PA``  = ``FREQ_OFFSET + FREQ_SCALE * (#alleles equal to 0 / (2 * #samples))``
    * ``PB``  = same expression for alleles different from 0
    * ``PAB`` = fraction of samples whose GT equals ``(0, 1)`` or ``(1, 0)``
    * ``P``   = ``PAB * log2(PAB / (PA * PB))``, or 0 when ``PAB`` is 0

    Results are keyed by ``rec.pos`` only, so when several records share a
    position the last one read wins.
    """

    # Affine rescaling applied to the raw allele frequencies (p -> OFFSET + SCALE * p).
    # With the default values PA and PB are never below 0.2, which keeps the
    # denominator used in calP() strictly positive.
    FREQ_OFFSET = 0.2
    FREQ_SCALE = 0.8

    COLUMN_NAMES = ['PA', 'PB', 'PAB', 'P']

    def __init__(self, vcf_in):
        self.vcf_in = vcf_in
        self.result = dict()

    def _sample_names(self):
        """Return the header's sample names, refusing a VCF without samples."""
        names = list(self.vcf_in.header.samples)
        if not names:
            # Every frequency below divides by the sample count.
            raise ValueError("VCF header lists no samples; frequencies are undefined")
        return names

    def _rows(self, positions):
        """Return the ``[PA, PB, PAB, P]`` row stored for each given position."""
        return [self.result[pos][:4] for pos in positions]

    def calPaAndPb(self):
        """Store ``[PA, PB]`` for every record returned by ``vcf_in.fetch()``.

        Only genotypes with exactly two alleles are counted; other genotypes
        and missing alleles (``None``) add to neither count. The denominator
        is always ``2 * number of samples`` in the header.

        Raises:
            ValueError: if the VCF header lists no samples.
        """
        samplelist = self._sample_names()  # loop-invariant, read once
        allele_slots = 2 * len(samplelist)
        for rec in self.vcf_in.fetch():
            count_0 = 0
            count_1 = 0
            for samplename in samplelist:
                gt = rec.samples[samplename]['GT']
                if gt is None or len(gt) != 2:
                    continue
                for allele in gt:
                    if allele is None:
                        # A missing call is not evidence for either allele.
                        continue
                    if allele == 0:
                        count_0 += 1
                    else:
                        count_1 += 1
            p0 = count_0 / allele_slots
            p1 = count_1 / allele_slots
            p0 = self.FREQ_OFFSET + self.FREQ_SCALE * p0
            p1 = self.FREQ_OFFSET + self.FREQ_SCALE * p1
            self.result[rec.pos] = [p0, p1]

    def calPAB(self):
        """Store ``PAB`` (index 2) for every record; requires ``calPaAndPb()`` first.

        ``PAB`` is the fraction of samples whose GT equals ``(0, 1)`` or ``(1, 0)``.

        Raises:
            ValueError: if the VCF header lists no samples.
            KeyError: if a record position has no entry from ``calPaAndPb()``.
        """
        samplelist = self._sample_names()
        heterozygous = ((0, 1), (1, 0))
        for rec in self.vcf_in.fetch():
            het_count = sum(
                1 for samplename in samplelist
                if rec.samples[samplename]['GT'] in heterozygous
            )
            stats = self.result[rec.pos]
            # Overwrite rather than append: re-running this method, or meeting a
            # second record at the same position, must not shift the P column.
            stats[2:] = [het_count / len(samplelist)]

    def calP(self):
        """Store ``P`` (index 3) for every position; requires ``calPAB()`` first."""
        for stats in self.result.values():
            pa, pb, pab = stats[0], stats[1], stats[2]
            if pab == 0:
                # log2(0) is undefined; the term is taken as 0 in that case.
                cal = 0.0
            else:
                cal = pab * log2(pab / (pa * pb))
            stats[3:] = [cal]

    def __runProb__(self):
        """Run the three computation passes in the order they depend on each other."""
        self.calPaAndPb()
        self.calPAB()
        self.calP()

    def __getResult__(self):
        """Return the ``{position: [PA, PB, PAB, P]}`` dict (not a copy)."""
        return self.result

    def df_pos_prob(self, pos_prob_csvname):
        """Build a frame indexed by position, write it to CSV, and return it.

        The file is written to ``../csvfiles/pos<pos_prob_csvname>.csv``.
        """
        positions = list(self.result.keys())
        # Build the frame in one go. Filling an empty frame through
        # ``df.iloc[i][j] = value`` is chained assignment and does not write
        # into ``df`` when pandas Copy-on-Write is active.
        df = dataframe(self._rows(positions), index=positions, columns=self.COLUMN_NAMES)
        df.to_csv("../csvfiles/pos"+pos_prob_csvname+".csv")
        return df

    def df_index_prob(self, index_prob_csvname):
        """Build a frame with a 0..n-1 index, write it to CSV, and return it.

        The file is written to ``../csvfiles/index<index_prob_csvname>.csv``.
        """
        row_names = list(self.result.keys())
        df_index = dataframe(self._rows(row_names), columns=self.COLUMN_NAMES)
        df_index.to_csv("../csvfiles/index"+index_prob_csvname+".csv")
        return df_index

    # def Graph_pos_prob(self,df,pos_probgraphname):
    #     x_values1=np.array(df.index)
    #     y_values1=np.array(df["P"])
    #     plt.scatter(x_values1, y_values1,s=1)
    #     plt.title("Positions and Probability")
    #     plt.xlabel("Position")
    #     plt.ylabel("Probability")
    #     plt.rcParams["figure.figsize"] = (20,5)
    #     plt.savefig("../graphs/"+pos_probgraphname+".jpg")

    # def Graph_index_prob(self,df,index_prob_graphname):
    #     x_values2 = np.array(df.index)
    #     y_values2 = np.array(df["P"])
    #     plt.scatter(x_values2, y_values2, s=1)
    #     plt.title("Index and Probability")
    #     plt.xlabel("Index")
    #     plt.ylabel("Probability")
    #     plt.rcParams["figure.figsize"] = (20,5)
    #     plt.savefig("../graphs/"+index_prob_graphname+".jpg")
    

In [10]:
# Input folders to scan: chrX first, followed by the autosomes chr1..chr22.
foldernames = ['split_chrX'] + ['split_chr' + str(chrom) for chrom in range(1, 23)]
foldernames

['split_chrX',
 'split_chr1',
 'split_chr2',
 'split_chr3',
 'split_chr4',
 'split_chr5',
 'split_chr6',
 'split_chr7',
 'split_chr8',
 'split_chr9',
 'split_chr10',
 'split_chr11',
 'split_chr12',
 'split_chr13',
 'split_chr14',
 'split_chr15',
 'split_chr16',
 'split_chr17',
 'split_chr18',
 'split_chr19',
 'split_chr20',
 'split_chr21',
 'split_chr22']

In [11]:
def ifExistCSVFile(filename, filepath='../csvfiles/'):
    """Return True if an entry called ``filename`` exists directly in ``filepath``.

    Args:
        filename: Bare file name (no directory part) to look for.
        filepath: Directory to search; defaults to the CSV output folder.

    Returns:
        bool: True when ``filename`` is listed in ``filepath``. False otherwise,
        including when ``filepath`` itself does not exist yet.
    """
    try:
        entries = os.listdir(filepath)
    except FileNotFoundError:
        # No output directory yet means nothing has been written to it.
        return False
    return filename in entries

In [12]:

# Batch run: compute the CalProb tables for every *.vcf.gz under each split folder,
# skipping inputs whose two CSV outputs are already present in ../csvfiles/.
for foldername in foldernames:
    for curDir, dirs, files in os.walk(top="../"+foldername+"/"):
        for file in files:
            if not file.endswith(".vcf.gz"):
                continue
            path=os.path.join(curDir,file)
            print(path)
            fileprefix=file[:file.index(".")]
            print(foldername+"_"+fileprefix)
            pos_prob_name="pos_prob_"+foldername+"_"+fileprefix
            index_prob_name="index_prob_"+foldername+"_"+fileprefix
            # CalProb.df_pos_prob / df_index_prob write "pos<name>.csv" and
            # "index<name>.csv", so the existence check has to use those
            # exact file names or it can never match.
            csv_pos_prob_name="pos"+pos_prob_name+".csv"
            csv_index_prob_name="index"+index_prob_name+".csv"
            # Check each output separately: `a or b` on two non-empty strings is
            # just `a`. Skip only when both exist so a half-finished run is redone.
            if ifExistCSVFile(csv_pos_prob_name) and ifExistCSVFile(csv_index_prob_name):
                continue
            # Close the VCF handle as soon as this file is done; hundreds of
            # files are processed in this loop.
            with vcf(path) as vcf_in:
                testclass=CalProb(vcf_in=vcf_in)
                testclass.__runProb__()
                df_pos_prob=testclass.df_pos_prob(pos_prob_csvname=pos_prob_name)
                # testclass.Graph_pos_prob(df=df_pos_prob,pos_probgraphname=pos_prob_name)
                df_index_prob=testclass.df_index_prob(index_prob_csvname=index_prob_name)
                # testclass.Graph_index_prob(df=df_index_prob,index_prob_graphname=index_prob_name)


                

../split_chrX/xak.vcf.gz
split_chrX_xak
../split_chrX/xaa.vcf.gz
split_chrX_xaa
../split_chrX/xab.vcf.gz
split_chrX_xab
../split_chrX/xac.vcf.gz
split_chrX_xac
../split_chrX/xad.vcf.gz
split_chrX_xad
../split_chrX/xae.vcf.gz
split_chrX_xae
../split_chrX/xaf.vcf.gz
split_chrX_xaf
../split_chrX/xag.vcf.gz
split_chrX_xag
../split_chrX/xah.vcf.gz
split_chrX_xah
../split_chrX/xai.vcf.gz
split_chrX_xai
../split_chrX/xaj.vcf.gz
split_chrX_xaj
../split_chrX/xal.vcf.gz
split_chrX_xal
../split_chrX/xam.vcf.gz
split_chrX_xam
../split_chrX/xan.vcf.gz
split_chrX_xan
../split_chrX/xao.vcf.gz
split_chrX_xao
../split_chrX/xap.vcf.gz
split_chrX_xap
../split_chrX/xaq.vcf.gz
split_chrX_xaq
../split_chrX/xar.vcf.gz
split_chrX_xar
../split_chrX/xas.vcf.gz
split_chrX_xas
../split_chrX/xat.vcf.gz
split_chrX_xat
../split_chrX/xau.vcf.gz
split_chrX_xau
../split_chrX/xav.vcf.gz
split_chrX_xav
../split_chrX/xaw.vcf.gz
split_chrX_xaw
../split_chrX/xax.vcf.gz
split_chrX_xax
../split_chrX/xay.vcf.gz
split_chrX_xay


In [13]:
# Single run on the MHC region file; outputs go to ../csvfiles/posmhc.csv and
# ../csvfiles/indexmhc.csv. The context manager closes the VCF handle afterwards.
with vcf("../mhcdataset/1kgp.29720000-33130000.vcf","r") as vcf_in:
    testclass=CalProb(vcf_in=vcf_in)
    testclass.__runProb__()
    df_pos_prob=testclass.df_pos_prob(pos_prob_csvname="mhc")
    # testclass.Graph_pos_prob(df=df_pos_prob,pos_probgraphname=pos_prob_name)
    df_index_prob=testclass.df_index_prob(index_prob_csvname="mhc")