# Step 0: Import Packages and Set the Environment

In [None]:
# Colab environment setup: install the bedtools command-line tool and the
# Python packages this notebook relies on. Versions are not pinned.
# NOTE(review): phylopandas is installed here but never imported in the cells shown.
# NOTE(review): the import cell uses `Bio` and `pysam`; presumably `bio` and
# `pybedtools` pull those in as dependencies — confirm, or install
# biopython/pysam explicitly.
!apt-get install bedtools
!pip install phylopandas
!pip install pybedtools
!pip install bio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
bedtools is already the newest version (2.30.0+dfsg-2ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [None]:
import pybedtools
from pybedtools import BedTool
import glob
import os
import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from collections import defaultdict
import pandas as pd
import subprocess
import pysam
from pysam import VariantFile
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def bed_to_df(bed_tsv):
  """Load the first three columns of a headerless, tab-separated BED file.

  Parameters
  ----------
  bed_tsv : str, path-like or file-like
      Anything `pandas.read_csv` accepts. The file must not contain a header row.

  Returns
  -------
  pandas.DataFrame
      One row per input line with columns ``chrom``, ``start`` and ``end``.
  """
  # usecols keeps only the three leading fields, so BED files that carry extra
  # columns (name, score, strand, ...) load instead of failing with a
  # column-count mismatch when the three names are applied.
  df=pd.read_csv(bed_tsv,sep='\t',header=None,names=['chrom','start','end'],usecols=[0,1,2])
  return(df)

def vcf_to_df(vcf_file):
  """Read every record of a VCF file into a DataFrame, one row per record.

  Parameters
  ----------
  vcf_file : str
      Path handed to `pysam.VariantFile`.

  Returns
  -------
  pandas.DataFrame
      Columns ``chrom, pos, id, ref, alts, qual, filter, info, format, samples``.
      Each cell holds the corresponding attribute of the pysam record exactly
      as pysam returns it (no conversion is applied here).

  Side effects
  ------------
  Prints the number of records read.
  """
  # The context manager closes the file even when iteration fails part-way
  # through; the original only closed it on the success path.
  with VariantFile(vcf_file) as vcf_in:
    data=[[entry.chrom,entry.pos,entry.id,entry.ref,entry.alts,entry.qual,
           entry.filter,entry.info,entry.format,entry.samples]
          for entry in vcf_in]
  # len(data) replaces the hand-maintained counter; the printed text is unchanged
  print("Total number of entries=%d"%len(data))
  df = pd.DataFrame(data, columns=['chrom','pos','id','ref','alts','qual','filter','info','format','samples'])
  return(df)

def fasta_to_df(fasta_file):
  """Tabulate a FASTA file: one row per record with its sequence text and length.

  Parameters
  ----------
  fasta_file : path or handle
      Passed straight to `SeqIO.parse(..., 'fasta')`.

  Returns
  -------
  pandas.DataFrame
      Column ``Sequence`` (the record's sequence as a str) and column ``Len``
      (its length). Record identifiers are not kept.
  """
  # single pass over the file: (sequence text, length) per record
  pairs=[(str(rec.seq),len(rec.seq)) for rec in SeqIO.parse(fasta_file, 'fasta')]
  if pairs:
    seq_texts,seq_lens=(list(col) for col in zip(*pairs))
  else:
    seq_texts,seq_lens=[],[]
  return(pd.DataFrame({'Sequence':seq_texts,'Len':seq_lens}))

def Generate_Sequences(bed_user,fasta_user,outputfile=None):
  """Extract the sequence of every interval in a BED file and return them as a DataFrame.

  Parameters
  ----------
  bed_user : str
      Path of the interval file, wrapped in a `BedTool`.
  fasta_user : str
      Path of the FASTA file the sequences are taken from.
  outputfile : str, optional
      Where the extracted sequences are saved. When omitted, falls back to
      ``path+'out.fasta'`` using the notebook-level ``path`` variable, which is
      what this function always did before the parameter existed.

  Returns
  -------
  pandas.DataFrame
      The result of `fasta_to_df` on the saved file (columns ``Sequence`` and ``Len``).

  Side effects
  ------------
  Writes (and overwrites) `outputfile`.
  """
  if outputfile is None:
    # Default kept for existing callers; requires the global `path` to be
    # defined before the call.
    outputfile=path+'out.fasta'
  fasta_file = BedTool(fasta_user)
  bed_file=BedTool(bed_user)
  bed_with_seq = bed_file.sequence(fi=fasta_file)
  bed_with_seq.save_seqs(outputfile)
  df=fasta_to_df(outputfile)
  return(df)

In [None]:
# Base data folder on the mounted Drive. Other cells build file names with
# path+'name', so the trailing slash is required. Generate_Sequences also
# reads this global when it is called.
path='/content/drive/MyDrive/CSIRE/Sequencing/'

## Test: Using Pandas to Visualize the Reference Genome

In [None]:
# Parse every record of the reference FASTA and collect, per record, the
# sequence text and its length (the same loop fasta_to_df performs).
# NOTE: despite its name, `identifiers` holds the sequences themselves
# (str(seq_record.seq)), not the record IDs.
# `identifiers` and `lengths` are consumed by the next cell.
with open(path+'GRCh38.primary_assembly.genome.fa') as fasta_file:
  identifiers=[]
  lengths=[]
  for seq_record in SeqIO.parse(fasta_file, 'fasta'):
    identifiers.append(str(seq_record.seq))
    lengths.append(len(seq_record.seq))

In [None]:
# Assemble the lists filled by the previous cell into a two-column frame:
# 'Sequence' (full sequence text of each record) and 'Len' (its length).
d={'Sequence':identifiers,'Len':lengths}
data=pd.DataFrame(d)
# bare last expression -> rich display of the frame
data

Unnamed: 0,Sequence,Len
0,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,248956422
1,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,242193529
2,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,198295559
3,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,190214555
4,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,181538259
...,...,...
189,GAACACCTAAACGCCCTAAATTCTTTCTTTACTCCATTCTGCTCCT...,62944
190,GCATAGTCAAGTTTGCAAACCACTGCTCTCGAGTTTTAATTGACAT...,40191
191,TTAAACGGTTGTTTCACTGCGGGGAAAAGAGTATCCCAAGCTCCTC...,36723
192,ACGATCTCACTGTGTCACCCAGGTTGGAGTGCGGTGCACAATCTGA...,79590


In [None]:
# Number of distinct sequences among the rows selected by the slice 1:4
data.Sequence[1:4].nunique()

3

## Test: Using Pandas to Visualize Core Promoter Regions

In [None]:
# df_bed was only assigned later (Step 4), so on a top-to-bottom run this cell
# raised NameError. Load the core-promoter intervals here before displaying them.
df_bed=bed_to_df(path+'Core_Prom_bed.tsv')
df_bed

NameError: ignored

# Step 1: Using Pybedtools to Analyze the Reference Genome

In [None]:
# Wrap the reference genome path in a BedTool so it can be passed as `fi` below.
fasta_file = BedTool(path+'GRCh38.primary_assembly.genome.fa')
# Headerless, tab-separated interval list: chrom / start / end.
df=pd.read_csv(path+'chr1_bed.tsv',sep='\t',header=None,names=['chrom','start','end'])
# Every end coordinate is shifted by one before extraction.
# NOTE(review): presumably the input ends are inclusive and this converts them
# to the exclusive ends bedtools expects — confirm against how chr1_bed.tsv was made.
df.end+=1
display(df.head())
# The adjusted intervals are written to a new file (no header, no index) and
# that file, not the in-memory frame, is what BedTool reads.
df.to_csv(path+'chr1_bed.rev.tsv',sep="\t", header=False,index=False)
bed_file=BedTool(path+'chr1_bed.rev.tsv')
# Extract the reference sequence of each interval; saved to disk in Step 2.
bed_with_seq = bed_file.sequence(fi=fasta_file)

Unnamed: 0,chrom,start,end
0,chr1,24401417,24401497
1,chr1,24383964,24384044
2,chr1,24379784,24379864
3,chr1,24373771,24373851
4,chr1,24369800,24369880


# Step 2: Convert a .BED File to a .CSV File

In [None]:
# Save the sequences extracted in Step 1 to `outputfile`.
# NOTE(review): save_seqs output is parsed as FASTA elsewhere in this notebook
# (Generate_Sequences -> fasta_to_df), so this file presumably contains FASTA
# text despite its .csv extension — confirm and consider renaming.
outputfile=path+'output_July17.csv'
bed_with_seq.save_seqs(outputfile)

<BedTool(/content/drive/MyDrive/CSIRE/Sequencing/chr1_bed.rev.tsv)>

In [None]:
# The file written by save_seqs holds FASTA records (Generate_Sequences parses
# the same kind of output with fasta_to_df). Reading it with pd.read_csv used
# the first header line as the column name and mixed the remaining header lines
# in as data rows, so parse it as FASTA instead.
df=fasta_to_df(outputfile)
# Length of the first extracted sequence
len(df.iloc[0].values[0])

# Step 3: Use Pandas to Create a Dataframe Using a .BED File and a Reference Genome

In [None]:
# Load the sample table; unlike the BED helpers, this read uses the first line as the header.
tsvfile=path+'sample_10.tsv'
# NOTE(review): `bedfile` is not used by any cell shown in this notebook. A
# `.gsheet` entry on Drive is presumably a link to a Google Sheet rather than
# tab-separated data — confirm before reading it.
bedfile=path+'Core_Prom.bed.gsheet'
# NOTE: `df` is reused here; it previously held the Step 1/Step 2 results.
df=pd.read_csv(tsvfile,sep='\t')
df

In [None]:
# Parses the same reference FASTA, into the same two lists, as the cell in the
# "Test: ... Reference Genome" section above; the whole genome is read again.
# NOTE(review): no later cell shown here reads `identifiers` or `lengths`, so
# this cell looks redundant — confirm before removing.
# `identifiers` holds sequence text, not record IDs.
with open(path+'GRCh38.primary_assembly.genome.fa') as fasta_file:
  identifiers=[]
  lengths=[]
  for seq_record in SeqIO.parse(fasta_file, 'fasta'):
    identifiers.append(str(seq_record.seq))
    lengths.append(len(seq_record.seq))

In [None]:
# Extract the reference sequence for every core-promoter interval.
# Generate_Sequences writes its intermediate FASTA to path+'out.fasta'.
# The recorded run of this cell ended in a KeyboardInterrupt.
df_seq=Generate_Sequences(path+'Core_Prom_bed.tsv',path+'GRCh38.primary_assembly.genome.fa')
df_seq.head(10)

KeyboardInterrupt: ignored

# Step 4: Find the Presence of Mutations in Core Promoter Regions

In [None]:
# Core-promoter intervals (chrom/start/end) and the variant calls to test against them.
df_bed=bed_to_df(path+'Core_Prom_bed.tsv')
# NOTE: the VCF path is an absolute Drive path and does not go through `path`.
# vcf_to_df prints the number of records it read.
df_vcf=vcf_to_df("/content/drive/MyDrive/CSIRE/VCF/VCFFiles/VCF2/TCGA_BRCA.e7bc7eaa-5912-47c3-8396-23ee0b4cd20a.wxs.MuTect2.somatic_annotation.vcf")

Total number of entries=8960


In [None]:
# Flag each variant that falls inside at least one interval of df_bed:
# same chromosome and start <= pos <= end, with both bounds inclusive.
# Each variant row is compared against the whole interval table.
# NOTE(review): pysam documents VariantRecord.pos as 1-based, while BED starts
# are conventionally 0-based and ends exclusive. If Core_Prom_bed.tsv follows
# that convention, pos == start is one base outside the interval — confirm
# which convention the file uses.
df_vcf['included_in_interval'] = df_vcf.apply(
    lambda row: ((df_bed['chrom'] == row['chrom'])
                 & df_bed['start'].le(row['pos'])
                 & df_bed['end'].ge(row['pos'])).any(),
    axis=1)

In [None]:
# Preview the first variants together with the new included_in_interval flag
df_vcf.head()

In [None]:
# Only the variants flagged as lying inside an interval
df_vcf.loc[df_vcf['included_in_interval'].eq(True)]

In [None]:
# Reverse direction: flag each interval that contains at least one variant
# (same chromosome and start <= pos <= end, both bounds inclusive).
# Each interval row is compared against the whole variant table.
df_bed['included_in_interval'] = df_bed.apply(
    lambda row: ((df_vcf['chrom'] == row['chrom'])
                 & df_vcf['pos'].ge(row['start'])
                 & df_vcf['pos'].le(row['end'])).any(),
    axis=1)
df_bed.head()

In [None]:
# Collapse repeated interval rows, then print for each table:
# flagged count, unflagged count, total.
df_bed=df_bed.drop_duplicates()
print(df_bed.included_in_interval.eq(True).sum(),df_bed.included_in_interval.eq(False).sum(),len(df_bed))
print(df_vcf.included_in_interval.eq(True).sum(),df_vcf.included_in_interval.eq(False).sum(),len(df_vcf))

In [None]:
# Intervals that contain at least one variant. The original wrote
# df_bed([...]), which calls the DataFrame like a function and raises
# TypeError; boolean-mask indexing is what was intended.
df_bed[df_bed.included_in_interval==True]

# Plotting

In [None]:
# y = np.array([8840,120])
# plt.pie(y)
# myexplode = [0,0.3]
# plt.pie(y, labels = ['not_included','included'],colors=["DimGrey",'CadetBlue'],explode=myexplode)
# plt.show()

In [None]:
# Bar chart: total variants vs. variants inside a promoter interval.
# The counts come from df_vcf rather than being typed in by hand (the original
# cell hard-coded 8960 and 120), so the figure follows the data when a
# different VCF or BED file is loaded.
labels = ['n_variants','n_in promoter_region']
fig, ax=plt.subplots(1,1,figsize=(4,4))
ax.bar(labels,[len(df_vcf),int(df_vcf.included_in_interval.sum())])
# Log scale because the two counts differ by orders of magnitude
ax.set_yscale('log')
ax.set_ylim(10,1e5)
ax.set_ylabel('count (log scale)')
plt.show()

In [None]:
# Bar chart: total promoter intervals vs. intervals containing a variant.
# The counts come from df_bed rather than being typed in by hand (the original
# cell hard-coded 209239 and 132), so the figure follows the data when the
# inputs change.
labels = ['n_regions','n_containing_variant']
fig, ax=plt.subplots(1,1,figsize=(4,4))
ax.bar(labels,[len(df_bed),int(df_bed.included_in_interval.sum())],color='orange')
# Log scale because the two counts differ by orders of magnitude
ax.set_yscale('log')
ax.set_ylim(10,1e6)
ax.set_ylabel('count (log scale)')
plt.show()

In [None]:
# Number of variant rows in df_vcf
len(df_vcf)

In [None]:
# Number of interval rows in df_bed (after the drop_duplicates step above)
len(df_bed)

In [None]:
#Use pybedtools.bedtool.BedTool.overlap or BedTool.overlap?