In [13]:
##### Imports
import pandas as pd

from scripts import functions

In [14]:
##### Constants
HG19_GENOME_DIR = 'data/hg19.gencode.v19.primary_genome_by_chromosome'
HG19_ANNOTATIONS_FILE = 'data/hg19.gencode.v19.primary_annotation.gtf'
HG38_GENOME_DIR = 'data/hg38.gencode.v41.primary_genome_by_chromosome'
HG38_ANNOTATIONS_FILE = 'data/hg38.gencode.v41.primary_annotation.gtf'

HG19_INTRON_BED = "data/hg19.ucsc_gencode_v19.introns.bed.gz"
HG19_INTRON_DIR = "data/hg19.ucsc_gencode_v19.introns_by_chromosome"
HG38_INTRON_BED = "data/hg38.ucsc_gencode_v41.introns.bed.gz"
HG38_INTRON_DIR = "data/hg38.ucsc_gencode_v41.introns_by_chromosome"

In [16]:
##### Settings
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows', 200)

In [17]:
##### Loading Data
hg19_introns = pd.read_csv(HG19_INTRON_BED, delimiter="\t", header=None)
hg19_introns
# hg38_introns = pd.read_csv(HG38_INTRON_BED, delimiter="\t", header=None)
# hg38_introns

Unnamed: 0,0,1,2,3,4,5
0,chr1,12227,12612,uc001aaa.3_intron_0_0_chr1_12228_f,0,+
1,chr1,12721,13220,uc001aaa.3_intron_1_0_chr1_12722_f,0,+
2,chr1,12227,12645,uc010nxr.1_intron_0_0_chr1_12228_f,0,+
3,chr1,12697,13220,uc010nxr.1_intron_1_0_chr1_12698_f,0,+
4,chr1,12227,12594,uc010nxq.1_intron_0_0_chr1_12228_f,0,+
...,...,...,...,...,...,...
659322,chr19_gl000209_random,140096,144360,uc002quo.2_intron_4_0_chr19_gl000209_random_14...,0,+
659323,chr19_gl000209_random,144465,144927,uc002quo.2_intron_5_0_chr19_gl000209_random_14...,0,+
659324,chr19_gl000209_random,144980,145078,uc002quo.2_intron_6_0_chr19_gl000209_random_14...,0,+
659325,chr19_gl000209_random,149474,150938,uc002qup.1_intron_0_0_chr19_gl000209_random_14...,0,+


In [18]:
# ##### 
hg19_introns.columns = ["Chromosome", "Start", "End", "ID", "Nothing_but_zeroes", "Strand"]
hg19_introns.drop(columns=["Nothing_but_zeroes"], inplace=True)
hg19_introns = hg19_introns[~hg19_introns.Chromosome.str.contains("_")]
hg19_introns.Start = hg19_introns.Start.apply(lambda pos: pos + 1)
hg19_introns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Chromosome,Start,End,ID,Strand
0,chr1,12228,12612,uc001aaa.3_intron_0_0_chr1_12228_f,+
1,chr1,12722,13220,uc001aaa.3_intron_1_0_chr1_12722_f,+
2,chr1,12228,12645,uc010nxr.1_intron_0_0_chr1_12228_f,+
3,chr1,12698,13220,uc010nxr.1_intron_1_0_chr1_12698_f,+
4,chr1,12228,12594,uc010nxq.1_intron_0_0_chr1_12228_f,+
...,...,...,...,...,...
631330,chr22,51222450,51223600,uc003bnr.1_intron_0_0_chr22_51222450_f,+
631331,chr22,51223722,51227177,uc003bnr.1_intron_1_0_chr22_51223722_f,+
631332,chr22,51227227,51227319,uc003bnr.1_intron_2_0_chr22_51227227_f,+
631333,chr22,51222293,51223600,uc010hbj.3_intron_0_0_chr22_51222293_f,+


In [19]:
# ##### 
chrom = "chrY"
# chrom = sys.argv[1]
fasta_file = HG19_INTRON_DIR + f"/{chrom}.fa"
print(chrom, ":", fasta_file)
with open(fasta_file, "w") as w:
	pass
	# w.write("#All intron sequences are forward-strand sequences, irrespective of the intron's orientation\n#Some introns are of length 0 (Start=Stop) and have an empty line where a DNA sequence would be\n#ID|Start|End|Strand\n")

chrY : /datasets2/genomes/fasta_files/hg19.ucsc_gencode_v19.introns_by_chromosome/chrY.fa


In [20]:
# ##### 
target_introns = hg19_introns[hg19_introns.Chromosome == chrom]
target_introns.sort_values(by=["Start"], inplace=True)
intron_count = target_introns.shape[0]

i = 0
for _, row in target_introns.iterrows():
	id = (f'>{row["ID"]}|{row["Start"]}|{row["End"]}|{row["Strand"]}')
	if row["Start"] == row["End"]:
		seq = ""
	else:
		seq = functions.get_sequence(HG19_GENOME_DIR, 
									chrom,
									row["Start"],
									row["End"],
									row["Strand"],
									reverse_complement=False)
	
	with open(fasta_file, "a") as a:
		a.write(id + "\n" + seq + "\n")

	i += 1
	if i % 1_000 == 0:
		print(f"{i:,} of {intron_count:,}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1,000 of 4,896
2,000 of 4,896
3,000 of 4,896
4,000 of 4,896


In [None]:
# ##### 
# hg38_introns.columns = ["Chromosome", "Start", "End", "ID", "Nothing_but_zeroes", "Strand"]
# hg38_introns.drop(columns=["Nothing_but_zeroes"], inplace=True)
# hg38_introns = hg38_introns[~hg38_introns.Chromosome.str.contains("_")]
# hg38_introns.Start = hg38_introns.Start.apply(lambda pos: pos + 1)
# hg38_introns

In [None]:
# ##### 
# # chrom = "chr1"
# chrom = sys.argv[1]
# fasta_file = HG38_INTRON_DIR + f"/{chrom}.fa"
# print(chrom, ":", fasta_file)
# with open(fasta_file, "w") as w:
# 	pass

In [None]:
# ##### 
# target_introns = hg38_introns[hg38_introns.Chromosome == chrom]
# target_introns.sort_values(by=["Start"], inplace=True)
# intron_count = target_introns.shape[0]

# i = 0
# for _, row in target_introns.iterrows():
# 	id = (f'>{row["ID"]}|{row["Start"]}|{row["End"]}|{row["Strand"]}')
# 	if row["Start"] == row["End"]:
# 		seq = ""
# 	else:
# 		seq = functions.get_sequence(HG38_GENOME_DIR, 
# 									chrom,
# 									row["Start"],
# 									row["End"],
# 									row["Strand"],
# 									reverse_complement=False)
	
# 	with open(fasta_file, "a") as a:
# 		a.write(id + "\n" + seq + "\n")

# 	i += 1
# 	if i % 100_000 == 0: 
# 		print(f"{i:,} introns of {intron_count:,}")

In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


In [None]:
##### 


#####	Discarded code	#####