# CCDS Tagging Target Search

## Overview:
- Intended to search for relevant targets in human exons given parameters about target site and position
- Reads in nucleotide sequences of human genes from the CCDS files of the human genome
- Current strategy for capturing 3' UTRs:
    - Search for TTAC sequences near the 3' end of genes
    - From CCDS IDs, look up ENST IDs using ENSEMBL Biomart
    - Find relevant ENST IDs in gencode.v38.pc_transcripts
    - Check for PAM etc.

## To Do:
- Find a different input file or modify current for better analysis of sense strand targets
    - Sequences all end at the stop codon of the gene
    - This prevents a good search of sense strand TTACs very close to the end as the spacer sequence would be further downstream... would want 3' UTRs for all of these genes
- Search in the RC direction
    - Trivial to implement computationally, unclear if relevant biologically
    

In [1]:
# Import generally useful packages
import csv
import itertools
import sys
import os
import math
import numpy as np
from tqdm.notebook import tnrange, tqdm_notebook, tqdm

#Packages for data analysis
import pandas as pd

#Packages for sequence parsing
import regex as re
from Bio import SeqIO

In [95]:
#Pre-process CCDS key into dictionary format to speed up name look-up
genes = pd.read_csv('CCDS.20180614.csv')
gene_dict = {}

for index, gene in genes.iterrows():
    gene_dict[gene["ccds_id"]]=gene["gene"]
    
genes

Unnamed: 0,#chromosome,nc_accession,gene,gene_id,ccds_id,ccds_status,cds_strand,cds_from,cds_to,cds_locations,match_type
0,1,NC_000001.8,LINC00115,79854,CCDS1.1,Withdrawn,-,801942,802433,[801942-802433],Identical
1,1,NC_000001.11,SAMD11,148398,CCDS2.2,Public,+,925941,944152,"[925941-926012, 930154-930335, 931038-931088, ...",Identical
2,1,NC_000001.11,NOC2L,26155,CCDS3.1,Public,-,944693,959239,"[944693-944799, 945056-945145, 945517-945652, ...",Identical
3,1,NC_000001.11,PLEKHN1,84069,CCDS4.1,Public,+,966531,974574,"[966531-966613, 966703-966802, 970276-970422, ...",Identical
4,1,NC_000001.11,HES4,57801,CCDS5.1,Public,-,999058,999972,"[999058-999431, 999525-999612, 999691-999786, ...",Identical
...,...,...,...,...,...,...,...,...,...,...,...
35133,X,NC_000023.11,HSFX4,101927685,CCDS87786.1,Public,+,149929644,149931104,"[149929644-149930126, 149930586-149931104]",Identical
35134,X,NC_000023.11,VMA21,203547,CCDS87789.1,Public,+,151396839,151405057,"[151396839-151397056, 151403630-151403739, 151...",Identical
35135,X,NC_000023.11,PNMA6F,105373377,CCDS87792.1,Public,-,153318937,153320673,[153318937-153320673],Identical
35136,X,NC_000023.11,PNMA6E,649238,CCDS87793.1,Public,-,153396905,153398848,"[153396905-153397871, 153398706-153398848]",Identical


In [3]:
#Examine file structure of parsed sequences from CCDS
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
next(genesequences)

#Note that each sequence has a name/id, and ends with stop codon of gene

SeqRecord(seq=Seq('ATGTCCAAGGGGATCCTGCAGGTGCATCCTCCGATCTGCGACTGCCCGGGCTGC...TGA', SingleLetterAlphabet()), id='CCDS2.2|Hs109|chr1', name='CCDS2.2|Hs109|chr1', description='CCDS2.2|Hs109|chr1', dbxrefs=[])

In [4]:
#SaCas9 search in the forward direction, TTACs in window of 10-17, any frame

# Parameters:
search_regex = "TTAC\w{8,15}G[G|A][G|A]T" #Regex for sequence to match
distance_from_end = 45 #Maximum distance from 3' end of gene to TTAC = last 20 aas
gRNA_length = 27 #Includes PAM
PAM_length = 6
file_name = "SaCas9_10-17_Forward_15aa"

'''
SaCas9 has a gRNA structure of GN(21)NNGRRT
PAM regex is thus "\w{2}G(G|A)(G|A)T"
Want TTAC from window of 10-17 -> "TTAC\w{8-15}G[G|A][G|A]T"
'''

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    target_RC_search = re.finditer(search_regex, str(sequence.seq)) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        real_start = start-(gRNA_length-len(match.group())) #Compensates for parts of gRNA missing from match
        if real_start+1 >= len(sequence)-distance_from_end:
            TTAC_pos = len(match.group())-PAM_length
            frame = (start+4)%3 + 1
            gRNA_dist = len(sequence)-start-4
            gRNA_sequence = str(sequence.seq)[real_start:real_start+gRNA_length]
            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
            good_sequences.append(sequence)
            filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), TTAC_pos, frame, gRNA_dist])
            counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "TTAC position", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 150


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,TTAC position,Frame,Distance from 3' End
0,CCDS746.1|Hs109|chr1,GCLM,TACATTTTACAAGCTAAAAGAAGGGGT,ATGGGCACCGACAGCCGCGCGGCCAAGGCGCTCCTGGCGCGGGCCC...,15,2,23
1,CCDS1037.1|Hs109|chr1,S100A12,CTGCCCATTACCACACCCACAAAGAGT,ATGACAAAACTTGAAGAGCATCTGGAGGGAATTGTCAATATCTTCC...,14,1,18
2,CCDS1285.1|Hs109|chr1,C1orf112,TAAAACGTTACATACATACTCTAGGGT,ATGTTTTTACCTCATATGAACCACCTGACATTGGAACAGACTTTCT...,14,1,18
3,CCDS1288.1|Hs109|chr1,KIFAP3,ATGAACCTTACTACTATGGCTATGGAT,ATGCAAGGGGAGGACGCCAGATACCTCAAAAGGAAAGTTAAAGGAG...,14,1,21
4,CCDS1362.1|Hs109|chr1,C1orf21,CGGGATTACTGTTCGGAAGAAGAGGAT,ATGGGCTGTGCCTCCGCCAAGCATGTTGCCACTGTTCAAAATGAAG...,16,1,27
...,...,...,...,...,...,...,...
145,CCDS86187.1|Hs109|chr11,LGR4,GCGCTATGCTTACAATCTACCAAGAGT,ATGCCGGGCCCGCTAGGGCTGCTCTGCTTCCTCGCCCTGGGGCTGC...,12,1,24
146,CCDS86584.1|Hs109|chr17,B9D1,CATACAGGTTACAGCCTGGGAGTGGGT,ATGGCGACCGCGAGTCCTAGCGTCTTTCTACTCATGGTCAACGGGC...,13,3,34
147,CCDS86588.1|Hs109|chr17,ADAP2,AACCGGCTTACTGCATCAACAGAGAGT,ATGGGCGATCGCGAGCGCAACAAGAAGCGGCTGCTGGAGCTGCTGC...,14,3,34
148,CCDS87280.1|Hs109|chr4,HPGD,CCCAGGGTTACTTTAGAACTGCTGGAT,ATGCACGTGAACGGCAAAGTGGCGCTGGTGACCGGCGCGGCTCAGG...,14,1,30


In [5]:
#SaCas9 search in the forward direction, TTACs in window of 10-17, any frame

# Parameters:
search_regex = "TTAC\w{8,15}G[G|A][G|A]T" #Regex for sequence to match
distance_from_end = 60 #Maximum distance from 3' end of gene to TTAC = last 20 aas
gRNA_length = 27 #Includes PAM
PAM_length = 6
file_name = "SaCas9_10-17_Forward_20aa"

'''
SaCas9 has a gRNA structure of GN(21)NNGRRT
PAM regex is thus "\w{2}G(G|A)(G|A)T"
Want TTAC from window of 10-17 -> "TTAC\w{8-15}G[G|A][G|A]T"
'''

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    target_RC_search = re.finditer(search_regex, str(sequence.seq)) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        real_start = start-(gRNA_length-len(match.group())) #Compensates for parts of gRNA missing from match
        if real_start+1 >= len(sequence)-distance_from_end:
            TTAC_pos = len(match.group())-PAM_length
            frame = (start+4)%3 + 1
            gRNA_dist = len(sequence)-start-4
            gRNA_sequence = str(sequence.seq)[real_start:real_start+gRNA_length]
            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
            good_sequences.append(sequence)
            filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), TTAC_pos, frame, gRNA_dist])
            counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "TTAC position", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 290


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,TTAC position,Frame,Distance from 3' End
0,CCDS627.1|Hs109|chr1,UBE2U,CAGTATTACAAATGGAAGAAAATGGAT,ATGCACGGCAGAGCTTACCTCTTGCTGCACAGAGACTTCTGTGATC...,16,1,51
1,CCDS694.1|Hs109|chr1,DNASE2B,GGCAAATTTACCAAGCATTTCAAGGAT,ATGCCCCAGCTGTGCACCAGGGCCAGCTCATCAGAGATTCCTGGCA...,14,1,45
2,CCDS746.1|Hs109|chr1,GCLM,TACATTTTACAAGCTAAAAGAAGGGGT,ATGGGCACCGACAGCCGCGCGGCCAAGGCGCTCCTGGCGCGGGCCC...,15,2,23
3,CCDS867.1|Hs109|chr1,HIPK1,ACAATTTACACTGGATACCCGCTGAGT,ATGGCATCACAGCTGCAAGTGTTTTCGCCCCCATCAGTGTCGTCGA...,16,1,51
4,CCDS869.1|Hs109|chr1,HIPK1,ACAATTTACACTGGATACCCGCTGAGT,ATGGTTTTGATGTTTCAGATTCGTTATATTTCACAAACACAAGGCT...,16,1,51
...,...,...,...,...,...,...,...
285,CCDS86966.1|Hs109|chr20,OSBPL2,TATGCAGGGGATTACTTTGAGCGGAAT,ATGCCAATCGCCTTCAACGAGCCTCTGAGCTTCTTGCAGCGGATCA...,10,1,39
286,CCDS87159.1|Hs109|chr3,SLC33A1,AATTTAAAAAGTTACAGGATGAAGGAT,ATGTCACCCACCATCTCCCACAAGGACAGCAGCCGGCAACGGCGGC...,10,2,44
287,CCDS87280.1|Hs109|chr4,HPGD,CCCAGGGTTACTTTAGAACTGCTGGAT,ATGCACGTGAACGGCAAAGTGGCGCTGGTGACCGGCGCGGCTCAGG...,14,1,30
288,CCDS87482.1|Hs109|chr7,BZW2,AGTGGTTACAAAATGCAGAAGAAGAAT,ATGGCAGAAAAAGATGCCAACTCTGTTACCTCGTCTTTGAGAAAAG...,16,2,44


In [93]:
#SaCas9 search in the forward direction, TTACs in window of 8-17, any frame

# Parameters:
search_regex = "TTAC\w{6,15}G[G|A][G|A]T" #Regex for sequence to match
distance_from_end = 60 #Maximum distance from 3' end of gene to TTAC = last 20 aas
gRNA_length = 27 #Includes PAM
PAM_length = 6
file_name = "SaCas9_8-17_Forward_20aa"

'''
SaCas9 has a gRNA structure of GN(21)NNGRRT
PAM regex is thus "\w{2}G(G|A)(G|A)T"
Want TTAC from window of 8-17 -> "TTAC\w{6-15}G[G|A][G|A]T"
'''

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    target_RC_search = re.finditer(search_regex, str(sequence.seq)) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        real_start = start-(gRNA_length-len(match.group())) #Compensates for parts of gRNA missing from match
        if real_start+1 >= len(sequence)-distance_from_end:
            TTAC_pos = len(match.group())-PAM_length
            frame = (start+4)%3 + 1
            gRNA_dist = len(sequence)-start-4
            gRNA_sequence = str(sequence.seq)[real_start:real_start+gRNA_length]
            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
            good_sequences.append(sequence)
            filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), TTAC_pos, frame, gRNA_dist])
            counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "TTAC position", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 354


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,TTAC position,Frame,Distance from 3' End
0,CCDS157.1|Hs109|chr1,CELA2A,GCGGGTCTCCAATTACATCGACTGGAT,ATGATAAGGACGCTGCTGCTGTCCACTTTGGTGGCTGGAGCCCTCA...,9,1,36
1,CCDS450.1|Hs109|chr1,COL9A2,CCTCTGCCCGCCTTACAGAGCCTGGAT,ATGGCCGCCGCTACGGCCTCCCCCCGCAGCCTCCTTGTTCTCCTCC...,9,3,28
2,CCDS627.1|Hs109|chr1,UBE2U,CAGTATTACAAATGGAAGAAAATGGAT,ATGCACGGCAGAGCTTACCTCTTGCTGCACAGAGACTTCTGTGATC...,16,1,51
3,CCDS694.1|Hs109|chr1,DNASE2B,GGCAAATTTACCAAGCATTTCAAGGAT,ATGCCCCAGCTGTGCACCAGGGCCAGCTCATCAGAGATTCCTGGCA...,14,1,45
4,CCDS746.1|Hs109|chr1,GCLM,TACATTTTACAAGCTAAAAGAAGGGGT,ATGGGCACCGACAGCCGCGCGGCCAAGGCGCTCCTGGCGCGGGCCC...,15,2,23
...,...,...,...,...,...,...,...
349,CCDS87159.1|Hs109|chr3,SLC33A1,AATTTAAAAAGTTACAGGATGAAGGAT,ATGTCACCCACCATCTCCCACAAGGACAGCAGCCGGCAACGGCGGC...,10,2,44
350,CCDS87280.1|Hs109|chr4,HPGD,CCCAGGGTTACTTTAGAACTGCTGGAT,ATGCACGTGAACGGCAAAGTGGCGCTGGTGACCGGCGCGGCTCAGG...,14,1,30
351,CCDS87397.1|Hs109|chr6,OARD1,GCGATATATATATTACTTGGATTGGAT,ATGGCCAGCAGCCTTAATGAAGATCCAGAAGGAAGCAGAATCACTT...,9,1,21
352,CCDS87482.1|Hs109|chr7,BZW2,AGTGGTTACAAAATGCAGAAGAAGAAT,ATGGCAGAAAAAGATGCCAACTCTGTTACCTCGTCTTTGAGAAAAG...,16,2,44


In [6]:
#SpCas9 search in the forward direction, TTAC at position 20, any frame

# Parameters:
search_regex = "TTAC\w{17}GG" #Regex for sequence to match
distance_from_end = 45 #Maximum distance from 3' end of gene to TTAC = last 15 aas
gRNA_length = 23 #Includes PAM
PAM_length = 3
file_name = "SpCas9_20_Forward_15aa"

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    target_RC_search = re.finditer(search_regex, str(sequence.seq)) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        real_start = start-(gRNA_length-len(match.group())) #Compensates for parts of gRNA missing from match
        if real_start+1 >= len(sequence)-distance_from_end:
            TTAC_pos = len(match.group())-PAM_length
            frame = (start+4)%3 + 1
            gRNA_dist = len(sequence)-start-4
            gRNA_sequence = str(sequence.seq)[real_start:real_start+gRNA_length]
            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
            good_sequences.append(sequence)
            filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), TTAC_pos, frame, gRNA_dist])
            counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "TTAC position", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 90


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,TTAC position,Frame,Distance from 3' End
0,CCDS157.1|Hs109|chr1,CELA2A,TTACATCGACTGGATCAATTCGG,ATGATAAGGACGCTGCTGCTGTCCACTTTGGTGGCTGGAGCCCTCA...,20,1,36
1,CCDS348.1|Hs109|chr1,PTP4A2,TTACGCTTCAGAGATACCAATGG,ATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAACATGCGTT...,20,2,38
2,CCDS442.1|Hs109|chr1,PPIE,TTACCAGCCAGCCGAGGTCCTGG,ATGGCCACCACCAAGCGCGTCTTGTACGTGGGTGGACTGGCAGAGG...,20,3,31
3,CCDS1059.1|Hs109|chr1,RPS27,TTACAGAAGGATGTTCCTTCAGG,ATGCCTCTCGCAAAGGATCTCCTTCATCCCTCTCCAGAAGAGGAGA...,20,3,34
4,CCDS1230.1|Hs109|chr1,SDHC,TTACTGTGTTGTCCTCTATGGGG,ATGGCTGCGCTGTTGCTGAGACACGTTGGTCGTCATTGCCTCCGAG...,20,3,34
...,...,...,...,...,...,...,...
85,CCDS76767.1|Hs109|chr15,CA12,TTACAAGCCAGCCACCAAGATGG,ATGCCCCGGCGCAGCCTGCACGCGGCGGCCGTGCTCCTGCTGGTGA...,20,1,39
86,CCDS77519.1|Hs109|chr2,KANSL1L,TTACTAATGTTAAAAAAAATAGG,ATGACCCCAGCTCTGAGGGAGGCAACAGCAAAGGGTATCAGCTTTT...,20,3,22
87,CCDS78135.1|Hs109|chr6,SAYSD1,TTACAGTTGAGACCCCTGGCAGG,ATGCTGGAAGCGGCTCAGCCCCAGGGCAGCACATCAGAGACACCAT...,20,2,26
88,CCDS82507.1|Hs109|chr2,TEX51,TTACTGGCAAAGGAAGAAGGAGG,ATGCTGCCTCTCCTGATCATCTGTCTCCTGCCTGCCATTGAAGGGA...,20,3,25


In [7]:
#SpCas9 search in the forward direction, TTAC at position 20, any frame

#Parameters:
search_regex = "TTAC\w{17}GG" #Regex for sequence to match
distance_from_end = 60 #Maximum distance from 3' end of gene to TTAC = last 20 aas
gRNA_length = 23 #Includes PAM
PAM_length = 3
file_name = "SpCas9_20_Forward_20aa"

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    target_RC_search = re.finditer(search_regex, str(sequence.seq)) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        real_start = start-(gRNA_length-len(match.group())) #Compensates for parts of gRNA missing from match
        if real_start+1 >= len(sequence)-distance_from_end:
            TTAC_pos = len(match.group())-PAM_length
            frame = (start+4)%3 + 1
            gRNA_dist = len(sequence)-start-4
            gRNA_sequence = str(sequence.seq)[real_start:real_start+gRNA_length]
            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
            good_sequences.append(sequence)
            filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), TTAC_pos, frame, gRNA_dist])
            counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "TTAC position", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 136


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,TTAC position,Frame,Distance from 3' End
0,CCDS157.1|Hs109|chr1,CELA2A,TTACATCGACTGGATCAATTCGG,ATGATAAGGACGCTGCTGCTGTCCACTTTGGTGGCTGGAGCCCTCA...,20,1,36
1,CCDS348.1|Hs109|chr1,PTP4A2,TTACGCTTCAGAGATACCAATGG,ATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAACATGCGTT...,20,2,38
2,CCDS442.1|Hs109|chr1,PPIE,TTACCAGCCAGCCGAGGTCCTGG,ATGGCCACCACCAAGCGCGTCTTGTACGTGGGTGGACTGGCAGAGG...,20,3,31
3,CCDS1059.1|Hs109|chr1,RPS27,TTACAGAAGGATGTTCCTTCAGG,ATGCCTCTCGCAAAGGATCTCCTTCATCCCTCTCCAGAAGAGGAGA...,20,3,34
4,CCDS1230.1|Hs109|chr1,SDHC,TTACTGTGTTGTCCTCTATGGGG,ATGGCTGCGCTGTTGCTGAGACACGTTGGTCGTCATTGCCTCCGAG...,20,3,34
...,...,...,...,...,...,...,...
131,CCDS77519.1|Hs109|chr2,KANSL1L,TTACTAATGTTAAAAAAAATAGG,ATGACCCCAGCTCTGAGGGAGGCAACAGCAAAGGGTATCAGCTTTT...,20,3,22
132,CCDS78135.1|Hs109|chr6,SAYSD1,TTACAGTTGAGACCCCTGGCAGG,ATGCTGGAAGCGGCTCAGCCCCAGGGCAGCACATCAGAGACACCAT...,20,2,26
133,CCDS82180.1|Hs109|chr17,TBX4,TTACAGTACCATTCAGGAATGGG,ATGCTGCAGGATAAGGGCCTGTCCGAGAGCGAGGAGGCCTTCCGGG...,20,2,47
134,CCDS82507.1|Hs109|chr2,TEX51,TTACTGGCAAAGGAAGAAGGAGG,ATGCTGCCTCTCCTGATCATCTGTCTCCTGCCTGCCATTGAAGGGA...,20,3,25


In [17]:
#Search in the forward direction for SpCas9 gRNAs that overlap with stop/5' UTR, TTAC at position 20, any frame

#Parameters:
search_regex = "(TTAC\w{0,17}$)|(TTAC\w{17}G$)" #Regex for sequence to match
file_name = "5_UTR_Overlapping_Forward_SpCas9"

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load file
genesequences = SeqIO.parse("CCDS_nucleotide.current.fna", "fasta")
pbar = tqdm(genesequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    last24 = str(sequence.seq)[-24:] #If you don't truncate, regex search totally kills your memory :(
    target_RC_search = re.finditer(search_regex, last24) #Regex match iterator
    for match in target_RC_search:
        (start, stop) = match.span()
        frame = (start+4)%3 + 1
        gRNA_dist = 24-start-4
        gRNA_sequence = last24[start:]
        gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
        good_sequences.append(sequence)
        filtered_data.append([sequence.name, gene_name, gRNA_sequence, str(sequence.seq), frame, gRNA_dist])
        counter += 1
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")

initial_filtered_sequences = good_sequences[:]

df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "Gene Name", "gRNA", "Sequence", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Number of matching sequences: 1146


Unnamed: 0,CCDS Name,Gene Name,gRNA,Sequence,Frame,Distance from 3' End
0,CCDS246.1|Hs109|chr1,PNRC2,TTACTTAAAGTACAGGTATAA,ATGGGTGGTGGAGAGAGGTATAACATTCCAGCCCCTCAATCTAGAA...,2,17
1,CCDS248.1|Hs109|chr1,IFNLR1,TTACATGGCCAGGTGA,ATGGCGGGGCCCGAGCGCTGGGGCCCCCTGCTCCTGTGCCTGCTGC...,1,12
2,CCDS249.1|Hs109|chr1,IFNLR1,TTACATGGCCAGGTGA,ATGGCGGGGCCCGAGCGCTGGGGCCCCCTGCTCCTGTGCCTGCTGC...,1,12
3,CCDS406.1|Hs109|chr1,EVA1B,TTACTGA,ATGGATGCCCCGCGAAGGGACATGGAGTTGCTCAGCAACAGCCTGG...,1,3
4,CCDS467.1|Hs109|chr1,ZMYND12,TTACTTAG,ATGAATGTGATCTACCCACTGGCAGTCCCCAAGGGGCGCAGACTCT...,3,4
...,...,...,...,...,...,...
1141,CCDS87332.1|Hs109|chr5,ARHGAP26,TTACGTGGAGTTCCTCTAA,ATGGAGCGTGCCCTGCCGAGGGGGCGCTGCCTCCCCTTGGGAAAAG...,1,15
1142,CCDS87340.1|Hs109|chr5,PPP1R2B,TTACGAAGTTCATAG,ATGGCGGCCTCGACGGCCTCCCACCGGCCCATCAAGGGGATCTTGA...,2,11
1143,CCDS87395.1|Hs109|chr6,OARD1,TTACTGTGTACACACTCTGA,ATGGGCGCTGGGATAGCTGTCCTCTTTAAGAAGAAATTTGGAGGGG...,3,16
1144,CCDS87430.1|Hs109|chr6,CD164,TTACCACACTCTGTAA,ATGCGAAAAGGAAGAAAAGTACCCATGTATGTACCGGGTGTTTTGC...,1,12


In [42]:
print(initial_filtered_sequences[0])
current_CCDS_id = re.match("(CCDS.*)\.\d", str(initial_filtered_sequences[0].name)).group(1)
print(current_CCDS_id)

ID: CCDS246.1|Hs109|chr1
Name: CCDS246.1|Hs109|chr1
Description: CCDS246.1|Hs109|chr1
Number of features: 0
Seq('ATGGGTGGTGGAGAGAGGTATAACATTCCAGCCCCTCAATCTAGAAATGTTAGT...TAA', SingleLetterAlphabet())
CCDS246


In [50]:
#IGNORE THIS - IS DEPRECATED, kept for comments
'''
#Create dictionary of CDS to ENST, and a filtered set of ENST sequence names
good_ENSTs = set()
CCDStoENST = {}

with open('5UTR_CCDS_to_ENST.csv', newline='') as file:
    linereader = csv.reader(file, delimiter=' ', quotechar='|')
    for line in linereader:
        CCDS_id, ENST_id = line[0].split(',')
        good_ENSTs.add(ENST_id)
        CCDStoENST[CCDS_id]=ENST_id
        
#Filter ENST sequences for faster parsing:
good_ENST_sequences = {}

#Load file
ENSTsequences = SeqIO.parse("gencode.v38.pc_transcripts.fa", "fasta")
pbar = tqdm(ENSTsequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    ENSTname = re.match("ENST\d*\.\d",sequence.name).group()
    if ENSTname in good_ENSTs:
        good_ENST_sequences[ENSTname] = sequence
        
print(len(good_ENST_sequences))
'''
#Somehow there are sequences lost here, went from 1129 CDS to ENST matches to 996 filtered ENST sequences
#I suspect this is a version issue, so will instead try with no version numbers

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


996


In [51]:
#Create dictionary of CDS to ENST, and a filtered set of ENST sequence names
good_ENSTs = set()
CCDStoENST = {}

with open('5UTR_CCDS_to_ENST.csv', newline='') as file:
    linereader = csv.reader(file, delimiter=' ', quotechar='|')
    for line in linereader:
        CCDS_id, ENST_id_initial = line[0].split(',')
        ENST_id = re.match("(ENST\d*)\.\d", ENST_id_initial).group(1)
        good_ENSTs.add(ENST_id)
        CCDStoENST[CCDS_id]=ENST_id
        
#Filter ENST sequences for faster parsing:
good_ENST_sequences = {}

#Load file
ENSTsequences = SeqIO.parse("gencode.v38.pc_transcripts.fa", "fasta")
pbar = tqdm(ENSTsequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    ENSTname = re.match("(ENST\d*)\.\d",sequence.name).group(1)
    if ENSTname in good_ENSTs:
        good_ENST_sequences[ENSTname] = sequence
        
print(len(good_ENST_sequences))
#Now, we are up to 1126 out of 1129. That is good enough for me. 
#One missing is from key issues reading in the start of .csv for CCDS
#Not sure about the other two

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


1126


In [92]:
#Search in the forward direction for SpCas9 gRNAs that overlap with stop/5' UTR, TTAC at position 20, any frame
#Part 2, with ENST gRNAs

#Parameters:
search_regex = "(TTAC\w{0,17}$)|(TTAC\w{17}G$)" #Regex for sequence to match
file_name = "5_UTR_Overlapping_Forward_SpCas9_Filtered"

#Initialize relevant variables
counter = 0
good_sequences = []
filtered_data = []

#Load sequences
pbar = tqdm(initial_filtered_sequences)

#Iterate through sequences and search for matches
for sequence in pbar: 
    #Extract CCDS ID and find corresponding ENST ID
    current_CCDS = re.match("(CCDS.*)\.\d", sequence.name).group(1)
    current_ENST_name = CCDStoENST.get(current_CCDS)
    if current_ENST_name:
        current_ENST = good_ENST_sequences.get(current_ENST_name) #Dictionary lookup for ENST sequence
        if current_ENST is not None:
            #Find correct match from before in CCDS
            last50 = str(sequence.seq)[-50:] #If you don't truncate, regex search totally kills your memory :(
            target_RC_search = re.finditer(search_regex, last50) #Regex match iterator
            for match in target_RC_search:
                (start, stop) = match.span()
                ENST_seq_to_match = last50[:start] #Set TTAC-preceeding sequence to find in ENST
                ENST_match = re.finditer(ENST_seq_to_match, str(current_ENST.seq))
                for possible_seq in ENST_match:
                    (ENST_ignore, ENST_start) = possible_seq.span()
                    if str(current_ENST.seq)[ENST_start:ENST_start+4] == "TTAC": #Check that is TTAC site
                        if str(current_ENST.seq)[ENST_start+21:ENST_start+23] == "GG": #Check for PAM
                            gRNA_sequence = str(current_ENST.seq)[ENST_start:ENST_start+23]
                            frame = (start+4)%3 + 1
                            gRNA_dist = 50-start-4
                            gene_name = gene_dict[re.match("CCDS.*\.\d", sequence.name).group()]
                            good_sequences.append(current_ENST)
                            filtered_data.append([sequence.name, current_ENST_name, gene_name, gRNA_sequence, str(sequence.seq), str(current_ENST.seq), frame, gRNA_dist])
                            counter += 1
        
    
print(f"Number of matching sequences: {counter}")

#Write data to relevant files
with open(file_name+"_ENST_Matches.fasta", "w") as output_handle:
        SeqIO.write(good_sequences, output_handle, "fasta")
        
df = pd.DataFrame(filtered_data, columns = ["CCDS Name", "ENST Name", "Gene Name", "gRNA", "CCDS Sequence", "ENST Sequence", "Frame", "Distance from 3' End"])
df.to_csv(file_name+".csv")
df

HBox(children=(FloatProgress(value=0.0, max=1146.0), HTML(value='')))


Number of matching sequences: 76


Unnamed: 0,CCDS Name,ENST Name,Gene Name,gRNA,CCDS Sequence,ENST Sequence,Frame,Distance from 3' End
0,CCDS503.1|Hs109|chr1,ENST00000372343,IPO13,TTACACAGCTGACTACTGAGGGG,ATGGAGCGGCGGGAGGAGCAGCCGGGGGCTGCAGGGGCTGGAGCAG...,GCTTGTCTTGTCAGTCACTGGGGCGGAGGCAGCGGCTGTAGCGGGG...,3,15
1,CCDS722.1|Hs109|chr1,ENST00000370459,GBP5,TTACTCTAAAGTGCTAAATATGG,ATGGCTTTAGAGATCCACATGTCAGACCCCATGTGCCTCATCGAGA...,GCTAATTGTTGTAGATCATCACTTCAAGGTGCCCATATCTTTCTAG...,1,5
2,CCDS1037.1|Hs109|chr1,ENST00000368737,S100A12,TTACCACACCCACAAAGAGTAGG,ATGACAAAACTTGAAGAGCATCTGGAGGGAATTGTCAATATCTTCC...,CTTCCTTGGCTCAGTGCCCTTCACCACTGCTGGCTTTTTGCTGTAG...,3,18
3,CCDS1318.1|Hs109|chr1,ENST00000367674,TNR,TTACAGTTCTGAGCAGTGGGCGG,ATGGGGGCAGATGGGGAAACAGTGGTTCTGAAGAACATGCTCATTG...,ACGTCAGAGGCAGGAACCGACTGTGCTAAGGCTGTTGGCTCAACAC...,1,8
4,CCDS1997.1|Hs109|chr2,ENST00000390655,CD8B,TTACAAATGAGCAGAGAATACGG,ATGCGGCCGCGGCTGTGGCTCCTCTTGGCCGCGCAGCTGACAGTTC...,AGGTGTCCCGGGCGCGCCACGATGCGGCCGCGGCTGTGGCTCCTCT...,3,6
...,...,...,...,...,...,...,...,...
71,CCDS82410.1|Hs109|chr19,ENST00000425570,ZNF418,TTACAAGTGAAAAGAAATTTGGG,ATGCAAGAAGGCCTTCATAGAATGACTTGGGACTGCATTGCTATTG...,CTCTGGTAGCGACCATTTTGGTTAATGTTGGGTGTGTTTCTGCGGT...,3,6
72,CCDS82506.1|Hs109|chr2,ENST00000452780,EPB41L5,TTACTGACCACTGAGCTCTGAGG,ATGCTGAGTTTCTTCCGTAGAACACTAGGGCGTCGGTCTATGCGTA...,ATGCTGAGTTTCTTCCGTAGAACACTAGGGCGTCGGTCTATGCGTA...,1,17
73,CCDS82829.1|Hs109|chr3,ENST00000460856,KALRN,TTACGTTTAGCGCGCATCCTGGG,ATGACGGACCGCTTCTGGGACCAGTGGTATCTCTGGTATCTCCGCT...,GTGTGGGAGGACTGGTGGCTGTCTTTGCAGGCAGGCATTTGCTTAG...,3,6
74,CCDS86415.1|Hs109|chr14,ENST00000556285,TGFB3,TTACTGCTTCCGGTGAGACTGGG,ATGAAGATGCACTTGCAAAGGGCTCTGGTGGTCCTGGCCCTGCTGA...,GGGGAGGCCGCCTGGTTTTCCTCCCTCCTTCTGCACGTCTGCTGGG...,3,12
