In [1]:
import os, tarfile
import pyBigWig
import pybedtools
import zipfile, gzip
from gtfparse import read_gtf
import pandas as pd
pd.set_option('display.max_columns', None)

INFO:numexpr.utils:Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
gtf_file_path = "/data/projects/Resources/Gencode_genome_annotation/gencode.v40.annotation.gtf"

In [3]:
def parse_attributes(attribute_string):
    # Split each attribute into key and value, then strip quotes and spaces
    return dict(item.strip().replace('"', '').split(' ') for item in attribute_string if item)

In [4]:
column_names = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attribute"
]

# Read the GTF file
gtf_df = pd.read_csv(gtf_file_path, sep="\t", comment='#', header=None, names=column_names)

In [5]:
# Step 1: Split the 'attribute' string into a list of strings for each key-value pair
attributes_list = gtf_df['attribute'].str.split(';')


# Apply the function to each row's attribute list
attributes_dicts = attributes_list.apply(parse_attributes)

# Step 3: Convert the list of dictionaries into a DataFrame
attributes_df = pd.DataFrame(list(attributes_dicts))

# Step 4: Combine the new attributes DataFrame with the original gtf_df
# This step assumes that the indexes are aligned and can be directly concatenated
combined_df = pd.concat([gtf_df, attributes_df], axis=1)

# Optionally, you can drop the original 'attribute' column if it's no longer needed
combined_df = combined_df.drop('attribute', axis=1)

In [7]:
combined_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,hgnc_id,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11869,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2;,,,,,,,,,,,
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1;,,,,,
2,chr1,HAVANA,exon,11869,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1;,1,ENSE00002234944.1,,,
3,chr1,HAVANA,exon,12613,12721,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1;,2,ENSE00003582793.1,,,
4,chr1,HAVANA,exon,13221,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1;,3,ENSE00002312635.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3283857,chrM,ENSEMBL,transcript,15888,15953,.,+,.,ENSG00000210195.2,Mt_tRNA,MT-TT,3,HGNC:7499,,ENST00000387460.2,Mt_tRNA,MT-TT-201,,Ensembl_canonical;,,,,,,
3283858,chrM,ENSEMBL,exon,15888,15953,.,+,.,ENSG00000210195.2,Mt_tRNA,MT-TT,3,HGNC:7499,,ENST00000387460.2,Mt_tRNA,MT-TT-201,,Ensembl_canonical;,,1,ENSE00001544475.2,,,
3283859,chrM,ENSEMBL,gene,15956,16023,.,-,.,ENSG00000210196.2,Mt_tRNA,MT-TP,3,HGNC:7494;,,,,,,,,,,,,
3283860,chrM,ENSEMBL,transcript,15956,16023,.,-,.,ENSG00000210196.2,Mt_tRNA,MT-TP,3,HGNC:7494,,ENST00000387461.2,Mt_tRNA,MT-TP-201,,Ensembl_canonical;,,,,,,


In [16]:
combined_df.groupby("gene_type").size()

gene_type
IG_C_gene                                 281
IG_C_pseudogene                            33
IG_D_gene                                 152
IG_J_gene                                  76
IG_J_pseudogene                             9
IG_V_gene                                1134
IG_V_pseudogene                           661
IG_pseudogene                               3
Mt_rRNA                                     6
Mt_tRNA                                    66
TEC                                      3221
TR_C_gene                                  70
TR_D_gene                                  16
TR_J_gene                                 316
TR_J_pseudogene                            12
TR_V_gene                                 823
TR_V_pseudogene                           123
lncRNA                                 258393
miRNA                                    5637
misc_RNA                                 6636
polymorphic_pseudogene                   2011
processed_pseudogene    

In [9]:
combined_df.to_csv("/data/projects/Resources/Gencode_genome_annotation/gencode_annotation_V40.tsv", sep= "\t", index=False)

In [12]:
# Filter rows where 'feature' is 'transcript' and 'seqname' is not 'chrM'
gtf_transcript = combined_df[(combined_df["feature"] == "transcript") & (combined_df["seqname"] != "chrM")]

# Display the resulting DataFrame
gtf_transcript

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,hgnc_id,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1;,,,,,
5,chr1,HAVANA,transcript,12010,13670,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000450305.2,transcribed_unprocessed_pseudogene,DDX11L1-201,,Ensembl_canonical,OTTHUMT00000002844.2;,,,PGO:0000019,,
13,chr1,HAVANA,transcript,14404,29570,.,-,.,ENSG00000227232.5,unprocessed_pseudogene,WASH7P,2,HGNC:38034,OTTHUMG00000000958.1,ENST00000488147.1,unprocessed_pseudogene,WASH7P-201,,Ensembl_canonical,OTTHUMT00000002839.1;,,,PGO:0000005,,
26,chr1,ENSEMBL,transcript,17369,17436,.,-,.,ENSG00000278267.1,miRNA,MIR6859-1,3,HGNC:50039,,ENST00000619216.1,miRNA,MIR6859-1-201,,Ensembl_canonical;,,,,,,
29,chr1,HAVANA,transcript,29554,31097,.,+,.,ENSG00000243485.5,lncRNA,MIR1302-2HG,2,HGNC:52482,OTTHUMG00000000959.2,ENST00000473358.1,lncRNA,MIR1302-2HG-202,5,Ensembl_canonical,OTTHUMT00000002840.1;,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3283698,chrY,HAVANA,transcript,57209306,57210051,.,+,.,ENSG00000182484.15_PAR_Y,protein_coding,WASH6P,2,HGNC:31685,OTTHUMG00000022677.5,ENST00000483079.6_PAR_Y,retained_intron,WASH6P-210,1,PAR,OTTHUMT00000058833.1;,,,,,
3283701,chrY,HAVANA,transcript,57209887,57212186,.,+,.,ENSG00000182484.15_PAR_Y,protein_coding,WASH6P,2,HGNC:31685,OTTHUMG00000022677.5,ENST00000496301.6_PAR_Y,retained_intron,WASH6P-215,2,PAR,OTTHUMT00000058827.1;,,,,,
3283704,chrY,HAVANA,transcript,57210344,57212074,.,+,.,ENSG00000182484.15_PAR_Y,protein_coding,WASH6P,2,HGNC:31685,OTTHUMG00000022677.5,ENST00000483286.6_PAR_Y,retained_intron,WASH6P-211,1,PAR,OTTHUMT00000058834.1;,,,,,
3283708,chrY,HAVANA,transcript,57210591,57212074,.,+,.,ENSG00000182484.15_PAR_Y,protein_coding,WASH6P,2,HGNC:31685,OTTHUMG00000022677.5,ENST00000464205.6_PAR_Y,processed_transcript,WASH6P-205,2,PAR,OTTHUMT00000058835.1;,,,,,


In [20]:
def extract_tss_regions_from_df(tss_df):
    """
    Extracts TSS (+50 and -50 bp) regions from the GTF DataFrame.
    :param df: A pandas DataFrame containing GTF data.
    :return: A DataFrame with TSS regions.
    """

    # Calculate TSS based on strand
    tss_df['TSS'] = tss_df.apply(lambda x: x['start'] if x['strand'] == '+' else x['end'], axis=1)

    # Adjust TSS based on strand for +50 and -50 bp
    tss_df['start_adj'] = tss_df.apply(lambda x: x['TSS'] - 50 if x['strand'] == '+' else x['TSS'], axis=1)
    tss_df['end_adj'] = tss_df.apply(lambda x: x['TSS'] + 50 if x['strand'] == '+' else x['TSS'] + 100, axis=1)

    # Select and rename relevant columns
    tss_regions = tss_df[['seqname', 'start_adj', 'end_adj', 'strand', 'gene_id', 'gene_type', 'gene_name', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name']]
    tss_regions.columns = ['Chromosome', 'Start', 'End', 'Strand', 'GeneID', 'gene_type', 'gene_name', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name']

    return tss_regions

In [21]:
tss_regions_df = extract_tss_regions_from_df(combined_df)

In [22]:
tss_regions_df

Unnamed: 0,Chromosome,Start,End,Strand,GeneID,gene_type,gene_name,hgnc_id,havana_gene,transcript_id,transcript_type,transcript_name
0,chr1,11819,11919,+,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,HGNC:37102,OTTHUMG00000000961.2;,,,
1,chr1,11819,11919,+,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202
2,chr1,11819,11919,+,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202
3,chr1,12563,12663,+,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202
4,chr1,13171,13271,+,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202
...,...,...,...,...,...,...,...,...,...,...,...,...
3283857,chrM,15838,15938,+,ENSG00000210195.2,Mt_tRNA,MT-TT,HGNC:7499,,ENST00000387460.2,Mt_tRNA,MT-TT-201
3283858,chrM,15838,15938,+,ENSG00000210195.2,Mt_tRNA,MT-TT,HGNC:7499,,ENST00000387460.2,Mt_tRNA,MT-TT-201
3283859,chrM,16023,16123,-,ENSG00000210196.2,Mt_tRNA,MT-TP,HGNC:7494;,,,,
3283860,chrM,16023,16123,-,ENSG00000210196.2,Mt_tRNA,MT-TP,HGNC:7494,,ENST00000387461.2,Mt_tRNA,MT-TP-201


In [23]:
# Define a custom aggregation function
def custom_agg(series):
    # If all values are the same, return any one of them; otherwise, return the list of unique values
    if series.nunique() == 1:
        return series.iloc[0]
    else:
        return list(series.unique())

In [None]:
# Group by the columns 'Chromosome', 'Start', 'End', 'Strand'
# and aggregate other columns
unique_tss_df = tss_regions_df.groupby(['Chromosome', 'Start', 'End', 'Strand']).agg(custom_agg).reset_index()

In [None]:
unique_tss_df

In [8]:
tss_regions_dfATAWAWR (W = A or T, R = A or G)



# TATAAAA
# TATAAAT
# TATATAA
# TATATAT


substrings = ['TATAAAA', 'TATAAAT', 'TATATAA', 'TATAAAA']
rev_com = ['TTTTATA', 'ATTTATA', 'TTATATA', 'TTTTATA']
#substrings =['TATA']

In [9]:
df_TATA = df_pos[df_pos['Sequence'].apply(lambda x: any(substring in x for substring in substrings))]

In [10]:
df_TATA

Unnamed: 0,Sequence
5,ACCTATAAAATCCAGGTAAATATCTGATACTGGCACACAGGTTGGA...
44,ATGATATAAAACAGGTCAGAACCCTCCTGCCTGTCTGCTCAGTTCA...
65,GATATAAAAGTGCTCCAACAAACCCGGAGCTGGCGTGCCGACCGCG...
86,TTTGTATAAAAGGTGAACTGAGATTTCATTCAGTCTACAGCTCTTG...
103,GCTTTTCTTCCCTCCCAAGAGACCAGCAAGGCTCACTATAAATAGC...
...,...
4002,CCATCTATAAATAGCTGGTCATCAGGGTTGATGAGAGGCTCGGAGG...
4090,CTTATAAAAGGGACCCCAGGGAGGTAGCCCCTTCTACCATGTGTGG...
4131,TTATAAAAGGGACCCCAGGGAGGTAGCCCCTTCTACCATGTGTGGA...
4139,TTTGCTAGCATATAAATCTCTCGGAGGCAGAAAACATGTCTGAAAT...


In [19]:
df_non_TATA = df_pos[~df_pos['Sequence'].apply(lambda x: any(substring in x for substring in substrings))]

In [20]:
df_non_TATA

Unnamed: 0,Sequence
0,GCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGTCAGCTGGAGG...
1,GGACGGAGCGGGGTGGAGGGCGGGGGCGGGGCCTAGTCCTGAGAGG...
2,CTCCACTTCCTGTCCCCGCGGCTCTGTGTCCCCTGCTAGCCGTAGG...
3,TGGCCCCGCCCCTCCGCGCTCAGGCCCCGCCCCCAGCTCCGAGGGC...
4,TCCCTCCCACCCCACTGCTCCCGCTCCATTGTCTGGGAATTGCAGC...
...,...
4319,ATGACAGTCCTGTCTCTTTAAGAAGAAAAAAAAAGGCTGCTGGGTA...
4320,GTGCGAGGAGTGGCGCATGCTCTGTGAGGCCGGCAGCTTCCCATTG...
4321,ATGACAGTCCTGTCTCTTTAAGAAGAAAAAAAAAGGCTGCTGGGTA...
4322,CGCGGCACAGGCGGCGGCGTCTCCAGGGGGAGCCAAGGTACGTAGG...


In [7]:
df_pos=pd.DataFrame()
df_pos['Sequence'] = combined_csv_data['sequence'].str.upper().apply(seq2kmer, args=(6,))

In [8]:
df_pos

Unnamed: 0,Sequence
0,GCTTTG CTTTGC TTTGCA TTGCAT TGCATG GCATGC CATG...
1,GGACGG GACGGA ACGGAG CGGAGC GGAGCG GAGCGG AGCG...
2,CTCCAC TCCACT CCACTT CACTTC ACTTCC CTTCCT TTCC...
3,TGGCCC GGCCCC GCCCCG CCCCGC CCCGCC CCGCCC CGCC...
4,TCCCTC CCCTCC CCTCCC CTCCCA TCCCAC CCCACC CCAC...
...,...
4320,GTGCGA TGCGAG GCGAGG CGAGGA GAGGAG AGGAGT GGAG...
4321,ATGACA TGACAG GACAGT ACAGTC CAGTCC AGTCCT GTCC...
4322,CGCGGC GCGGCA CGGCAC GGCACA GCACAG CACAGG ACAG...
4323,ATGACA TGACAG GACAGT ACAGTC CAGTCC AGTCCT GTCC...


In [9]:
df_pos['Label']=1

In [10]:
df_pos = df_pos.reset_index(drop=True)
df_pos

Unnamed: 0,Sequence,Label
0,GCTTTG CTTTGC TTTGCA TTGCAT TGCATG GCATGC CATG...,1
1,GGACGG GACGGA ACGGAG CGGAGC GGAGCG GAGCGG AGCG...,1
2,CTCCAC TCCACT CCACTT CACTTC ACTTCC CTTCCT TTCC...,1
3,TGGCCC GGCCCC GCCCCG CCCCGC CCCGCC CCGCCC CGCC...,1
4,TCCCTC CCCTCC CCTCCC CTCCCA TCCCAC CCCACC CCAC...,1
...,...,...
236970,GTGCGA TGCGAG GCGAGG CGAGGA GAGGAG AGGAGT GGAG...,1
236971,ATGACA TGACAG GACAGT ACAGTC CAGTCC AGTCCT GTCC...,1
236972,CGCGGC GCGGCA CGGCAC GGCACA GCACAG CACAGG ACAG...,1
236973,ATGACA TGACAG GACAGT ACAGTC CAGTCC AGTCCT GTCC...,1


In [12]:
df_pos.to_csv("/data/projects/DNABERT_data/Core_promoters/Core_promoter_regions/positive_set.tsv", sep="\t", index=False)