### Create sequence files associated with yeast experiments in GLM-Prior paper

### Please download the following:

1. yeast genome GTF file (R64-1-1): https://ftp.ensembl.org/pub/release-113/gtf/saccharomyces_cerevisiae/
2. yeast fasta file (R64-1-1): https://ftp.ensembl.org/pub/release-113/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz
3. TF motifs from CisBP:
   - Select "By Species", "Saacharomyces_cerevisiae", "TF info", "PWMs", then download species archive: https://cisbp.ccbr.utoronto.ca/bulk.php


In [104]:
import numpy as np
import pandas as pd
import roman
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import motifs
import csv
import re
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from scipy.stats import gmean


### Step 1: Process gene location information from GTF file

In [4]:
gtf_file_path = '../data/yeast/genome_files/Saccharomyces_cerevisiae.R64-1-1.UTR.gtf'

# Open the GTF file in read mode
with open(gtf_file_path, 'r') as gtf_file:
    # Skip the first 5 lines
    for _ in range(5):
        gtf_file.readline()

    # Initialize an empty list to store the data
    data = []
    
    # Read and process the remaining lines
    for line in gtf_file:
        # Split the line into columns
        columns = line.strip().split('\t')
        chromosome = columns[0]
        start = int(columns[3])
        stop = int(columns[4])
        category = columns[2]

        match = re.search(r'gene_id "([^"]+)"', columns[8])
        gene_id = match.group(1) if match else None

        # Append the data to the list
        data.append([chromosome, start, stop, gene_id, category])

# Create a DataFrame from the collected data
gtf_df = pd.DataFrame(data, columns=['chromosome', 'start', 'stop', 'gene_id', 'category'])

### Step 2: Parse fasta file for DNA sequences corresponding to genes in the GTF file

In [5]:
# Specify the path to your FASTA file containing genomic DNA
fasta_file_path = '../data/yeast/genome_files/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa'

# Create a dictionary to store chromosome sequences
chromosome_sequences = {}

# Iterate over the FASTA file and populate the dictionary
with open(fasta_file_path, 'r') as fasta_file:
    for record in SeqIO.parse(fasta_file, 'fasta'):
        # Keep the chromosome name as is
        chromosome_id = record.id

        chromosome_sequences[chromosome_id] = str(record.seq)

# Print available chromosome IDs from the genomic data FASTA file
print("Chromosome IDs in FASTA file:", list(chromosome_sequences.keys()))

# Add a new column 'sequence' to the existing DataFrame
gtf_df['sequence'] = ''

# Iterate over the DataFrame rows
for index, row in gtf_df.iterrows():
    # Extract relevant information from the DataFrame
    chromosome = row['chromosome']
    start = row['start']
    stop = row['stop']

    # Use the dictionary to get the chromosome sequence
    chromosome_sequence = chromosome_sequences.get(chromosome)

    if chromosome_sequence:
        # Extract the DNA sequence based on start and stop positions
        gene_sequence = chromosome_sequence[start - 1:stop]

        # Update the 'sequence' column in the DataFrame
        gtf_df.at[index, 'sequence'] = gene_sequence


Chromosome IDs in FASTA file: ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'Mito']


In [6]:
gene_info = gtf_df[gtf_df["category"] == "gene"]
gene_info

Unnamed: 0,chromosome,start,stop,gene_id,category,sequence
0,IV,1802,2953,YDL248W,gene,ATGAAAGAGAATGAAGTCAAAGATGAGAAAAGCGTAGATGTGTTAT...
6,IV,5985,7814,YDL247W,gene,ATGAAAAACTTATCTTTTCTCATAAACAGAAGAAAGGAAAATACAA...
12,IV,8683,9756,YDL246C,gene,TCATTCAGGACCAAAGATAATAGTCTTGACTACCTCTCCACCATGG...
18,IV,11657,13360,YDL245C,gene,TCAATTAAAACTCTTTGGGAACTTCAAAACTTCTTTCCAGCTTTTC...
24,IV,16204,17226,YDL244W,gene,ATGTCTACAGACAAGATCACATTTTTGTTGAACTGGCAACCAACCC...
...,...,...,...,...,...,...
36142,Mito,61022,61729,Q0160,gene,ATGAAAAATATTAAAAAAAATCAAGTAATAAATTTAGGACCTAATT...
36148,Mito,73758,74513,Q0250,gene,ATGTTAGATTTATTAAGATTACAATTAACAACATTCATTATGAATG...
36154,Mito,74495,75984,Q0255,gene,ATGATTAAATGAACAATAATTAATATTTACTTATTATTAATATTTT...
36164,Mito,79213,80022,Q0275,gene,ATGACACATTTAGAAAGAAGTAGACATCAACAACATCCATTTCATA...


In [None]:
gene_info.to_csv("../data/yeast/gene_DNA_sequences.tsv", sep = '\t')

### Step 3: Create a MEME file using the inferelator_prior code:
python -m inferelator_prior.pwm_to_meme --motif pwms_all_motifs/* --info TF_Information_all_motifs_plus.txt --out Scer.cisbp.2024.meme

### Step 4: Summarize the motifs from the MEME file
python -m inferelator_prior.motif_information --motif Scer.cisbp.2024.meme --out Scer.cisbp.2024.info.tsv

In [15]:
TF_info_scores = pd.read_csv("../data/yeast/genome_files/Scer.cisbp.2024.info.tsv", sep = '\t')
TF_info_scores

Unnamed: 0,Motif_ID,Motif_Name,Information_Content,Shannon_Entropy,Length,Consensus
0,M00001_2.00,ABF1,15.210,1.874,8,TTATCACT
1,M00002_2.00,AFT2,7.829,13.994,10,NGGGTGTNNN
2,M00003_2.00,MBP1,10.704,6.334,8,GACGCGTA
3,M00004_2.00,SWI4,11.038,6.441,8,GACGCGAA
4,M00005_2.00,XBP1,11.143,5.838,8,TCTCGAAG
...,...,...,...,...,...,...
638,M10990_2.00,MCM1,16.520,17.788,16,TTTCCYAAWCNGGTAA
639,M11215_2.00,RAP1,18.891,10.770,14,GRTGTATGGGTKTT
640,M11453_2.00,CAR1,17.165,19.428,17,GCYTTCGGCGGCTANTN
641,M11487_2.00,GAL4,22.228,27.379,23,GYTCGGASGACWGTSCTCCGATG


### Step 5: Process TF DNA sequences

In [16]:
TF_info_plus = pd.read_csv("../data/yeast/genome_files/Saccharomyces_cerevisiae_2024_01_09_11_11_am/TF_Information_all_motifs_plus.txt", sep = '\t')
TF_info_plus

Unnamed: 0,TF_ID,Family_ID,TSource_ID,Motif_ID,MSource_ID,DBID,TF_Name,TF_Species,TF_Status,Family_Name,...,MSource_Year,PMID,MSource_Version,SR_Model,SR_NoThreshold,TfSource_Name,TfSource_URL,TfSource_Year,TfSource_Month,TfSource_Day
0,T000084_2.00,F002_2.00,TS12_2.00,M00001_2.00,MS01_2.00,YKL112W,ABF1,Saccharomyces_cerevisiae,D,ABF1,...,2008,19111667,,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
1,T000084_2.00,F002_2.00,TS12_2.00,M00902_2.00,MS22_2.00,YKL112W,ABF1,Saccharomyces_cerevisiae,D,ABF1,...,2011,22189060,,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
2,T000084_2.00,F002_2.00,TS12_2.00,M07432_2.00,MS31_2.00,YKL112W,ABF1,Saccharomyces_cerevisiae,D,ABF1,...,2014,24194598,August 2014,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
3,T000084_2.00,F002_2.00,TS12_2.00,M08469_2.00,MS13_2.00,YKL112W,ABF1,Saccharomyces_cerevisiae,D,ABF1,...,2011,22102575,June 2011,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
4,T000084_2.00,F002_2.00,TS12_2.00,M09695_2.00,MS59_2.00,YKL112W,ABF1,Saccharomyces_cerevisiae,D,ABF1,...,2006,16381825,7.0,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1702,T388382_2.00,F324_2.00,TS12_2.00,M01326_2.00,MS57_2.00,YPR196W,YPR196W,Saccharomyces_cerevisiae,I,Zinc cluster,...,2019,,,SimilarityRegression,False,Ensembl,http://www.ensembl.org/,2018,Dec,8
1703,T388382_2.00,F324_2.00,TS12_2.00,M01601_2.00,MS63_2.00,YPR196W,YPR196W,Saccharomyces_cerevisiae,D,Zinc cluster,...,2009,19158363,,SimilarityRegression,False,Ensembl,http://www.ensembl.org/,2018,Dec,8
1704,T388382_2.00,F324_2.00,TS12_2.00,M07537_2.00,MS31_2.00,YPR196W,YPR196W,Saccharomyces_cerevisiae,D,Zinc cluster,...,2014,24194598,August 2014,SimilarityRegression,False,Ensembl,http://www.ensembl.org/,2018,Dec,8
1705,T388382_2.00,F324_2.00,TS12_2.00,M08700_2.00,MS13_2.00,YPR196W,YPR196W,Saccharomyces_cerevisiae,D,Zinc cluster,...,2011,22102575,June 2011,SimilarityRegression,False,Ensembl,http://www.ensembl.org/,2018,Dec,8


In [17]:
unique_entries = TF_info_plus.drop_duplicates(subset='TF_Name')
my_dict = dict(zip(unique_entries['TF_Name'], unique_entries['DBID']))

In [19]:
TF_info_scores['DBID'] = TF_info_scores['Motif_Name'].map(my_dict)
TF_info_scores

Unnamed: 0,Motif_ID,Motif_Name,Information_Content,Shannon_Entropy,Length,Consensus,DBID
0,M00001_2.00,ABF1,15.210,1.874,8,TTATCACT,YKL112W
1,M00002_2.00,AFT2,7.829,13.994,10,NGGGTGTNNN,YPL202C
2,M00003_2.00,MBP1,10.704,6.334,8,GACGCGTA,YDL056W
3,M00004_2.00,SWI4,11.038,6.441,8,GACGCGAA,YER111C
4,M00005_2.00,XBP1,11.143,5.838,8,TCTCGAAG,YIL101C
...,...,...,...,...,...,...,...
638,M10990_2.00,MCM1,16.520,17.788,16,TTTCCYAAWCNGGTAA,YMR043W
639,M11215_2.00,RAP1,18.891,10.770,14,GRTGTATGGGTKTT,YNL216W
640,M11453_2.00,CAR1,17.165,19.428,17,GCYTTCGGCGGCTANTN,YPL111W
641,M11487_2.00,GAL4,22.228,27.379,23,GYTCGGASGACWGTSCTCCGATG,YPL248C


In [22]:
TF_info_scores.to_csv("../data/yeast/genome_files/TF_info_scores_with_DBID.tsv", sep = '\t', index=None)

### Additional steps to aggregate proxy sequences for TFs missing from YEASTRACT are below. 
For TFs without CisBP sequences in yeastract dataset, we use the promoter regions of the target genes from the yeastract interaction matrix. Promoter regions (as defined by the yeastract database) are 1000bp regions upstream or downstream of a genes TSS (depending on strand orientation)

In [20]:
yeastract = pd.read_csv("../data/yeast/YEASTRACT_20190713_BOTH.tsv", sep = '\t', index_col=0)
yeastract

Unnamed: 0,YBR112C,YBR150C,YBR182C,YBR239C,YBR240C,YBR297W,YCL058C,YCL055W,YCR018C,YCR065W,...,YBL005W,YBR033W,YBR049C,YBR066C,YBR081C,YBR083W,YJL089W,YJL056C,YMR035W,MAL63
YAL068C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YAL067C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YAL066W,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YAL065C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YAL064W-B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YPR145C-A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YPR170W-B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tX(XXX)L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RDN5-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
TFs = pd.read_csv("../data/yeast/genome_files/TF_info_scores_with_DBID.tsv", sep = '\t')
TFs

Unnamed: 0,Motif_ID,Motif_Name,Information_Content,Shannon_Entropy,Length,Consensus,DBID
0,M00001_2.00,ABF1,15.210,1.874,8,TTATCACT,YKL112W
1,M00002_2.00,AFT2,7.829,13.994,10,NGGGTGTNNN,YPL202C
2,M00003_2.00,MBP1,10.704,6.334,8,GACGCGTA,YDL056W
3,M00004_2.00,SWI4,11.038,6.441,8,GACGCGAA,YER111C
4,M00005_2.00,XBP1,11.143,5.838,8,TCTCGAAG,YIL101C
...,...,...,...,...,...,...,...
638,M10990_2.00,MCM1,16.520,17.788,16,TTTCCYAAWCNGGTAA,YMR043W
639,M11215_2.00,RAP1,18.891,10.770,14,GRTGTATGGGTKTT,YNL216W
640,M11453_2.00,CAR1,17.165,19.428,17,GCYTTCGGCGGCTANTN,YPL111W
641,M11487_2.00,GAL4,22.228,27.379,23,GYTCGGASGACWGTSCTCCGATG,YPL248C


In [24]:
print(f"Number of TFs in YEASTRACT that have sequences from CisBP: {len(set(yeastract.columns) & set(TFs.DBID))}")

Number of TFs in YEASTRACT that have sequences from CisBP: 174


In [25]:
print(f"Number of TFs that are in CisBP Motif database: {len(TFs.DBID.unique())}")

Number of TFs that are in CisBP Motif database: 212


In [26]:
yeastract_tfs = yeastract.columns.tolist()
tfs_with_sequences = TFs.DBID.unique().tolist()

missing_tfs = list(set(yeastract_tfs) - set(tfs_with_sequences))
print(f"Number of TFs in yeastract: {len(yeastract_tfs)}, number of TFs in CisBP: {len(tfs_with_sequences)}, number of TFs in yeastract without sequences {len(missing_tfs)}")


Number of TFs in yeastract: 220, number of TFs in CisBP: 212, number of TFs in yeastract without sequences 46


In [29]:
### Create a dataframe of promoters for all genes in the gtf file:

# File paths
fasta_file_path = '../data/yeast/genome_files/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa'
gtf_file_path = '../data/yeast/genome_files/Saccharomyces_cerevisiae.R64-1-1.UTR.gtf'

# Load genome sequence as a dictionary keyed by chromosome name
genome = SeqIO.to_dict(SeqIO.parse(fasta_file_path, "fasta"))

# Initialize list to store gene and promoter data
promoter_data = []

# Open the GTF file and parse relevant information
with open(gtf_file_path, 'r') as gtf_file:
    # Skip the first 5 lines
    for _ in range(5):
        gtf_file.readline()
    
    # Process each remaining line in the GTF file
    for line in gtf_file:
        columns = line.strip().split("\t")
        chromosome = columns[0]
        start = int(columns[3])
        stop = int(columns[4])
        strand = columns[6]
        category = columns[2]
        
        # Extract gene_id instead of gene_name
        match = re.search(r'gene_id "([^"]+)"', columns[8])
        gene_id = match.group(1) if match else None
        
        # Proceed only if the category is 'gene'
        if category == "gene" and gene_id:
            # Calculate promoter region based on the strand
            if strand == '+':
                promoter_start = max(0, start - 1000)
                promoter_end = start
                promoter_seq = genome[chromosome].seq[promoter_start:promoter_end]
            else:
                promoter_start = stop
                promoter_end = min(len(genome[chromosome].seq), stop + 1000)
                promoter_seq = genome[chromosome].seq[promoter_start:promoter_end].reverse_complement()

            # Append to promoter data list
            promoter_data.append({
                "gene_id": gene_id,
                "chromosome": chromosome,
                "strand": strand,
                "promoter_sequence": str(promoter_seq)
            })

# Convert to DataFrame
promoter_df = pd.DataFrame(promoter_data)

# Save to CSV if needed
# promoter_df.to_csv("yeast_promoter_sequences.csv", index=False)


In [30]:
promoter_df

Unnamed: 0,gene_id,chromosome,strand,promoter_sequence
0,YDL248W,IV,+,CAATTTATATACACTTATGCCAATATTACAAAAAAATCACCACTAA...
1,YDL247W,IV,+,ACCGCCCAATTCCAAGATCTTTAATATACTATAACTTGAAAGTCAT...
2,YDL246C,IV,-,GGTCTTACGACAAAGTTCAAAGCTTGATGAATTGTCTATTACTTTA...
3,YDL245C,IV,-,CCATTTTTTCAGAATCCTCGATAACATAATCGATCAAAACAGTAAT...
4,YDL244W,IV,+,TTCTAAGTAACCAATTTCAAAACTGCATCTATCTTTGGATAAACTG...
...,...,...,...,...
6032,Q0160,Mito,+,CAGCATATTAAGTAATCCTATTATTAGGTAATCGTTTAGATATTAA...
6033,Q0250,Mito,+,CTCCTAGCAGGATTCACATCTCCTTCGGCCGGACTCCTTCGGGGTC...
6034,Q0255,Mito,+,GGAGGACCGAAGGAGTTTTAGTATTTTTTTTTTTTTAATAAAATAT...
6035,Q0275,Mito,+,AATTATTAAATAAATATATAATATATTATATATAATTTATAATATA...


In [31]:
# filter gene promoters by a list of genes to get corresponding promoter seqs

gene_ids = 'YAL064W, YAL063C, YBR139W, YCR065W, YDL049C, YDL047W, YDL037C, YDR055W, YDR082W, YDR345C, YDR508C, YEL007W, YGL184C, YGL096W, YGR108W, YGR248W, YGR295C, YHL040C, YHR007C, YHR094C, YIL169C, YIL168W, YIL119C, YIL013C, YIL011W, YIR019C, YJL196C, YJL160C, YJL095W, YLL052C, YLR110C, YLR312W-A, YML055W, YMR006C, YMR136W, YMR145C, YMR199W, YMR297W, YNL052W, YOL156W, YOR028C, YOR032C, YOR273C, YPL177C, YPL061W, YPL026C, YPL016W, YPR013C, YPR055W, YPR065W, YPR101W, YAL040C, YAR050W, YBL029C-A, YBL015W, YJL060W, YJL048C, STA1, YDR524W-C'
gene_id_list = [gene.strip() for gene in gene_ids.split(',')]

# Filter the DataFrame
filtered_df = promoter_df[promoter_df['gene_id'].isin(gene_id_list)]

# Get the promoter sequences as a list
promoter_sequences_list = filtered_df['promoter_sequence'].tolist()

# Convert the list to a comma-separated string if needed
promoter_sequences_string = ', '.join(promoter_sequences_list)

# Output the result
print(promoter_sequences_string)


GTACCCGCCCAAGACCAAGAAAGAAGATAGAGGTGTTGTGCCGGGGAAGTGCAAGCATTGTGGACTGCAATTTCCGAACGTTGACGTCTGGCTAAATAAGCACGTGGGGAAAGGATGTGGCTACTCATATCACTAAAACCAAAGTCATTCACCGACTCATACATACACACACATACATACATTTATTTATTTATGCCCTAATAATACACATCATTTCTGCGTAAATTATTGTTACGCTGCCCTACGTTATTAATCTAGATTCTTGTTTTCTCTCTGTGTCTCGTTTCTCTTTTTAGGCATGAAGCGAAAAAATCAACAACCGCTATTGACAGGAAGCAAGCTGGCATGAGCCGTTTTTTTTACTTTTTTTGATTTTACCAGAAGGGCGTTGAGACCTGCATCAGCCGTTTGCGGATAGGTTATGCCAGTGATGAAGCACCTGATTCAACTGCCGGCTCTCCGATTGGTCTCCGATCGTTTAGTTCCTTATATTGTACACTATTTCAGCTGTACAAGCCTAGGCCCGAATCGAATTATAAGTCCGTCCGGCTCAAGTCATGGACTCATTATCGATTCGTCTTTTTTTAGCTGCCCCCACGTTATCTCTTCTTTTTTTTTTCTCTGCATCGCGGCGAGTCCCTTGAAAGGGTCGCGAACCTTCGAAGCAGCACTTCTTTTTTTCACATGAGAATGTAGTCCATTGCGTAAATCCAGTCTTTGCGGCTTTCCCTTCTTCCTCATCATATTGTTGACCAACTCCAACAGAAAAGCACATTTCCTCTTCACCAATTTTCTTCTTTCTTTTTTGAAAGTACGAACATGTAAGGGACATTGGACATTATGTGCTCGTTCATGTATAATATTACATGAGGTAGTTAGTTACGTAAATTTATAAAAACGTATCGATGAAAGTTGCACAGAAGTAAAAAGTAAAGTATACAAGTATTACGTCGACTTCTTAGGTAGGTAACAATAATAAAAAATCAATTACGTAAAACGT

In [32]:
def get_tf_promoter_info_as_dataframe(tf_name, gene_ids):
    # Split the input gene IDs into a list
    gene_id_list = [gene.strip() for gene in gene_ids.split(',')]
    
    # Filter the DataFrame for the specified gene IDs
    filtered_df = promoter_df[promoter_df['gene_id'].isin(gene_id_list)]
    
    # Prepare a new DataFrame for the results
    result_df = pd.DataFrame({
        'TF': tf_name,
        'Gene_ID': filtered_df['gene_id'],
        'Promoter_Sequence': filtered_df['promoter_sequence']
    })
    
    return result_df


In [33]:
tf_name = "FLO8"
gene_ids = "YAL064W, YAL063C, YBR139W, YCR065W, YDL049C, YDL047W, YDL037C, YDR055W, YDR082W, YDR345C, YDR508C, YEL007W, YGL184C, YGL096W, YGR108W, YGR248W, YGR295C, YHL040C, YHR007C, YHR094C, YIL169C, YIL168W, YIL119C, YIL013C, YIL011W, YIR019C, YJL196C, YJL160C, YJL095W, YLL052C, YLR110C, YLR312W-A, YML055W, YMR006C, YMR136W, YMR145C, YMR199W, YMR297W, YNL052W, YOL156W, YOR028C, YOR032C, YOR273C, YPL177C, YPL061W, YPL026C, YPL016W, YPR013C, YPR055W, YPR065W, YPR101W, YAL040C, YAR050W, YBL029C-A, YBL015W, YJL060W, YJL048C, STA1, YDR524W-C"


In [34]:
FLO8 = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [35]:
tf_name = "YBR297W"
gene_ids = "YER190W, YFL066C, YHL050C, YIL026C, YKL113C, YKR006C, YKR007W, YLR411W, YMR279C, YOR075W, YAL062W"


In [36]:
YBR297W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)


In [37]:
tf_name = "YLR223C"
gene_ids = "YBR189W, YBR191W, YCR031C, YDL191W, YDL184C, YDL130W, YDL083C, YDL082W, YDL081C, YDL075W, YDL061C, YDR012W, YDR064W, YDR382W, YDR418W, YDR448W, YDR450W, YDR471W, YDR500C, YEL054C, YER056C-A, YER074W, YER091C, YER131W, YFL034C-A, YFR031C-A, YFR032C, YFR032C-A, YGL189C, YGL147C, YGL123W, YGL103W, YGL076C, YGL031C, YGL030W, YGR027C, YGR085C, YGR090W, YGR118W, YGR148C, YGR214W, YHL033C, YHL015W, YHL001W, YHR010W, YHR021C, YHR204W, YIL148W, YIL133C, YIL052C, YIL018W, YJL191W, YJL190C, YJL189W, YJL177W, YJL136C, YJR122W, YJR123W, YJR145C, YKL180W, YKL156W, YKL006W, YKR057W, YLR029C, YLR048W, YLR061W, YLR075W, YLR167W, YLR185W, YLR287C-A, YLR325C, YLR333C, YLR340W, YLR344W, YLR367W, YLR388W, YLR406C, YLR441C, YLR448W, YML073C, YML063W, YML026C, YML024W, YMR116C, YMR142C, YMR143W, YMR194W, YMR230W, YMR242C, YNL302C, YNL301C, YNL178W, YNL162W, YNL096C, YNL069C, YNL067W, YOL127W, YOL120C, YOL040C, YOL039W, YOR063W, YOR096W, YOR167C, YOR182C, YOR234C, YOR293W, YOR313C, YOR369C, YPL220W, YPL199C, YPL198W, YPL143W, YPL132W, YPL131W, YPL090C, YPL081W, YPL079W, YPR043W, YPR102C, YPR103W, YPR132W, YBL092W, YBL087C, YBL072C, YBL027W, YBR048W"


In [38]:
YLR223C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)


In [39]:
tf_name = "YIR033W"
gene_ids = "YDL052C, YDR492W, YGL225W, YGL055W, YGR175C, YJL196C, YKL008C, YLR372W, YMR246W, YOL101C, YOR377W, YPL057C"


In [40]:
YCR084C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [41]:
tf_name = "YCR084C"
gene_ids = "YAL063C, YBR118W, YBR181C, YBR182C, YCL025C, YDL228C, YDL185W, YDL037C, YDR041W, YDR043C, YDR072C, YDR134C, YDR215C, YDR258C, YDR259C, YDR342C, YDR343C, YDR441C, YDR449C, YDR525W, YEL060C, YEL045C, YER055C, YER056C, YER145C, YER158C, YFL051C, YFL026W, YFL022C, YFL015C, YFR032C, YGL123W, YGL103W, YGL097W, YGL030W, YGR033C, YGR085C, YGR139W, YGR180C, YGR213C, YGR248W, YHL033C, YHL029C, YHL016C, YHL001W, YHR010W, YHR032W, YHR095W, YHR203C, YHR210C, YIL172C, YIL162W, YIL123W, YIL117C, YIL055C, YIR013C, YJL225C, YJR146W, YJR157W, YJR158W, YKL097C, YKL060C, YKR075C, YKR096W, YKR101W, YLL045C, YLR044C, YLR076C, YLR236C, YLR264W, YLR279W, YLR296W, YLR299W, YLR302C, YLR304C, YLR402W, YLR441C, YLR461W, YML122C, YMR007W, YMR011W, YMR016C, YMR057C, YMR142C, YMR242C, YMR319C, YNL069C, YNL067W, YNR059W, YNR071C, YOL158C, YOL157C, YOL156W, YOL040C, YOR027W, YOR050C, YOR063W, YOR344C, YOR348C, YPL231W, YPL190C, YPL177C, YPR013C, YPR015C, YPR064W, YAL039C, YAR028W, YAR047C, YAR060C, YBL072C, YBL030C, YBR067C, YJL079C, YIL170W"


In [42]:
YCR084C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [43]:
tf_name = "YJL206C"
gene_ids = "YBR065C"

In [44]:
YJL206C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [45]:
tf_name = "YOR363C"
gene_ids = "YBR127C, YDL092W, YDL078C, YDR041W, YDR072C, YDR096W, YDR139C, YDR234W, YDR247W, YDR256C, YDR297W, YEL060C, YEL011W, YER015W, YER131W, YGL205W, YGL056C, YGR035C, YGR036C, YGR044C, YGR180C, YHL027W, YHR007C, YHR126C, YHR142W, YHR143W, YIL160C, YIL100W, YIL069C, YJL159W, YJL158C, YJL148W, YKR009C, YKR093W, YLR153C, YLR284C, YML056C, YMR037C, YMR193C-A, YNL184C, YNL010W, YNL009W, YOL147C, YOL146W, YOL126C, YOL002C, YOR100C, YOR273C, YOR287C, YOR317W, YOR363C, YOR383C, YPL221W, YPL095C, YPR128C, YAL016W, YBR019C, YBR067C"


In [46]:
YOR363C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [47]:
tf_name = "YMR172W"
gene_ids = "YDL022W, YDR074W, YDR534C, YDR536W, YER062C, YFL014W, YGR043C, YGR052W, YGR066C, YGR088W, YHR087W, YHR139C, YPL223C"


In [48]:
YMR172W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [49]:
tf_name = "YOR372C"
gene_ids = "YDR146C, YPR119W"

In [50]:
YOR372C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [51]:
tf_name = "YGL252C"
gene_ids = "YNR001C"

In [52]:
YGL252C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [53]:
tf_name = "YML051W"
gene_ids = "YBR019C, YBR020W"

In [54]:
YML051W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [55]:
tf_name = "YNL229C"
gene_ids = "YER040W"

In [56]:
YER040W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [57]:
tf_name = "YIL119C"
gene_ids = "YIR019C"

In [58]:
YIL119C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [59]:
tf_name = "YJL210W"
gene_ids = "YFL007W, YGR180C, YIL066C, YJL210W, YJL026W, YLR345W, YOR280C"

In [60]:
YJL210W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [61]:
tf_name = "YMR023C"
gene_ids = "YIR019C, YAR050W"

In [62]:
YMR023C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [63]:
tf_name = "YNL239W"
gene_ids = "YBR093C, YBR298C, YDL246C, YEL069C, YEL021W, YFL011W, YGL157W, YGL130W, YGL035C, YGR022C, YGR249W, YGR292W, YIL057C, YJR158W, YKL216W, YKL031W, YLR035C, YLR286C, YML051W, YMR176W, YMR199W, YMR305C, YNL239W, YNR072W, YOL143C, YOL126C, YOL058W, YOR378W, YPL248C, YJL056C"

In [64]:
YNL239W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [65]:
tf_name = "YKR077W"
gene_ids = "YDL222C, YFL014W, YIR019C"

In [66]:
YKR077W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [67]:
tf_name = "YEL007W"
gene_ids = "YIR019C"

In [68]:
YEL007W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [69]:
tf_name = "YOR066W"
gene_ids = "YIR019C, YKL095W"

In [70]:
YOR066W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [71]:
tf_name = "YJL124C"
gene_ids = "YML027W"

In [72]:
YJL124C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [73]:
tf_name = "YDR174W"
gene_ids = "YKL112W"

In [74]:
YDR174W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [75]:
tf_name = "YBR112C"
gene_ids = "YBR112C, YBR160W, YBR233W, YDR180W, YHR081W, YJL115W, YKL161C, YLL002W, YLR183C, YMR239C, YPR186C, YBL032W"

In [76]:
YBR112C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [77]:
tf_name = "YDR009W"
gene_ids = "YBR109C, YBR290W, YCL043C, YCR077C, YDL147W, YDL115C, YDL065C, YDL059C, YDL006W, YDR009W, YDR059C, YDR092W, YDR152W, YDR159W, YDR177W, YDR328C, YDR329C, YDR335W, YDR484W, YDR518W, YEL012W, YER025W, YER095W, YGL207W, YGL123W, YGL106W, YGL049C, YGR002C, YGR133W, YGR162W, YGR173W, YGR188C, YGR239C, YHL007C, YHR023W, YHR034C, YHR156C, YHR158C, YHR160C, YIL038C, YIL005W, YJL194W, YJL128C, YJR066W, YJR086W, YJR125C, YKL203C, YKL186C, YKL079W, YKL062W, YKL022C, YKR020W, YKR071C, YLR025W, YLR095C, YLR119W, YLR193C, YLR212C, YLR229C, YLR332W, YML001W, YMR022W, YMR026C, YMR077C, YMR192W, YNL251C, YNL233W, YNL214W, YNL196C, YNL188W, YNL107W, YNL104C, YNL006W, YNR024W, YOL115W, YOL062C, YOL002C, YOR108W, YOR138C, YOR212W, YOR229W, YOR230W, YOR261C, YOR288C, YOR326W, YOR339C, YPL196W, YPL195W, YPL084W, YPL082C, YPR028W, YPR041W, YPR048W, YPR072W, YPR119W, YPR165W, YPR188C"

In [78]:
YDR009W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [79]:
tf_name = "YGR288W"
gene_ids = "YBR205W, YBR245C, YBR296C, YBR298C, YBR299W, YCL029C, YCR050C, YDL184C, YDL061C, YDR281C, YDR379W, YER029C, YER082C, YFL068W, YFR032C-A, YGL243W, YGL241W, YGL212W, YGL071W, YGR016W, YGR064W, YGR209C, YGR289C, YGR292W, YHR018C, YHR136C, YIL148W, YJL115W, YJR085C, YJR148W, YKL052C, YKL002W, YKR099W, YLL014W, YLR136C, YLR264W, YLR297W, YLR312W-A, YLR319C, YML084W, YMR034C, YMR214W, YMR256C, YMR319C, YNL247W, YNL162W, YNL077W, YNL070W, YNR065C, YOL119C, YOL066C, YOL002C, YOR038C, YOR043W, YOR075W, YOR182C, YOR215C, YOR324C, YOR372C, YPL142C, YPL075W, YPL063W, YPL055C, YPL016W, YPR176C, YBR037C, YJL062W"

In [80]:
YGR288W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [81]:
tf_name = "YLR183C"
gene_ids = "YNL330C, YKR029C"

In [82]:
YLR183C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [83]:
tf_name = "YOR290C"
gene_ids = "YBR089C-A, YBR123C, YBR182C, YCL067C, YCL066W, YCR039C, YCR040W, YDL002C, YDR174W, YER148W, YGR047C, YGR140W, YKL032C, YMR072W, YMR094W, YMR168C, YNL216W, YOR110W, YPL089C, YPL007C, YPR052C, YPR065W, YAL001C, R0010W"

In [84]:
YOR290C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [85]:
tf_name = "YKR064W"
gene_ids = "YBR089C-A, YBR123C, YBR182C, YCL067C, YCL066W, YCR039C, YCR040W, YDL002C, YDR174W, YER148W, YGR047C, YGR140W, YKL032C, YMR072W, YMR094W, YMR168C, YNL216W, YOR110W, YPL089C, YPL007C, YPR052C, YPR065W, YAL001C, R0010W"

In [86]:
YKR064W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [87]:
tf_name = "YFL052W"
gene_ids = "YBR089C-A, YBR123C, YBR182C, YCL067C, YCL066W, YCR039C, YCR040W, YDL002C, YDR174W, YER148W, YGR047C, YGR140W, YKL032C, YMR072W, YMR094W, YMR168C, YNL216W, YOR110W, YPL089C, YPL007C, YPR052C, YPR065W, YAL001C, R0010W"

In [88]:
YFL052W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [89]:
tf_name = "YBL066C"
gene_ids = "YBR089C-A, YBR123C, YBR182C, YCL067C, YCL066W, YCR039C, YCR040W, YDL002C, YDR174W, YER148W, YGR047C, YGR140W, YKL032C, YMR072W, YMR094W, YMR168C, YNL216W, YOR110W, YPL089C, YPL007C, YPR052C, YPR065W, YAL001C, R0010W"

In [90]:
YBL066C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [91]:
tf_name = "YLR113W"
gene_ids = "YBR160W"

In [92]:
YLR113W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [93]:
tf_name = "YNL097C"
gene_ids = "YML062C, YOL020W"

In [94]:
YNL097C = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [95]:
tf_name = "MAL63"
gene_ids = "YSC0013, YSC0014"

In [96]:
MAL63 = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [97]:
tf_name = "YMR035W"
gene_ids = "YMR105C, YNL239W, YPL248C, YBR018C, YBR020W"

In [99]:
YMR035W = get_tf_promoter_info_as_dataframe(tf_name, gene_ids)

In [100]:
yeastract_derived_promoters = pd.concat([FLO8, YBR297W, YLR223C, YCR084C, YCR084C, YJL206C, 
                                         YOR363C, YMR172W, YOR372C, YGL252C, YML051W, YER040W, 
                                         YIL119C, YJL210W, YMR023C, YNL239W, YKR077W, YEL007W, 
                                         YOR066W, YJL124C, YDR174W, YBR112C, YDR009W, YGR288W,
                                         YLR183C, YOR290C, YKR064W, YFL052W, YBL066C, YLR113W,
                                         YNL097C, YMR035W], 
                                        axis=0, ignore_index=True)


In [101]:
yeastract_derived_promoters

Unnamed: 0,TF,Gene_ID,Promoter_Sequence
0,FLO8,YDL049C,GTACCCGCCCAAGACCAAGAAAGAAGATAGAGGTGTTGTGCCGGGG...
1,FLO8,YDL047W,GTTCTTTTTCTATTCAAAGACTCGAATGAGGCCAACGAAAGAAGTA...
2,FLO8,YDL037C,TATGGCTTACGTAAACATTATTTCTTTGATATACTATTCCGTTAGA...
3,FLO8,YDR055W,AGCTCCTATTATTTTGTAATAGTATGCTCTTTATTGTCCTTCTTAA...
4,FLO8,YDR082W,TGATTGTGACACAGGATCATTCGGTTCATTGGGGAAATAATTCCTG...
...,...,...,...
768,YMR035W,YPL248C,ACCCAATTGATCGCTCACGACAGCGAAGTTTTTGATGTAAGGTTTT...
769,YMR035W,YMR105C,ACAACAGACACCACTCTCCCACTTCCCTCTTTAAATAAAGACCAAG...
770,YMR035W,YBR018C,CCAGTTTGATTGTTGTTTTGTGGTGGATGAAAATGCTAAGCCAAGT...
771,YMR035W,YBR020W,AATGGCGTATTTCGTATGACCATACGGATTAGTAGGCCCTAAGGGA...


The remaining TF sequences were aggregated by using https://www.yeastgenome.org/ defined interactions for each TF in the same way as the yeastract data was used

In [102]:
new_rows = [
    {'TF': 'YDL233W', 'Gene_ID': '', 'Promoter_Sequence': 'TTCCCTTTTCTTTTTCTTTGTTGCGACAGTGGCTTCAAAGAACTGCTGATTGCTCAAGGCAATCAGTCCGAGCGTTTAGAAGGTGATTGTAGGCAGAAATTAACTTTGCGGTAAAAGAATGACATTCTTTCTTAAAAGAAAAATTAGCTTTTTTTTGTCAGGCATTGCACAAACTTTTTTATTTCTGCCTATACTCTTAAACAGATCAGTCATTCATGTTGTCTTTTTAACGGTCGTACTGGGACATCGCATACCTTGGGATTCCGTAATTAGGTGCAACAATACGGGCACAACTCATTCTGCGGTATCTTCACGGACAGAACTTCTATTGCCTATCGGTGGTGTGATTAACAATTGGAAGCGCAGAGCTTGGAATGGATTTTCAATTCAATGGATTTGGAGGTATTCGTTTGTTTACTAATATTTACTTTGAGGACATTGCCCAACCCTAAAAGTGCCTGCTCAAGAATAGAATAACATTATGATACGTTTTCTTGACCGCTGAGCAATTTAAAGCAACTATTAGGGTACGATTGTTTCTAGAGAAATGTGGGTCATCTTTTTAGGTCCGTTCTCTTCTGATGAGGTAACCTTTACAAAAATGTCATAGAGTTACCAATTGGGATTCAAGGCATCATCGCAATATACTTCGTTCTTTTACGGAGAAATTAAGCTCTTTCTACTTTGAATTAACTGTTAGACTTGTCTTATCTGAGGAATGTCCGTGTTCGAATTAAATAAAAAATTAGGGCAGTTTTATTTACCTTAACAAATATGTTCAAGCATTTACGTTACTGCGCTCTCTTCTAGTTCAAGAACGGATAACTCATAGACTTACCAGTACAAATTGTTGAAGGGTTCCCAATTGATAAAAAGGATCTCTTGCTTCCTAAAATAAACGTATAAAAAGCACCCTATTCATCAGTTATTATCCCTCGTCATGTTGTGGTTCTAATTAAAATATACTTTTGTAGGCCTCAAAAATCCATATACGCACACT'},
    {'TF': 'YGL252C', 'Gene_ID': '', 'Promoter_Sequence': 'GTTTAGTAACTACCGGAATCACTATTATATTGGTCATGATTAATATGACCAATCAGCGTGTGTTTTATATACATCTCTTATTTAGTATAAGAAGATCAGTACTTATTTCTTCATTAATACTAATTTTTAACCTCTAATTATCAACAGAGGAATATGAGGATTCCTAGATCCTCGAGGAGAACTTCTAGTATATTCTGTATACCTAATATTATAGCCTTTTTCAACAATGGAATCCCAACAATTATCTCAAAAATCACCTAAGCCTCACATGCATGTTTTACGTTGTGGTTGATGATCTATTGATGTTGAGGTACCATATTTTGAAATTCATTAAAACAACTTCTACCGGGCGTGTGGTCTAGAGGTATGATTCTCGCTTAGGGTGCGGGAGGTCCCGGGTTCGAGTCCCGGCTCGCCCCCATTGATTTATTTTTATCAAATTTTTTAATATTGCATATCTTTAGTGGGCAAAAAATTTTTCCCTGGGCAAAAATATAAGAGAGCTCATCTTATTGTTGTCAGCCCAATGATTCCCTTGTCAAATTGAATTTTCGGATTTACTTGTTCAGGTACCCGCGTTAAGGGGCTGCCGCGCCTGTCACTCTAAGAAAAAAGGAGCCATCAAAAACCATTCAGCATTAACTAAAAACGCGGGTAGAGATTACTACATATTCCAACAAGACCTTCGCAGGAAAGTATACCTAAACTAATTAAAGAAATCTCCGAAGTTCGCATTTCATTGAACGGCTCAATTAATCTTTGTAAATATGAGCGTTTTTACGTTCACATTGCCTTTTTTTTTATGTATTTACCTTGCATTTTTGTGCTAAAAGGCGTCACGTTTTTTTCCGCCGCAGCCGCCCGGAAATGAAAAGTATGACCCCCGCTAGACCAAAAATACTTTTGTGTTATTGGAGGATCGCAATCCCTTTGGAGCTTTTCCGATACTATCGACTTATCCGACCTCTTGTTGTTTGAAAATGTCAATTGATATCCATCC'},
    {'TF': 'YBR081C', 'Gene_ID': '', 'Promoter_Sequence': 'TTAACCGTACCTTTTTCATTTCTAGTCTATCTGTAGGTTAATTACTATTGTCATTAACATCATTTCTGGGGTGAAGCCTATTTAAATTTTTGAAGTTCAACGCATAGCTAGTATATGTAATCAACGATCAATGACTGGTTCTCTGTTTGGCAAAAATTCTGAGGAGCATTACACTGTACTAAGGAGGCAGAAGAATAACTGCAGGAGTAGCCAAAAAAATCTCCGCGACGGGGAATTGAACCCCGATCTGGCACGCGACAAGCGCCCATTCTGACCATTAAACTATCACGGAAGAAACAAAGCACTCACGATGGGGGTCGAACCCATAATCTTCTGATTAGAAGTCAGACGCGTTGCCATTACGCCACGCGAGCTACTATTTGTTGAAGGTTTTATGAAATAACGAAAACATGTTTCCTCTAAAGATGGATGTGCTTCCAGTATTATTCTTGTATGACAATTCATTTGTTTTTGGTGTCGGCTTGTATTAAAGACTTATATCTGTAATATATCTGTATAAATACCAATGCAAGGATATATTAAATGAAAGTTAGCATATTTCAATTGGCTTTTTAGAATAAAGACGAAATTTTTTTAAAGGTAAAATACCCCACTGAAACCTTATGCTCTTTTATTGTACATATAGGAAAATTGGTCAAATGCTGGCCAGCTTTAACTGAAATAAATCTCTACAGGAATGCTTGTTGGGACTAACTCCAGTGAATAATTATTAGGGCTGCGATTATATTCTTGATGGTACGGGTAACTTTCTAATATTTCAGATCCAAGTAATGAAAAGTGAATGTACACTCGTAAGTCTTCAATTATGTTAGATGTACAAAGGAAAAAAGATCATTGGAGAAGTTGGCAAAATACATCTTTTATTTATTTAGCAAGTCAAACAATAAGATCTTTCCTTTTTTCAGATTGAACTTCGGATCAGTGAAAATTGTTGGTTACTCTTTAAACTGTAACACTACAAAGAAATTAAGTCTGAATA'},    
    {'TF': 'YOR372C', 'Gene_ID': '', 'Promoter_Sequence': 'TGTCACAATTTGCAGACTGGCGCTTTTTCTTAGACATTTCTGGTGAGGCTGTGCCGACCAAGTAAGTTCGGGGCATTGCGTCACTGCTTGTAAGTGAATAATTGGTGTACATTATGCCAAATTGCAGATCTGTATCTGTTAATTTCTGCTACTACTTTCGTACAAACTATAGCTGCGTACCACAACAATTTAGTAATTTTGTGCCAATCTGTTATCTCAACAACGACATCTATTGTTTTGTAGATTTGCATACACTTTCTTTTGCAAACGACCTCCGGGTAGCCAAAACCCAAACGATATTTTTTTACTATCATCACCGTTTTCACATGCTGTCTAGTGGCCTAACTTATGAAACATAGTCAAACTACTTATAAAACTTCGGCAGTCAGACAGTTCTTTCATTAAATGGTGTCAAGTAGATATTTACGGCATTGGATTCTAGGGCCAATGTTATTTCTGTCTTAAAGGAGAGCGAATCAAATATCGGAGTGTTATTGTTAGATATTCTGAAAGGTAGTATGATTATCTGAAAGCCGATAGGAGTGGTACACTCCGCTCACACCTTGTTAAATATTCATTAGAACATCTCTGTTCCAATTCATAGCAGCATTGTTGGAAATATCTTGGTCTGAATCCCAATCCGGGCAAGTAGAGAGGATGAAATTTTCACTTTCGTACTTTAACCTGTTTAGGAAAAAGGTAAACAATAACAATACCTACCATTAGCGACTAATTATCTTTCTTTCTCCCTATAGATCTGCTCTATAAACAATGATTGAGTACTATAGAGGATCGTCTGCAATTTTATTCACTATTTATGTATTGGCACGGAAGACAAAAAGGCATCGTTGGAAGAAAGATTTGAGAACGACAATATAGATACCACCATATAACTCGTTCTTGGAGCTAGGTAAATAGATCCTGAGAACGTGTTTAACATCTGCGATATAATTAAAGCGCAATCAAATAAAATATTAACGTCATTGTTTTTTCGAAAAAG'},
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'GACGCGAA'}, 
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'ACGCGAAA'}, 
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'ACGCGAAA'}, 
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'NNNNACGCGT'}, 
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'NWCGCGT'}, 
    {'TF': 'YLR182W', 'Gene_ID': '', 'Promoter_Sequence': 'NACGCGT'}, 
    {'TF': 'YLR055C', 'Gene_ID': '', 'Promoter_Sequence': 'NNTATAW'}, 
    {'TF': 'YLR055C', 'Gene_ID': '', 'Promoter_Sequence': 'NTCSAATATATATANCTANNC'}, 
    {'TF': 'YPL049C', 'Gene_ID': '', 'Promoter_Sequence': 'NTGTTTCANA'}, 
    {'TF': 'YPL049C', 'Gene_ID': '', 'Promoter_Sequence': 'NTGTTTCAKN'}, 
    {'TF': 'YPL049C', 'Gene_ID': '', 'Promoter_Sequence': 'TGTTTCA'}, 
    {'TF': 'YPL049C', 'Gene_ID': '', 'Promoter_Sequence': 'TGTTTCA'}, 
    {'TF': 'YHR177W', 'Gene_ID': '', 'Promoter_Sequence': 'TTAAGGTT'}, 
    {'TF': 'YDR501W', 'Gene_ID': '', 'Promoter_Sequence': 'CGCGAAAA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'TGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'MTGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'GGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATAAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATGCAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATGAAACAATGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'TTGAAACAATGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATGAAACAATGAGACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'ATGAAACAATGAAACG'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'TTGAAACAATGAAACG'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'CGTTTCAAAATGAAACA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'TGTTTCA'}, 
    {'TF': 'YDR480W', 'Gene_ID': '', 'Promoter_Sequence': 'GTTTCANNNTGAAAC'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'NTCGGAG'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'GGAASACT'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'GGGAAAGGGTC'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'AGAACAA'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'CGGAGGACTCTNGNCCG'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'GYTCGGASGACWGTSCTCCGATG'}, 
    {'TF': 'YDR176W', 'Gene_ID': '', 'Promoter_Sequence': 'CGYTCGGAGGACAGTGCTCCGA'}, 
    {'TF': 'YDR277C', 'Gene_ID': '', 'Promoter_Sequence': 'CGGAAAAWNT'}, 
    {'TF': 'YDR277C', 'Gene_ID': '', 'Promoter_Sequence': 'YCGGAANAWT'}, 
    {'TF': 'YDR277C', 'Gene_ID': '', 'Promoter_Sequence': 'NCGGAAAAWNT'}, 
    {'TF': 'YDR277C', 'Gene_ID': '', 'Promoter_Sequence': 'NTCNNCCGGAAAAAYTTNNA'}, 
    {'TF': 'YDR392W', 'Gene_ID': '', 'Promoter_Sequence': 'NNTATAW'}, 
    {'TF': 'YDR392W', 'Gene_ID': '', 'Promoter_Sequence': 'NTCSAATATATATANCTANNC'} 
]

In [103]:
yeastract_derived_promoters_2 = pd.concat([yeastract_derived_promoters, pd.DataFrame(new_rows)], ignore_index=True)


In [105]:
yeastract_derived_promoters_2

Unnamed: 0,TF,Gene_ID,Promoter_Sequence
0,FLO8,YDL049C,GTACCCGCCCAAGACCAAGAAAGAAGATAGAGGTGTTGTGCCGGGG...
1,FLO8,YDL047W,GTTCTTTTTCTATTCAAAGACTCGAATGAGGCCAACGAAAGAAGTA...
2,FLO8,YDL037C,TATGGCTTACGTAAACATTATTTCTTTGATATACTATTCCGTTAGA...
3,FLO8,YDR055W,AGCTCCTATTATTTTGTAATAGTATGCTCTTTATTGTCCTTCTTAA...
4,FLO8,YDR082W,TGATTGTGACACAGGATCATTCGGTTCATTGGGGAAATAATTCCTG...
...,...,...,...
813,YDR277C,,YCGGAANAWT
814,YDR277C,,NCGGAAAAWNT
815,YDR277C,,NTCNNCCGGAAAAAYTTNNA
816,YDR392W,,NNTATAW


In [106]:
yeastract_derived_promoters_2.rename(columns={'TF': 'DBID', 'Gene_ID': 'Motif_ID', 'Promoter_Sequence': 'Consensus'}, inplace=True)

# Add new empty columns
yeastract_derived_promoters_2['Information_Content'] = ""
yeastract_derived_promoters_2['Shannon_Entropy'] = ""
yeastract_derived_promoters_2['Length'] = ""
yeastract_derived_promoters_2['Motif_Name'] = ""


In [107]:
yeastract_derived_promoters_2 = yeastract_derived_promoters_2[['Motif_ID', 'Motif_Name', 'Information_Content', 'Shannon_Entropy', 'Length', 'Consensus', 'DBID']]


In [108]:
yeastract_derived_promoters_2.to_csv("../data/yeast/missing_TF_sequences.tsv", sep = '\t')

In [109]:
combined = pd.concat([TFs, yeastract_derived_promoters_2], axis=0, ignore_index=True)
combined

Unnamed: 0,Motif_ID,Motif_Name,Information_Content,Shannon_Entropy,Length,Consensus,DBID
0,M00001_2.00,ABF1,15.21,1.874,8,TTATCACT,YKL112W
1,M00002_2.00,AFT2,7.829,13.994,10,NGGGTGTNNN,YPL202C
2,M00003_2.00,MBP1,10.704,6.334,8,GACGCGTA,YDL056W
3,M00004_2.00,SWI4,11.038,6.441,8,GACGCGAA,YER111C
4,M00005_2.00,XBP1,11.143,5.838,8,TCTCGAAG,YIL101C
...,...,...,...,...,...,...,...
1456,,,,,,YCGGAANAWT,YDR277C
1457,,,,,,NCGGAAAAWNT,YDR277C
1458,,,,,,NTCNNCCGGAAAAAYTTNNA,YDR277C
1459,,,,,,NNTATAW,YDR392W


In [None]:
combined.to_csv("../data/yeast/TF_info_scores_with_DBID_and_missing_sequences.tsv", sep = '\t')