In [1]:
import os
import pandas as pd
from Bio import SeqIO
from datasets import Dataset

In [40]:
def process_fname(fname):
    """Derive dataset attributes from the name of a FASTA file.

    Only the last path component of ``fname`` is inspected.

    Returns a 3-tuple ``(org, label, signal)``:
      * ``org``    -- the first two characters of the file name.
      * ``label``  -- 0 when the file name contains '_neg', otherwise 1.
      * ``signal`` -- 'PAS' when the file name contains '_polyA', else 'TIS'
        when it contains 'TIS', else None.
    """
    base_name = os.path.basename(fname)

    # '_polyA' takes precedence over 'TIS' when both markers are present.
    if '_polyA' in base_name:
        signal = 'PAS'
    elif 'TIS' in base_name:
        signal = 'TIS'
    else:
        signal = None

    label = 0 if '_neg' in base_name else 1
    return base_name[:2], label, signal

def get_motif(seq):
    """Return the motif that starts at offset 300 of ``seq`` as a string.

    The motif length is chosen from the total sequence length:
    606 -> 6 characters, 603 -> 3 characters, anything else -> 0 characters
    (an empty string is returned).

    NOTE(review): presumably the sequences carry 300 flanking positions on
    each side of a 6-nt (PAS) or 3-nt (TIS) signal -- confirm against the data.
    """
    motif_start = 300
    motif_len = {606: 6, 603: 3}.get(len(seq), 0)
    # join() turns the slice into a plain str whatever sequence type was given.
    return ''.join(seq[motif_start:motif_start + motif_len])

def process_fasta(fname):
    """Read a FASTA file and return one tuple per record.

    Each tuple is ``(file name, record id, sequence, motif, organism, label,
    signal)``: the file name is the last path component of ``fname``, the
    motif comes from ``get_motif`` and the last three fields come from
    ``process_fname(fname)``, so they are identical for every record of a file.
    """
    base_name = os.path.split(fname)[1]
    org, label, signal = process_fname(fname)

    rows = []
    for record in SeqIO.parse(fname, 'fasta'):
        sequence = str(record.seq)
        motif = get_motif(record.seq)
        rows.append((base_name, record.id, sequence, motif, org, label, signal))
    return rows

In [27]:
data_folder = '../Data'
organisms = ['Bovine', 'Human', 'Mouse', 'FruitFly']


def _list_fasta_paths(root, signal, orgs=organisms):
    """Return the paths of the FASTA files stored under ``root/<org>/<signal>``.

    Args:
        root: base data folder; used both for listing and for building paths.
        signal: name of the signal sub-folder (e.g. 'PAS' or 'TIS').
        orgs: organism sub-folders to visit, in the order they are visited.

    Returns:
        A list of ``'{root}/{org}/{signal}/{fname}'`` strings for every entry
        whose file name contains '.fa'. Within an organism the order is
        whatever ``os.listdir`` returns.

    Raises:
        OSError: if one of the ``root/<org>/<signal>`` folders cannot be listed.
    """
    return [
        f'{root}/{org}/{signal}/{fname}'
        for org in orgs
        for fname in os.listdir(f'{root}/{org}/{signal}')
        # Test the file name only, so a '.fa' in a directory name never matches.
        if '.fa' in fname
    ]


# `signal` is kept as a module-level name and ends up as 'TIS', as before.
signal = 'PAS'
fnames_pas = _list_fasta_paths(data_folder, signal)
signal = 'TIS'
fnames_tis = _list_fasta_paths(data_folder, signal)
fnames = fnames_pas + fnames_tis

In [41]:
# Flatten the per-file record tuples into a single list of rows.
all_seqs = [
    row
    for fasta_path in fnames
    for row in process_fasta(fasta_path)
]

# Column order mirrors the tuple layout produced by process_fasta.
df = pd.DataFrame(
    all_seqs,
    columns=['fname', 'id', 'sequence', 'motif', 'organism', 'label', 'signal'],
)

# Convert to a Hugging Face dataset and upload it to the Hub repository.
dataset = Dataset.from_pandas(df)
dataset.push_to_hub("dvgodoy/DeepGSR_sequences")