# Motif Analysis

In [1]:
import os
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [2]:
# Directory holding the background CDS FASTA files.
background_directory = "/mnt/d/ebola/data/FIMO/background_cds"
# Directory holding the target CDS FASTA files.
target_directory = "/mnt/d/ebola/data/FIMO/target_cds"

In [3]:
def load_fasta_files(directory):
    """Load every ``.fa`` file found directly in ``directory`` into one DataFrame.

    Each FASTA record becomes one row, so a file holding several records
    contributes several rows that share the same index label.

    Parameters
    ----------
    directory : str
        Directory scanned (non-recursively) for file names ending in ".fa".

    Returns
    -------
    pandas.DataFrame
        Indexed by "gene" (the file name up to its first "."), with columns
        "header" (the record description), "sequence" (the sequence as a
        string) and "sequence_length" (number of characters in "sequence").
        When no records are found the frame is empty but has the same columns.
    """
    columns = ["gene", "header", "sequence"]
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".fa"):
            filepath = os.path.join(directory, filename)
            for record in SeqIO.parse(filepath, "fasta"):
                data.append(
                    {
                        "gene": filename.split(".")[0],
                        "header": record.description,
                        "sequence": str(record.seq),
                    }
                )

    # Naming the columns explicitly keeps set_index("gene") valid even when no
    # record was collected (an empty list would otherwise yield no columns).
    df = pd.DataFrame(data, columns=columns).set_index("gene")
    df["sequence_length"] = df["sequence"].str.len()
    return df


def dataframe_to_fasta(df, output_file):
    """Write the rows of ``df`` to ``output_file`` in FASTA format.

    One record is produced per row: the row's index label becomes the record
    id, the "sequence" column supplies the sequence, and the description is
    left empty.
    """
    fasta_records = [
        SeqRecord(Seq(fields["sequence"]), id=label, description="")
        for label, fields in df.iterrows()
    ]
    SeqIO.write(fasta_records, output_file, "fasta")

In [4]:
# Read the target and background FASTA collections with the same loader.
target_df = load_fasta_files(directory=target_directory)
background_df = load_fasta_files(directory=background_directory)

In [5]:
# Mean sequence length over all target records; used below as the reference
# length for picking background records.
avg_length = target_df.loc[:, "sequence_length"].mean()
avg_length

1896.4285714285713

In [6]:
# Number of background records to keep.
n_background = 60

# Absolute difference between each background record's length and the mean
# target length. Stored on background_df so it can be inspected afterwards.
background_df["distance"] = (background_df["sequence_length"] - avg_length).abs()

# Records with an empty sequence (length 0) cannot serve as background
# sequences, so they are excluded before ranking.
background_candidates = background_df[background_df["sequence_length"] > 0]

# A stable sort resolves ties in distance by input order, which keeps the
# selected set reproducible.
background_60 = background_candidates.sort_values(by="distance", kind="stable").head(
    n_background
)

In [7]:
# Export the target set and the selected background set as FASTA files
# (relative paths, so they are written to the current working directory).
dataframe_to_fasta(df=target_df, output_file="target_cds.fa")
dataframe_to_fasta(df=background_60, output_file="background_cds.fa")

In [15]:
# Inspect the 15 shortest background records (ascending is the default order).
background_df.sort_values("sequence_length").head(15)

Unnamed: 0_level_0,header,sequence,sequence_length,distance
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AMZ1,AMZ1-206 cds:protein_coding_CDS_not_defined,,0,1896.428571
ANXA1,ANXA1-208 cds:protein_coding_CDS_not_defined,,0,1896.428571
CMPK2,CMPK2-206 cds:protein_coding_CDS_not_defined,,0,1896.428571
TMEM268,TMEM268-204 cds:protein_coding_CDS_not_defined,,0,1896.428571
GBP2,GBP2-202 cds:retained_intron,,0,1896.428571
PSMA5,PSMA5-204 cds:protein_coding_CDS_not_defined,,0,1896.428571
SAT1,SAT1-202 cds:retained_intron,,0,1896.428571
GPNMB,GPNMB-210 cds:protein_coding_CDS_not_defined,,0,1896.428571
MNDA,MNDA-203 cds:protein_coding_CDS_not_defined,,0,1896.428571
MX2,MX2-213 cds:retained_intron,,0,1896.428571
