In [1]:
!pip install biopython pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import boto3
import os
import pandas as pd
from Bio import SeqIO
import io

In [None]:
# S3 locations used by the cells below. The defaults are blank, so either fill
# them in here or export the matching environment variable before starting the
# kernel; an unset variable falls back to the empty string.
s3_bucket = os.environ.get("S3_BUCKET", "")
s3_key_gtf = os.environ.get("S3_KEY_GTF", "")  # annotation file previewed/loaded as GTF
s3_key_fasta = os.environ.get("S3_KEY_FASTA", "")  # transcript FASTA parsed with SeqIO
s3_key_gct = os.environ.get("S3_KEY_GCT", "")  # tab-separated junction table
s3_key_transcripts = os.environ.get("S3_KEY_TRANSCRIPTS", "")  # transcripts CSV read back later

In [4]:
# Single S3 client shared by every download/upload cell below.
s3 = boto3.client("s3")

In [5]:
# Fetch the FASTA object and keep its full contents in an in-memory,
# seekable byte buffer so later cells can rewind and re-read it.
print("üì• Streaming FASTA file from S3...")
fasta_obj = s3.get_object(Key=s3_key_fasta, Bucket=s3_bucket)
fasta_stream = io.BytesIO(
    fasta_obj["Body"].read()
)


üì• Streaming FASTA file from S3...


In [None]:
# Only the first FASTA_PREVIEW_CHARS characters are printed: dumping the whole
# file floods the notebook output. `fasta_content` still holds the full text.
FASTA_PREVIEW_CHARS = 2000

fasta_stream.seek(0)
fasta_content = fasta_stream.read().decode("utf-8")
print(fasta_content[:FASTA_PREVIEW_CHARS])

In [7]:
print("üîç Parsing FASTA file...")
transcripts = []

# Rewind the downloaded bytes and expose them as text for the FASTA parser.
fasta_stream.seek(0)
fasta_text_stream = io.StringIO(fasta_stream.read().decode("utf-8"))

# Header fields that carry region coordinates, in output-column order.
region_prefixes = ("UTR5:", "CDS:", "UTR3:")

for record in SeqIO.parse(fasta_text_stream, "fasta"):
    # The description is "|"-delimited; fields 0, 1 and 5 are taken as the
    # transcript ID, gene ID and gene name
    # (e.g. ENST00000641515.2, ENSG00000186092.7, OR4F5).
    fields = record.description.split("|")
    identifiers = [fields[0], fields[1], fields[5]]

    # Regions that are absent from the header keep the "NA" placeholder.
    regions = dict.fromkeys(region_prefixes, "NA")
    for field in fields:
        for prefix in region_prefixes:
            if field.startswith(prefix):
                regions[prefix] = field.split(":")[1]
                break

    # One row per record: identifiers, region coordinates, then the sequence.
    transcripts.append(
        identifiers
        + [regions[prefix] for prefix in region_prefixes]
        + [str(record.seq)]
    )


üîç Parsing FASTA file...


In [8]:
# Tabulate the parsed rows; the column order matches the order in which each
# row list was assembled by the parsing cell.
df = pd.DataFrame(
    data=transcripts,
    columns=[
        "Transcript_ID",
        "Gene_ID",
        "Gene_Name",
        "UTR5",
        "CDS",
        "UTR3",
        "Sequence",
    ],
)
print("‚úÖ FASTA Parsing Completed!")

‚úÖ FASTA Parsing Completed!


In [9]:
# Serialise the table to CSV text in memory (no index column) and upload it
# to the bucket under a fixed key.
csv_text = df.to_csv(index=False)

s3.put_object(Key='parsed_transcripts.csv', Bucket=s3_bucket, Body=csv_text)
print("‚úÖ CSV file uploaded to S3 successfully!")

‚úÖ CSV file uploaded to S3 successfully!


In [10]:
# Show the first rows of the parsed transcript table as a sanity check.
df.head()

Unnamed: 0,Transcript_ID,Gene_ID,Gene_Name,UTR5,CDS,UTR3,Sequence
0,ENST00000641515.2,ENSG00000186092.7,OR4F5,1-60,61-1041,1042-2618,CCCAGATCTCTTCAGTTTTTATGCCTCATTCTGTGAAAATTGCTGT...
1,ENST00000426406.4,ENSG00000284733.2,OR4F29,,1-939,,ATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG...
2,ENST00000332831.5,ENSG00000284662.2,OR4F16,,1-939,,ATGGATGGAGAGAATCACTCAGTGGTATCTGAGTTTTTGTTTCTGG...
3,ENST00000616016.5,ENSG00000187634.13,SAMD11,1-509,510-3044,3045-3465,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...
4,ENST00000618323.5,ENSG00000187634.13,SAMD11,1-509,510-3047,3048-3468,GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...


In [11]:
# Stream only the first lines of the GTF instead of downloading and decoding
# the whole object just to preview it.
GTF_PREVIEW_LINES = 10

gtf_obj = s3.get_object(Bucket=s3_bucket, Key=s3_key_gtf)

print("üîç Previewing GTF File...")
# zip() stops at whichever iterable ends first, so a file shorter than the
# preview length no longer raises StopIteration.
for _, raw_line in zip(range(GTF_PREVIEW_LINES), gtf_obj["Body"].iter_lines()):
    print(raw_line.decode("utf-8").strip())

# Close the response body without reading the remainder of the object.
gtf_obj["Body"].close()

üîç Previewing GTF File...
##description: evidence-based annotation of the human genome (GRCh38), version 47 (Ensembl 113)
##provider: GENCODE
##contact: gencode-help@ebi.ac.uk
##format: gtf
##date: 2024-07-19
chr1	HAVANA	gene	11121	24894	.	+	.	gene_id "ENSG00000290825.2"; gene_type "lncRNA"; gene_name "DDX11L16"; level 2; tag "overlaps_pseudogene";
chr1	HAVANA	transcript	11121	14413	.	+	.	gene_id "ENSG00000290825.2"; transcript_id "ENST00000832824.1"; gene_type "lncRNA"; gene_name "DDX11L16"; transcript_type "lncRNA"; transcript_name "DDX11L16-260"; level 2; tag "TAGENE";
chr1	HAVANA	exon	11121	11211	.	+	.	gene_id "ENSG00000290825.2"; transcript_id "ENST00000832824.1"; gene_type "lncRNA"; gene_name "DDX11L16"; transcript_type "lncRNA"; transcript_name "DDX11L16-260"; exon_number 1; exon_id "ENSE00004248723.1"; level 2; tag "TAGENE";
chr1	HAVANA	exon	12010	12227	.	+	.	gene_id "ENSG00000290825.2"; transcript_id "ENST00000832824.1"; gene_type "lncRNA"; gene_name "DDX11L16"; transcript_t

In [12]:
# Download the GTF object again and materialise it as a list of text lines.
# NOTE(review): this reads the whole object into memory, decodes it and then
# splits it, all in one expression; the captured output for this cell ends in
# a KeyboardInterrupt, so it may be too slow/large for the full annotation.
print("üì• Streaming GTF file from S3...")
response = s3.get_object(Bucket=s3_bucket, Key=s3_key_gtf)
gtf_data = response["Body"].read().decode("utf-8").splitlines()

üì• Streaming GTF file from S3...


KeyboardInterrupt: 

# Looking into the GTEx junction data

In [None]:
# Load the transcripts CSV back from S3.
# keep_default_na=False keeps the literal "NA" placeholders (written for
# transcripts without a UTR5/CDS/UTR3 entry) as strings; by default pandas
# would convert them to NaN, which defeats the later `!= "NA"` check.
print("üì• Loading transcript data from S3...")
transcripts_obj = s3.get_object(Bucket=s3_bucket, Key=s3_key_transcripts)
transcripts_df = pd.read_csv(
    io.BytesIO(transcripts_obj['Body'].read()),
    keep_default_na=False,
)
print("‚úÖ Transcripts data loaded!")

üì• Loading transcript data from S3...


In [None]:
# Load the junction table from S3, parsing only the columns used downstream
# so the resulting DataFrame does not hold every column of the file.
print("üì• Streaming GTEx junction data from S3...")
gct_obj = s3.get_object(Bucket=s3_bucket, Key=s3_key_gct)

junction_columns = ["Chromosome", "Start", "End", "Strand", "ReadCount"]

# skiprows=2 drops the first two lines of the file before the header row
# (presumably the GCT version and dimension lines — TODO confirm).
# NOTE(review): read_csv raises ValueError if any requested column is absent.
junctions_df = pd.read_csv(
    io.BytesIO(gct_obj['Body'].read()),
    sep="\t",
    skiprows=2,
    usecols=junction_columns,
)

# usecols keeps the file's column order; re-select to get the order above.
junctions_df = junctions_df[junction_columns]
print("‚úÖ GTEx junction data streamed!")


üì• Streaming GTEx junction data from S3...


: 

In [None]:
def check_exon_inclusion(exon_start, exon_end, junctions):
    """Label an exon as included (1) or skipped (0) from junction read counts.

    Parameters
    ----------
    exon_start, exon_end : int
        Exon boundaries, compared directly against the junction coordinates.
    junctions : pandas.DataFrame
        Must provide "Start", "End" and "ReadCount" columns.

    Returns
    -------
    int
        1 when the Percent Spliced In (PSI) exceeds 0.5, otherwise 0.
    """
    starts = junctions["Start"]
    ends = junctions["End"]
    read_counts = junctions["ReadCount"]

    # Junctions lying entirely within the exon count as inclusion evidence.
    inside_exon = (starts >= exon_start) & (ends <= exon_end)
    # Junctions that begin before and end after the exon count as skipping.
    spanning_exon = (starts < exon_start) & (ends > exon_end)

    included = read_counts[inside_exon].sum()
    skipped = read_counts[spanning_exon].sum()

    # The small epsilon keeps the ratio defined when there are no reads at all.
    psi = included / (included + skipped + 1e-6)

    return int(psi > 0.5)

In [None]:
# Build [sequence, label] rows: one row per CDS region of every transcript
# that has CDS coordinates.
# NOTE(review): the CDS ranges come from the FASTA header and look
# transcript-relative (e.g. "61-1041"), while they are compared directly with
# the junction Start/End values and without filtering on Chromosome/Strand —
# confirm both use the same coordinate system.
fine_tune_data = []

for _, row in transcripts_df.iterrows():
    cds_field = row["CDS"]

    # pd.read_csv converts the "NA" placeholder to NaN by default, so treat a
    # missing value, the literal "NA" and a blank field all as "no CDS"
    # instead of crashing on `.split` of a float.
    if pd.isna(cds_field):
        continue
    cds_field = str(cds_field).strip()
    if cds_field in ("", "NA"):
        continue

    sequence = row["Sequence"]

    # A CDS entry may list several comma-separated "start-end" regions.
    for region in cds_field.split(","):
        exon_start, exon_end = map(int, region.split("-"))

        label = check_exon_inclusion(exon_start, exon_end, junctions_df)

        # Every region of a transcript is paired with the full sequence.
        fine_tune_data.append([sequence, label])

In [None]:
# Two-column training table: the transcript sequence and its 0/1 label.
dnabert_df = pd.DataFrame(
    data=fine_tune_data,
    columns=["sequence", "label"],
)

In [None]:
# Inspect the first rows of the fine-tuning table.
dnabert_df.head()