In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 1. Load BED files

In [2]:
# Names assigned to the 18 columns of each BED table; the files are read
# without a header row, so these become the DataFrame column labels.
BED_colnames = [
    "chrom", "start_position", "end_position", "base_code", "score", "strand",
    "start_position2", "end_position2", "color",
    "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical", "Nother_mod",
    "Ndelete", "Nfail", "Ndiff", "Nnocall",
]

# One DataFrame per sample. read_table splits on tabs by default.
NIES_2145 = pd.read_table("nanno/bed/Nanoce_2145_sorted.bed", header=None, names=BED_colnames)
NIES_2146 = pd.read_table("nanno/bed/Nanoce_2146_sorted.bed", header=None, names=BED_colnames)
NIES_2145_bta1l = pd.read_table("nanno/bed/Nanoce_2145_bta1l_sorted.bed", header=None, names=BED_colnames)

# 2. Specify gene coordinates

## 2.1. Run BLAST
Run BLAST against the C018 reference genome to obtain the coordinates of the LPAT and DGAT genes.

In [57]:
# Create BLAST database
# Builds a nucleotide database (-dbtype nucl) from the reference FASTA
# ref/Nanoce_C018.fna and writes it with the prefix blast/blastdb/Nanoce_C018,
# which is the value the blastn searches in this notebook pass to -db.
# Re-running this cell replaces an existing database of the same name
# (see the "Deleted existing ..." line in the cell output).
!makeblastdb -in ref/Nanoce_C018.fna -input_type fasta -dbtype nucl -out blast/blastdb/Nanoce_C018



Building a new DB, current time: 04/14/2025 14:10:00
New DB name:   /Users/daffa/Documents/Work/Nannochloropsis/blast/blastdb/Nanoce_C018
New DB title:  ref/Nanoce_C018.fna
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/daffa/Documents/Work/Nannochloropsis/blast/blastdb/Nanoce_C018
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 30 sequences in 0.142379 seconds.




In [58]:
# Search LPAT positions in C018 reference genome
# The same query (LPAT.fna) is searched twice against the Nanoce_C018 database:
#   1. with -outfmt 6, writing a tab-separated hit table to blastn_LPAT_C018.txt
#   2. with the default output format, writing the full report to blastn_LPAT_C018_long.txt
!blastn -db blast/blastdb/Nanoce_C018 -query blast/query/LPAT_NIES2145_sequence/LPAT.fna -out blast/result/blastn_LPAT_C018.txt -outfmt 6
!blastn -db blast/blastdb/Nanoce_C018 -query blast/query/LPAT_NIES2145_sequence/LPAT.fna -out blast/result/blastn_LPAT_C018_long.txt

In [59]:
# Search DGAT positions in C018 reference genome
# Source of the DGAT information (author's note): (1) the Nanoce1779 GFF3 was converted
# to C018 with MetaEuk; (2) the rows containing DGAT-related information were kept.
# As for LPAT, the query (DGAT.fna) is searched twice: once with -outfmt 6 (tab-separated
# hit table) and once with the default output format ("_long" report).

!blastn -db blast/blastdb/Nanoce_C018 -query blast/query/DGAT_MNEG5482_sequence/DGAT.fna -out blast/result/blastn_DGAT_C018.txt -outfmt 6
!blastn -db blast/blastdb/Nanoce_C018 -query blast/query/DGAT_MNEG5482_sequence/DGAT.fna -out blast/result/blastn_DGAT_C018_long.txt

In [3]:
# Gene coordinates, entered by hand after inspecting the BLAST results.
# Every paralog gets three variables: <gene>_chrom, <gene>_start, <gene>_end.
# The names matter: the filtering cell looks them up through globals() using
# exactly this "<gene>_<field>" pattern.
# NOTE(review): presumably 1-based, inclusive positions as reported by BLAST - confirm.

# LPAT gene paralogs
LPAT1_chrom, LPAT1_start, LPAT1_end = 'JBEBFO010000001.1', 1520577, 1521443
LPAT2_chrom, LPAT2_start, LPAT2_end = 'JBEBFO010000006.1', 382392, 383729
LPAT3_chrom, LPAT3_start, LPAT3_end = 'JBEBFO010000010.1', 884381, 884871
LPAT4_chrom, LPAT4_start, LPAT4_end = 'JBEBFO010000010.1', 279778, 281688

# DGAT gene paralogs
DGATa_chrom, DGATa_start, DGATa_end = 'JBEBFO010000001.1', 1427596, 1428693
DGATb_chrom, DGATb_start, DGATb_end = 'JBEBFO010000001.1', 522961, 524319
DGATc_chrom, DGATc_start, DGATc_end = 'JBEBFO010000018.1', 509296, 510864
DGATd_chrom, DGATd_start, DGATd_end = 'JBEBFO010000002.1', 405931, 407490
DGATe_chrom, DGATe_start, DGATe_end = 'JBEBFO010000022.1', 212019, 213707
DGATf_chrom, DGATf_start, DGATf_end = 'JBEBFO010000004.1', 565364, 566802
DGATg_chrom, DGATg_start, DGATg_end = 'JBEBFO010000004.1', 447383, 449011
DGAT2_chrom, DGAT2_start, DGAT2_end = 'JBEBFO010000005.1', 739185, 740534
DGATh_chrom, DGATh_start, DGATh_end = 'JBEBFO010000008.1', 240363, 243278

## 2.2. Filter each sample by gene coordinates and save as individual BED files

In [6]:
# Step 1: Define gene lists
lpat_genes = ["LPAT1", "LPAT2", "LPAT3", "LPAT4"]
dgat_genes = ["DGATa", "DGATb", "DGATc", "DGATd", "DGATe", "DGATf", "DGATg", "DGAT2", "DGATh"]
all_genes = lpat_genes + dgat_genes  # built once, reused by every loop below

# Step 2: Extract gene coordinates using globals()
# Each gene is described by three notebook-level variables
# (<gene>_chrom, <gene>_start, <gene>_end); gather them into one lookup table.
gene_coords = {}
for gene in all_genes:
    gene_coords[gene] = {
        "chrom": globals()[f"{gene}_chrom"],
        "start": globals()[f"{gene}_start"],
        "end": globals()[f"{gene}_end"]
    }

# Step 3: Sample-to-DataFrame mapping
sample_dfs = {
    "NIES_2145": NIES_2145,
    "NIES_2146": NIES_2146,
    "NIES_2145_bta1l": NIES_2145_bta1l
}

# Step 4: Create output folder
output_dir = "sample_gene_bed"
os.makedirs(output_dir, exist_ok=True)


def _select_gene_rows(bed_df, chrom, start, end):
    """Return the rows of a BED table that lie inside one gene.

    Parameters
    ----------
    bed_df : pandas.DataFrame
        Table with ``chrom``, ``start_position`` and ``end_position`` columns
        in BED convention (0-based start, exclusive end).
    chrom : str
        Sequence identifier the gene is located on.
    start, end : int
        Gene boundaries as taken from the BLAST results, i.e. 1-based with
        both ends included. They may be given in either order (BLAST reports
        start > end for hits on the reverse strand).

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index; empty if nothing matches.
    """
    first, last = sorted((start, end))
    in_gene = (
        (bed_df["chrom"] == chrom)
        # A site on 1-based base p is stored as BED row (p - 1, p). Comparing
        # start_position with the 1-based gene start directly would silently
        # drop a site sitting on the first base of the gene, hence the -1.
        & (bed_df["start_position"] >= first - 1)
        # The BED end is exclusive, so it already equals the 1-based position.
        & (bed_df["end_position"] <= last)
    )
    return bed_df[in_gene].reset_index(drop=True)


# Step 5: Filter and save for each sample and gene
for sample_name, df in sample_dfs.items():
    for gene_name in all_genes:
        coords = gene_coords[gene_name]
        filtered_df = _select_gene_rows(df, coords["chrom"], coords["start"], coords["end"])

        # Save as variable, e.g. NIES_2145_LPAT1, so later cells can refer to
        # each sample/gene table by name.
        var_name = f"{sample_name}_{gene_name}"
        globals()[var_name] = filtered_df

        # Export as BED (tab-separated, no header and no index column)
        bed_path = os.path.join(output_dir, f"{var_name}.bed")
        filtered_df.to_csv(bed_path, sep="\t", index=False, header=False)