# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import math
import pandas as pd
import json
import os
import glob

from tqdm import tqdm
import seaborn as sns


import screed
import sklearn

%matplotlib inline

In [3]:
from path_constants import (
    QFO_EUKARYOTA_FOLDER,
    ORPHEUM_BENCHMARKING_FOLDER,
    ORPHEUM_GROUND_TRUTH_FOLDER,
    ORPHEUM_PIPELINE_RESULTS_FOLDER
)

# Get ids of "good" reads to use for classification

## Subset to only reads from complete protein sequences -- *will not change with input species*

**TODO:** Consider writing these IDs to a file so they do not have to be recomputed on every run.

In [5]:
# Use only the busco mammalia proteins: keep the ID of every record whose
# protein sequence begins with "M". The ID is the second "|"-delimited field
# of the first whitespace-separated token of the record name.
# NOTE(review): "mammlia" in the filename is kept verbatim -- it has to match
# the file on disk.
with screed.open(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606__busco_mammlia_odbv10.fasta")
) as records:
    busco_mammalia_uniprot_protein_starts_with_m = [
        record["name"].split()[0].split("|")[1]
        for record in records
        if record["sequence"].startswith("M")
    ]
print(
    "busco_mammalia_uniprot_protein_starts_with_m",
    len(busco_mammalia_uniprot_protein_starts_with_m),
)


# Same ID extraction for the DNA records, keeping those that begin with "ATG"
with screed.open(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606_DNA.fasta")
) as records:
    uniprot_dna_starts_with_atg = [
        record["name"].split()[0].split("|")[1]
        for record in records
        if record["sequence"].startswith("ATG")
    ]
print("uniprot_dna_starts_with_atg", len(uniprot_dna_starts_with_atg))

busco_mammalia_uniprot_protein_starts_with_m 8904
uniprot_dna_starts_with_atg 20336


In [6]:
# IDs present in BOTH lists: protein starts with "M" and DNA starts with "ATG"
busco_mammalia_startswith_m__and__dna_startswith_atg = (
    set(busco_mammalia_uniprot_protein_starts_with_m)
    & set(uniprot_dna_starts_with_atg)
)
len(busco_mammalia_startswith_m__and__dna_startswith_atg)

8833

## Read gold standard reading frame file

In [7]:
parquet = os.path.join(ORPHEUM_GROUND_TRUTH_FOLDER, 'true_reading_frames.parquet')

# Load the gold-standard table and keep just the `is_coding` column as a
# Series (single column)
true_coding_frame = pd.read_parquet(parquet)['is_coding']
true_coding_frame.head()

read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=1      True
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=2     False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=3     False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=-1    False
read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1__frame=-2    False
Name: is_coding, dtype: bool

# Read coding score CSVs

## Add read_id_frame and is_coding for computing metrics

In [8]:
def add_read_id_frame_and_is_coding(df):
    """Add ``read_id_frame`` and ``is_coding`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``read_id``, ``translation_frame`` and
        ``category``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now with two extra columns:
        ``read_id_frame`` (``"<read_id>__frame=<translation_frame>"``) and
        ``is_coding`` (True where ``category`` equals ``"Coding"``).
    """
    read_ids = df["read_id"].astype(str)
    frames = df["translation_frame"].astype(str)

    # Combined key identifying a single (read, translation frame) pair
    df["read_id_frame"] = read_ids + "__frame=" + frames
    df["is_coding"] = df["category"].eq("Coding")
    return df

## Get human busco mammalia reads

In [9]:
# Table mapping UniProt ids to their source database ids
# (columns as shown in the output: source__uniprot_id, source__id_type, source__db_id)
csv = os.path.join(QFO_EUKARYOTA_FOLDER, "busco_mammalia_human_uniprot_ids_in_qfo.csv")
human_busco_mammalia = pd.read_csv(csv)

print(human_busco_mammalia.shape)
human_busco_mammalia.head()

(22539, 3)


Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
0,P61981,OrthoDB,1176818at2759
1,P27348,OrthoDB,1176818at2759
2,P30443,OrthoDB,1390181at2759
3,Q96QU6,OrthoDB,1156861at2759
4,P10321,OrthoDB,1390181at2759


## Function to read coding scores CSVs consistently

In [17]:
def read_translate_csvs(globber, species=None,
                        human_busco_mammalia=human_busco_mammalia,
                        uniprot_dna_starts_with_atg=uniprot_dna_starts_with_atg,
                        true_coding_frame=true_coding_frame,
                        allowed_uniprot_ids=None):
    """Read, annotate, filter and concatenate coding-score CSVs.

    Parameters
    ----------
    globber : str
        Glob pattern matching the CSV files to read. Each file's basename must
        split on ``"__"`` into exactly three parts; the middle part must split
        on ``"_"`` into two ``<label>-<value>`` pairs, the first giving the
        alphabet and the second the (integer) k-mer size.
    species : str, optional
        Value stored in the ``species`` column of every row.
    human_busco_mammalia : pandas.DataFrame
        Table with a ``source__uniprot_id`` column; only reads whose
        ``uniprot_id`` appears there are kept.
    uniprot_dna_starts_with_atg : list
        Not used directly by this function; kept in the signature so existing
        callers keep working.
    true_coding_frame : pandas.Series
        Gold-standard data; only rows whose ``read_id_frame`` is in its index
        are kept.
    allowed_uniprot_ids : collection of str, optional
        Additional whitelist of ``uniprot_id`` values. When ``None`` (the
        default) the notebook-level set
        ``busco_mammalia_startswith_m__and__dna_startswith_atg`` is used, which
        is what this function previously read implicitly.

    Returns
    -------
    pandas.DataFrame
        All filtered rows from all matched files, indexed by ``read_id_frame``
        and sorted by that index.

    Raises
    ------
    ValueError
        If ``globber`` matches no files, or if a basename does not follow the
        expected naming scheme.
    """
    if allowed_uniprot_ids is None:
        # Looked up at call time so the hidden dependency on the notebook
        # global is explicit and overridable.
        allowed_uniprot_ids = busco_mammalia_startswith_m__and__dna_startswith_atg

    # Sort so files are processed in a reproducible order
    filenames = sorted(glob.glob(globber))
    if not filenames:
        # pd.concat([]) would raise a much less helpful ValueError
        raise ValueError(f"No coding score CSVs matched {globber!r}")

    dfs = []
    for filename in tqdm(filenames):
        basename = os.path.basename(filename)
        _, molecule_ksize, _ = basename.split('__')
        molecule_, ksize_ = molecule_ksize.split('_')
        molecule = molecule_.split('-')[1]
        ksize = int(ksize_.split('-')[1])

        df = pd.read_csv(filename)
        df['alphabet'] = molecule
        df['ksize'] = ksize
        df['species'] = species
        df = add_read_id_frame_and_is_coding(df)

        # The protein id sits between the first "/" and the first ";" of the
        # read id; the UniProt id is its second "|"-delimited field
        df['protein_id'] = df['read_id'].map(lambda x: x.split('/')[1].split(';')[0])
        df['uniprot_id'] = df['protein_id'].str.split('|').str[1]

        keep = (
            # Only reads generated from busco mammalia data
            df['uniprot_id'].isin(human_busco_mammalia.source__uniprot_id)
            # Only ids in the whitelist (by default: protein starts with "M"
            # and DNA starts with "ATG")
            & df['uniprot_id'].isin(allowed_uniprot_ids)
            # Only reads and frames present in the gold standard data
            & df['read_id_frame'].isin(true_coding_frame.index)
        )

        # Set the read id and frame as the row names
        dfs.append(df.loc[keep].set_index('read_id_frame'))

    concatenated = pd.concat(dfs, ignore_index=False)
    return concatenated.sort_index()

## Read Busco mammalia coding scores

In [None]:
%%time
dfs = []
globber = os.path.join(
    ORPHEUM_PIPELINE_RESULTS_FOLDER, "nf-predictorthologs--busco-mammalia-*"
)
# One results subfolder per match; the species label is whatever follows the
# last "--" in the folder name
for subfolder in glob.glob(globber):
    basename = os.path.basename(subfolder)
    species = basename.split("--")[-1]
    print(f'basename: {basename}')
    print(f'\tspecies: {species}')

    csvs = os.path.join(subfolder, "*coding_scores.csv")
    dfs.append(
        read_translate_csvs(
            csvs,
            species=species,
            human_busco_mammalia=human_busco_mammalia,
            uniprot_dna_starts_with_atg=uniprot_dna_starts_with_atg,
            true_coding_frame=true_coding_frame,
        )
    )

# Stack the per-species frames into one table
coding_scores = pd.concat(dfs)
print(coding_scores.shape)
coding_scores.head()

nf-predictorthologs--busco-mammalia--baiji
baiji


 80%|████████████████████████████████████████████████████████████▌               | 43/54 [16:33<09:07, 49.81s/it]

In [16]:
# Sanity check: print the set of species labels found in each frame
for species_df in dfs:
    print(set(species_df['species']))

In [13]:
# Show the distinct species labels in each frame. A bare expression inside a
# loop is never displayed by the notebook, so print it explicitly.
for d in dfs:
    print(d.species.unique())

In [14]:
# Number of rows per species in the combined table
coding_scores["species"].value_counts()

NameError: name 'coding_scores' is not defined

### Write concatenated n frames per read, categorization to file

In [None]:
# Persist the combined coding scores as parquet in the pipeline results folder
coding_scores_parquet = os.path.join(
    ORPHEUM_PIPELINE_RESULTS_FOLDER, "coding_scores.parquet"
)
coding_scores.to_parquet(coding_scores_parquet)