# Imports

In [1]:
# # Python standard library
import itertools
import os
import re
from io import StringIO

# # Third-party libraries
import pandas as pd
import screed
from tqdm import tqdm

# # Local python files
from path_constants import (
    DATA_FOLDER,
    ORPHEUM_BENCHMARKING_FOLDER,
    ORPHEUM_GROUND_TRUTH_FOLDER,
    QFO_EUKARYOTA_FOLDER,
    SIMULATED_RNASEQ_FOLDER,
    SIMULATED_READS_FASTQ
)

# Subset to only reads from complete protein sequences -- no fragments

## Get good uniprot ids, starting with M amino acid and ATG codon

In [None]:
def _uniprot_ids_with_sequence_prefix(fasta, prefix):
    """Collect the short UniProt id of every FASTA record whose sequence starts with ``prefix``.

    The short id is built from the first two ``|``-separated fields of the
    first whitespace-delimited token of the record name, e.g. ``sp|A0A075B6K2``.

    Parameters
    ----------
    fasta : str
        Path of the FASTA file, opened with ``screed``.
    prefix : str
        Required beginning of the record sequence.

    Returns
    -------
    list of str
        Short ids in file order, one per matching record.
    """
    with screed.open(fasta) as records:
        return [
            "|".join(record["name"].split()[0].split("|")[:2])
            for record in records
            if record["sequence"].startswith(prefix)
        ]


protein_fasta = os.path.join(
    QFO_EUKARYOTA_FOLDER,
    "UP000005640_9606.fasta",
)
cdna_fasta = os.path.join(
    QFO_EUKARYOTA_FOLDER,
    "UP000005640_9606_DNA.fasta",
)

# Protein records whose amino acid sequence begins with M
uniprot_protein_starts_with_m = _uniprot_ids_with_sequence_prefix(protein_fasta, "M")
print("uniprot_protein_starts_with_m", len(uniprot_protein_starts_with_m))

# cDNA records whose nucleotide sequence begins with ATG
uniprot_dna_starts_with_atg = _uniprot_ids_with_sequence_prefix(cdna_fasta, "ATG")
print("uniprot_dna_starts_with_atg", len(uniprot_dna_starts_with_atg))

In [None]:
! tail $protein_fasta

In [None]:
uniprot_dna_starts_with_atg[:3]

In [None]:
uniprot_protein_starts_with_m[:3]

In [None]:
# Ids that pass both filters: cDNA starts with ATG and protein starts with M
uniprot_starts_with_atg_and_m = (
    set(uniprot_dna_starts_with_atg) & set(uniprot_protein_starts_with_m)
)
len(uniprot_starts_with_atg_and_m)

In [None]:
uniprot_starts_with_atg_and_m_list = list(uniprot_starts_with_atg_and_m)
uniprot_starts_with_atg_and_m_list[:5]

In [None]:
! grep -c '>' $cdna_fasta

In [None]:
! grep -c '>' $protein_fasta

In [None]:
uniprot_dna_starts_with_atg[:3]

## Write good uniprot ids to file

In [None]:
# Keep every cDNA record whose short UniProt id passed both start filters
good_uniprot_records = []
with screed.open(cdna_fasta) as records:
    for record in records:
        id_fields = record['name'].split('|')[:2]
        clean_uniprot_id = '|'.join(id_fields)
        if clean_uniprot_id not in uniprot_starts_with_atg_and_m:
            continue
        good_uniprot_records.append(record)
len(good_uniprot_records)

In [None]:
good_uniprot_records[:3]

In [None]:
record['name']

In [None]:
# Output FASTA holding the filtered records (name and sequence of each entry
# in good_uniprot_records, which were read from the cDNA FASTA)
protein_fasta_good_uniprot_ids = os.path.join(
    QFO_EUKARYOTA_FOLDER,
    "UP000005640_9606_DNA__startswith_atg_and_protein_startswith_m.fasta",
)

with open(protein_fasta_good_uniprot_ids, "w") as f:
    for record in good_uniprot_records:
        # One header line and one sequence line per record
        print(f'>{record["name"]}', record["sequence"], sep="\n", file=f)

In [None]:
# Short UniProt id -> sequence lookup.
# NOTE(review): when several records share the same short id, the last one in
# good_uniprot_records is the one kept.
good_uniprot_records_dict = dict(
    ('|'.join(r['name'].split('|')[:2]), r['sequence'])
    for r in good_uniprot_records
)
len(good_uniprot_records_dict)

In [None]:
good_uniprot_records_series = pd.Series(good_uniprot_records_dict)

In [None]:
good_uniprot_records_dict['tr|A0A024R1R8']

In [None]:
uniprot_dna_starts_with_atg[:3]

### Grep dna fasta for the sequence

In [None]:
! grep -A 1 'sp|A0A075B6K2|ENSP00000374848' $cdna_fasta

In [None]:
! zgrep -A 3 'read1000/sp|A0A075B6K2|ENSP00000374848;mate1:5-154;mate2:37-186' $SIMULATED_RNASEQ_FOLDER/*.fq.gz

## Get read IDs of reads that don't have an `N`

In [None]:

# Names of the simulated reads whose sequence contains no "N"
with screed.open(SIMULATED_READS_FASTQ) as records:
    read_ids_without_n = [
        record["name"] for record in records if "N" not in record["sequence"]
    ]
print(len(read_ids_without_n))
read_ids_without_n[:3]

# Infer reading frame from read start -- assume every source coding sequence starts with ATG

## Hamming distance function

from http://claresloggett.github.io/python_workshops/improved_hammingdist.html

In [None]:
def hamming_distance(string1, string2):
    """Count the positions at which ``string1`` and ``string2`` differ.

    The two strings should be the same length. Every index of ``string1`` is
    compared, so a shorter ``string2`` raises ``IndexError``, while the extra
    trailing characters of a longer ``string2`` are never looked at.
    """
    return sum(
        1
        for position in range(len(string1))
        if string1[position] != string2[position]
    )


# Reverse complement

old_chars = "ACGT"
replace_chars = "TGCA"
# Translation table sending each base in old_chars to the base at the same
# position in replace_chars
tab = str.maketrans(old_chars, replace_chars)


def reverse_complement(sequence):
    """Return ``sequence`` with A/C/G/T complemented and the order reversed.

    Characters that are not in the translation table are kept as they are.
    """
    complemented = sequence.translate(tab)
    return "".join(reversed(complemented))

def rev_compl(st):
    """Reverse-complement ``st`` using an explicit base-pairing lookup.

    Raises ``KeyError`` for any character other than A, C, G or T.
    """
    complement_of = dict(zip("ACGT", "TGCA"))
    return "".join([complement_of[base] for base in st[::-1]])

### Read 51 is coding in a negative frame, and comes from mate2!

```
@read51/sp|A0A024RBG1|ENSP00000492425;mate1:130-279;mate2:281-430
GCTTTTCCAGATACTCTGCATGTACAGGTTTATGACACTGGAGAACTTTGATAGCATCTTCTACTTTGAACCACTCTCTCTTCCTTCCAATATTAACAGAATCTTCCCAATCTTCTAATATTTCAGTGACTGTTAGAACATAAACATATG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
```

In [None]:
# mate2:281-430 is 1-based and inclusive (150 bases), which is the 0-based
# half-open slice [280:430]; stopping at 429 would leave the last base unchecked.
hamming_distance(good_uniprot_records_dict['sp|A0A024RBG1'][280:430],
                 'GCTTTTCCAGATACTCTGCATGTACAGGTTTATGACACTGGAGAACTTTGATAGCATCTTCTACTTTGAACCACTCTCTCTTCCTTCCAATATTAACAGAATCTTCCCAATCTTCTAATATTTCAGTGACTGTTAGAACATAAACATATG')

### Read 52 is coding in positive frame

```
@read52/sp|A0A024RBG1|ENSP00000492425;mate1:125-274;mate2:193-342
ACCCAGACCAGTGGATTGTCCCAGGAGGAGGAATGGAACCCGAGGAGGAACCTGGCGGTGCTGCCGTGAGGGAAGTTTATGAGGAGGCTGGAGTCAAAGGAAAACTAGGCAGACTTCTGGGCATATTTGAGCAGAACCAAGACCGAAAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
```


In [None]:
# mate1:125-274 is 1-based and inclusive (150 bases), which is the 0-based
# half-open slice [124:274]; stopping at 273 would leave the last base unchecked.
hamming_distance(good_uniprot_records_dict['sp|A0A024RBG1'][124:274],
                 'ACCCAGACCAGTGGATTGTCCCAGGAGGAGGAATGGAACCCGAGGAGGAACCTGGCGGTGCTGCCGTGAGGGAAGTTTATGAGGAGGCTGGAGTCAAAGGAAAACTAGGCAGACTTCTGGGCATATTTGAGCAGAACCAAGACCGAAAGC')

## Function to actually infer translation frame

In [None]:
# %%time

# Captures the four mate coordinates embedded in a simulated read name such as
# "...;mate1:130-279;mate2:281-430". Written as a raw string so the backslash in
# "\d" is passed to the regex engine instead of being an invalid string escape.
# https://regex101.com/r/WNtXD8/1/
interval_patterns = r'mate1:(?P<mate1_start>\d+)-(\d+);mate2:(\d+)-(\d+)'



def get_strand(canonical_seq, record_seq, max_mismatches=10):
    """Decide whether a read matches a reference segment directly or reverse-complemented.

    Parameters
    ----------
    canonical_seq : str
        Segment of the reference sequence the read is compared against.
    record_seq : str
        Sequence of the read.
    max_mismatches : int, optional
        Largest Hamming distance that still counts as a match (default 10).

    Returns
    -------
    int or None
        ``1`` when ``record_seq`` matches ``canonical_seq`` as-is, ``-1`` when
        its reverse complement matches, and ``None`` when neither orientation
        matches or the two sequences differ in length.
    """
    # hamming_distance only walks the indices of its first argument, so without
    # this check a read longer than the segment would be judged on its prefix.
    if len(canonical_seq) != len(record_seq):
        # Lengths don't match, ignore this read
        return None

    if hamming_distance(canonical_seq, record_seq) <= max_mismatches:
        return 1

    # Too many mismatches as-is: make sure it's really the reverse complement
    revcomp = reverse_complement(record_seq)
    if hamming_distance(canonical_seq, revcomp) <= max_mismatches:
        return -1
    return None


def _frame_number_from_start(start):
    """Map a 0-based start offset within the reference sequence to a frame number.

    Offsets that are 0, 1 and 2 bases past a multiple of three give frame
    numbers 1, 3 and 2 respectively.
    """
    return 3 - ((start - 1) % 3)


def get_correct_reading_frame(record, required_length=150, verbose=False):
    """Infer the translation frame of a simulated read from the coordinates in its name.

    The read name is expected to look like
    ``read51/sp|A0A024RBG1|ENSP00000492425;mate1:130-279;mate2:281-430``, where
    the mate coordinates are 1-based. The read sequence is compared against the
    mate1 segment of the matching entry in ``good_uniprot_records_dict`` and,
    if that fails, against the mate2 segment; each comparison is tried as-is and
    reverse-complemented (see ``get_strand``).

    Parameters
    ----------
    record : mapping
        Read with a ``'name'`` and a ``'sequence'`` entry.
    required_length : int, optional
        Length that both mate segments cut from the reference must have.
    verbose : bool or int, optional
        Any truthy value prints progress; values above 1 also print sequences.

    Returns
    -------
    int or None
        ``1`` when the name contains ``'mate1Start'``. Otherwise the frame
        number (1, 2 or 3) multiplied by the strand (1 or -1), or ``None`` when
        the name holds no parsable coordinates, the id is not in
        ``good_uniprot_records_dict``, a mate extends past the end of the
        reference, or the read matches neither mate in either orientation.

    Raises
    ------
    AssertionError
        If a mate segment cut from the reference is not ``required_length`` long.
    """
    name = record['name']
    if 'mate1Start' in name:
        return 1

    matches = re.findall(interval_patterns, name)
    if not matches:
        # Read id has negative values and otherwise doesn't match my mental model --> ignore
        return None
    start1, end1, start2, end2 = (int(coordinate) for coordinate in matches[0])
    # The fastq file uses 1-based indexing for the start/stop but python is
    # 0-based: shift the starts down by one and keep the stops as the
    # exclusive end of the slice.
    start1 -= 1
    start2 -= 1

    uniprot_id = '|'.join(name.split(';')[0].split('/')[-1].split('|')[:2])
    if uniprot_id not in good_uniprot_records_dict:
        # Uniprot record doesn't have clear start/stop site, so difficult to infer frame --> skip
        return None
    canonical_sequence = good_uniprot_records_dict[uniprot_id]

    canonical_length = len(canonical_sequence)
    if end1 > canonical_length or end2 > canonical_length:
        # Read extends past the boundary of the source sequence --> skip
        return None

    mate1 = canonical_sequence[start1:end1]
    mate2 = canonical_sequence[start2:end2]
    assert len(mate1) == required_length
    assert len(mate2) == required_length

    if verbose:
        print(name)
        print(f'start1: {start1} -- end1: {end1}')
        print(f'start2: {start2} -- end2: {end2}')

    if verbose > 1:
        print(f'>mate1\n{mate1}')
        print(f'>mate1_rc\n{reverse_complement(mate1)}')
        print(f'>mate2\n{mate2}')
        print(f'>mate2_rc\n{reverse_complement(mate2)}')

    frame_number = _frame_number_from_start(start1)
    if verbose > 1:
        print(f'{frame_number} = 3 - (({start1} - 1) % 3)')

    record_seq = record['sequence']
    if verbose > 1:
        print(f'>record\n{record_seq}')
        print(f'>record_rc\n{reverse_complement(record_seq)}')

    if verbose:
        print('--- Trying mate 1 ---')
    strand = get_strand(mate1, record_seq)
    if verbose and strand is not None:
        if strand > 0:
            print('mate1')
        if strand < 0:
            print('mate1, reverse complement')

    if strand is None:
        if verbose:
            print('--- Not mate1, trying mate 2 ---')
        # Maybe it's mate2?
        strand = get_strand(mate2, record_seq)
        frame_number = _frame_number_from_start(start2)

        if verbose and strand is not None:
            print(f'{frame_number} = 3 - (({start2} - 1) % 3)')
            if strand > 0:
                print('mate2')
            if strand < 0:
                print('mate2, reverse complement')

    if strand is None:
        # Strand is still none, don't know what's going on so skip this read
        return None

    # Multiply the frame number by the strand multiplier
    frame = frame_number * strand
    if verbose:
        print(f'{frame} = {frame_number} * {strand}')
    return frame
            

def fastq_per_read_frame(fastq, verbose=False, required_length=150):
    """Assign a translation frame to every read of a FASTQ file.

    Parameters
    ----------
    fastq : str
        Path of the FASTQ file, opened with ``screed``.
    verbose : bool or int, optional
        Passed on to ``get_correct_reading_frame``; any truthy value also
        prints a separator and the inferred frame for each read.
    required_length : int, optional
        Passed on to ``get_correct_reading_frame`` (default 150).

    Returns
    -------
    pandas.Series
        Series named ``translation_frame``, indexed by read name. Reads for
        which ``get_correct_reading_frame`` returns ``None`` are left out.
    """
    read_id_to_frame = {}
    with screed.open(fastq) as records:
        for record in tqdm(records):
            if verbose:
                print('\n---')
            frame = get_correct_reading_frame(
                record, required_length=required_length, verbose=verbose
            )
            if verbose:
                print(f'frame: {frame}')
            if frame is not None:
                read_id_to_frame[record['name']] = frame

    read_id_to_frame_series = pd.Series(read_id_to_frame, name='translation_frame')
    # Report how many reads received a frame
    print(read_id_to_frame_series.shape)
    return read_id_to_frame_series

## Make mini fastq for testing

In [None]:
# for read_id in protein_k11_good_uniprot_ids_no_ns_coding.sample(5).read_id.values:

#     ! zgrep -A 3 "$read_id" $reads_dir/*

In [None]:
%%file mini.fastq
@read51/sp|A0A024RBG1|ENSP00000492425;mate1:130-279;mate2:281-430__frame=-3
GCTTTTCCAGATACTCTGCATGTACAGGTTTATGACACTGGAGAACTTTGATAGCATCTTCTACTTTGAACCACTCTCTCTTCCTTCCAATATTAACAGAATCTTCCCAATCTTCTAATATTTCAGTGACTGTTAGAACATAAACATATG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read52/sp|A0A024RBG1|ENSP00000492425;mate1:125-274;mate2:193-342__frame=3
ACCCAGACCAGTGGATTGTCCCAGGAGGAGGAATGGAACCCGAGGAGGAACCTGGCGGTGCTGCCGTGAGGGAAGTTTATGAGGAGGCTGGAGTCAAAGGAAAACTAGGCAGACTTCTGGGCATATTTGAGCAGAACCAAGACCGAAAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read302822/sp|P49760|ENSP00000460443;mate1:755-904;mate2:890-1039__frame=3
ATTTCCTCAAAGACAACAACTACCTGCCCTACCCCATCCACCAAGTGCGCCACATGGCCTTCCAGCTGTGCCAGGCTGTCAAGTTCCTCCATGATAACAAGCTGACACATACAGACCTCAAGCCTGAAAATATTCTGATTGTGAATTCAG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read780376/sp|Q96PL2|ENSP00000494896;mate1:747-896;mate2:829-978__frame=2
CCNGTTCCAGAACATCCCCAAACTCTCCAAGGTGTGGTTACACTGTGAGACGTTCATCTGCGACAGTGAGAAACTCTCCTGCCCAGTGACCTGCGATAAACGGAAGCGCCTCCTGCGAGACCAGACCGGGGGAGTCCTGGTCGTGGAGCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read192484/sp|P09629|ENSP00000239165;mate1:19-168;mate2:125-274__frame=1
GCGAATACTTTATTTTCTAAATATCCAGCCTCAAGTTCGGTTTTCGCTACCGGAGCCTTCCCAGAACAAACTTCTTGTGCGTTTGCTTCCAACCCCCAGCGCCCGGGCTATGGAGCGGGTTCGGGCGCTTCCTTCGCCGCCTCGATGCAG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read335141/sp|P60368|ENSP00000375479;mate1:141-290;mate2:251-400__frame=2
CACCCCAGTGAGCTGTGTGTCCAGCCCCTGCTGCCAGGCGGCCTGTGAGCCCAGCGCCTGCCAATCAGGCTGCACCAGCTCCTGCACGCCCTCGTGCTGCCAGCAGTCTAGCTGCCAGCCGGCTTGCTGCACCTCCTCCCCCTGCCAGCA
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read494460/sp|Q5VWX1|ENSP00000281156;mate1:81-230;mate2:193-342__frame=2
TTTGGCAGAAGAAATTGAAAAGTTTCAAGGTTCTGATGGAAAAAAGGAAGACGAAGAAAAGAAGTATCTTGATGTCATCAGCAACAAAAACATAAAGCTCTCAGAAAGAGTACTGATTCCTGTCAAGCAGTATCCAAAGTTCAATTTTGT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read734191/sp|Q96A65|ENSP00000376868;mate1:842-991;mate2:949-1098__frame=-1
AGTGTCCTGCAGGTATCCCAGGACCACAGAGTGTGCAGCGGCTACAGCATTAAACTTGTCAAACAGTAACTCCAGCAGTTCTAGAAGCAACCTTGGTTGGTTCTCCACAGTAACGTTCTCCCCCCGCTGATAGCCACTGTCTGCCACCTG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read640286/sp|Q8N7Q2|BAC05176;mate1:30-179;mate2:140-289__frame=?
TTTGGCCAACTTCGCCTCTTCAATTAAAAGGACACATGCTGTTAACGGGTGCTGTGGATTACAGATGATCGCACTCTGGGCACAGTCCTCTGGAAATGCAGATGCCCGTGTGGAGGAAATTCTGGCGGGAGAGGAGCGGCGACTCGCCGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read325914/sp|P56278|ENSP00000358488;mate1:12-161;mate2:125-274__frame=2
GGATGTGGGGGCTCCACCCGATCACCTCTGGGTTCACCAAGAGGGTATCTACCGCGACGAATACCAGCGCACGTGGGTGGCCGTCGTGGAAGAGGAGACGAGTTTCCTAAGGGCACGAGTCCAGCAAATTCAGGTTCCCTTAGGTGACGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read714894/sp|Q8WYR1|ENSP00000485280;mate1:570-719;mate2:662-811__frame=2
GAGCCAGACGCCCTCACCCCCGACAGACTCCCCTAGGCACGCCAGCCCTGGAGAGCTGGGCACCACCCCATGGGAGGAGAGCACCAATGACATCTCCCACTACCTCGGCATGCTGGACCCCTGGTATGAGCGCAATGTACTGGGCCTCAT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII


### Test on mini dataset

In [None]:
mini_results = fastq_per_read_frame('mini.fastq', verbose=2)
mini_results

## Make ground truth dataframe for testing

In [None]:
# Hand-curated, tab-separated table for the mini dataset: which mate each read
# comes from, its true frame and the frame that was guessed for it.
s = '''read_id	true_mate	true_frame	guessed_frame
read51/sp|A0A024RBG1|ENSP00000492425;mate1:130-279;mate2:281-430	mate2_rc	-3	-3
read52/sp|A0A024RBG1|ENSP00000492425;mate1:125-274;mate2:193-342	mate1	3	2
read302822/sp|P49760|ENSP00000460443;mate1:755-904;mate2:890-1039	mate1	3	2
read780376/sp|Q96PL2|ENSP00000494896;mate1:747-896;mate2:829-978	mate1	2	3
read192484/sp|P09629|ENSP00000239165;mate1:19-168;mate2:125-274	mate1	1	1
read335141/sp|P60368|ENSP00000375479;mate1:141-290;mate2:251-400	mate1	2	3
read494460/sp|Q5VWX1|ENSP00000281156;mate1:81-230;mate2:193-342	mate1	2	3
read734191/sp|Q96A65|ENSP00000376868;mate1:842-991;mate2:949-1098	mate2_rc	-1	-2
read640286/sp|Q8N7Q2|BAC05176;mate1:30-179;mate2:140-289	mate1	3	2
read325914/sp|P56278|ENSP00000358488;mate1:12-161;mate2:125-274	mate1	2	3
'''
mini_df = pd.read_csv(StringIO(s), sep='\t')
mini_df

In [None]:
# Transcript id: the part of the read id between the last "/" and the first ";"
mini_df['transcript_id'] = [
    read_id.split(';')[0].split('/')[-1] for read_id in mini_df.read_id
]
# Short UniProt id: the first two "|"-separated fields of the transcript id
mini_df['uniprot_id'] = [
    '|'.join(transcript_id.split('|')[:2]) for transcript_id in mini_df.transcript_id
]
mini_df

In [None]:
# Print the reference sequence stored for each read of the mini table
for read_id, uniprot_id in zip(mini_df['read_id'], mini_df['uniprot_id']):
    print(f'\n---\n{read_id}')
    print(good_uniprot_records_dict[uniprot_id])

### Spot check some reading frames

# Run code to assign correct reading frame to all read ids

In [None]:
# `fastq` was never assigned anywhere in the notebook, so this cell failed on a
# fresh kernel. NOTE(review): SIMULATED_READS_FASTQ is the simulated reads file
# already read above; confirm it is the intended input.
fastq = SIMULATED_READS_FASTQ
fastq

In [None]:
%%time
read_id_to_frame_series = fastq_per_read_frame(fastq)
print(read_id_to_frame_series.shape)
read_id_to_frame_series.head()

In [None]:
read_id_to_frame_series.head()

## Write correct reading frames to file!

In [None]:
# Save the read id -> translation frame series, with the read ids (index) and
# the column header written out
csv = os.path.join(ORPHEUM_BENCHMARKING_FOLDER, "correct_reading_frames.csv")
read_id_to_frame_series.to_csv(csv, header=True, index=True)

# Create gold standard classification data for all reading frames

## Read gold standard series

In [None]:


# Name the index so that reset_index() produces a `read_id` column
read_id_to_frame_series.index.name = 'read_id'

# One row per read with a known frame, keyed by "<read_id>__frame=<frame>"
read_id_to_frame = (
    read_id_to_frame_series
    .reset_index()
    .assign(
        is_coding=True,
        read_id_frame=lambda frame_table: (
            frame_table.read_id.astype(str)
            + '__frame='
            + frame_table.translation_frame.astype(str)
        ),
    )
    .set_index('read_id_frame')
)
print(read_id_to_frame.shape)
read_id_to_frame.head()

## Make cartesian product of read id and frames with `itertools`

In [None]:
frames = (1, 2, 3, -1, -2, -3)
# Every read id paired with every frame, read ids in the outer loop
all_read_id_frames = [
    f"{read_id}__frame={frame}"
    for read_id in read_id_to_frame["read_id"]
    for frame in frames
]
len(all_read_id_frames)

## Make true coding frame series

In [None]:
# Start with every (read id, frame) pair marked as non-coding ...
true_coding_frame = pd.Series(False, index=all_read_id_frames, name='is_coding')
# ... then flag the pairs listed in the index of read_id_to_frame
true_coding_frame.loc[read_id_to_frame.index] = True
true_coding_frame.sum()

In [None]:
true_coding_frame.head()

## Write to file

In [None]:
basename = "true_reading_frames"

# The same table is written twice, as parquet and as csv, into the ground truth folder
parquet, csv = (
    os.path.join(ORPHEUM_GROUND_TRUTH_FOLDER, f"{basename}.{extension}")
    for extension in ("parquet", "csv")
)

true_coding_frame.to_frame().to_parquet(parquet)
true_coding_frame.to_csv(csv)

In [None]:
true_coding_frame.head()