# Open Reading Frames

Link: https://rosalind.info/problems/orf/

In [12]:
use std::fs::File;
use std::io::{BufReader, BufRead};
use std::collections::{HashMap, HashSet};

In [3]:
/// Loads a codon -> amino-acid lookup table from a whitespace-separated text file.
///
/// Only the first two whitespace-separated fields of each line are used: the
/// first becomes the key and the second the value. Lines with fewer than two
/// fields (for example blank lines) are skipped rather than triggering an
/// out-of-bounds panic. If the same key appears more than once, the last
/// occurrence wins.
///
/// # Panics
///
/// Panics with "Invalid filepath" if `file_path` cannot be opened, and panics
/// if a line cannot be read from the file.
fn read_codon_table(file_path: &str) -> HashMap<String, String> {
    let file = File::open(file_path).expect("Invalid filepath");
    let reader = BufReader::new(file);
    let mut codon_to_protein_map = HashMap::new();

    for line in reader.lines() {
        let line = line.unwrap();
        let mut fields = line.split_whitespace();
        // Both fields must be present; anything shorter is not a table entry.
        if let (Some(codon), Some(protein)) = (fields.next(), fields.next()) {
            codon_to_protein_map.insert(codon.to_string(), protein.to_string());
        }
    }

    codon_to_protein_map
}

In [4]:
/// Text of an RNA sequence, as produced by `DNASeq::transcribe`.
#[derive(Debug)] 
pub struct RNASeq {
    // Sequence characters; `translate` consumes them three at a time.
    seq: String,
}

/// Result of `RNASeq::translate`: the concatenated codon-table values for each
/// complete three-character group of the RNA sequence.
#[derive(Debug)] 
pub struct Protein {
    // Translated text, built by appending one table value per codon.
    seq: String,
}

/// Text of a DNA sequence, accumulated line by line when reading a FASTA file.
#[derive(Debug)] 
pub struct DNASeq {
    // Sequence characters; `complement` swaps A/T and G/C and leaves others as-is.
    seq: String,
}

impl DNASeq {
    /// Appends `s` to the end of the stored sequence.
    pub fn push_str(&mut self, s: &str) {
        self.seq += s;
    }

    /// Returns a new sequence whose characters are in the opposite order.
    pub fn reverse(&self) -> DNASeq {
        let mut reversed = String::with_capacity(self.seq.len());
        for base in self.seq.chars().rev() {
            reversed.push(base);
        }
        DNASeq { seq: reversed }
    }

    /// Returns a new sequence with `A`<->`T` and `G`<->`C` swapped.
    ///
    /// Any other character is copied through unchanged.
    pub fn complement(&self) -> DNASeq {
        let mut paired = String::with_capacity(self.seq.len());
        for base in self.seq.chars() {
            let partner = match base {
                'A' => 'T',
                'T' => 'A',
                'G' => 'C',
                'C' => 'G',
                other => other,
            };
            paired.push(partner);
        }
        DNASeq { seq: paired }
    }

    /// Transcribes the DNA into RNA by turning every `T` into `U`.
    ///
    /// All other characters are kept as they are.
    pub fn transcribe(&self) -> RNASeq {
        RNASeq {
            seq: self.seq.replace('T', "U"),
        }
    }
}


impl RNASeq {
    /// Returns three copies of the sequence starting at byte offsets 0, 1 and 2
    /// (the three reading frames of this strand), in that order.
    ///
    /// When the sequence is too short for an offset (or the offset does not
    /// fall on a character boundary) that frame is an empty sequence instead
    /// of causing a slice-out-of-range panic.
    pub fn shift(&self) -> Vec<RNASeq> {
        (0..3)
            .map(|offset| RNASeq {
                // Checked slicing: `get` yields `None` where `[offset..]` would panic.
                seq: self.seq.get(offset..).unwrap_or("").to_string(),
            })
            .collect()
    }

    /// Translates the sequence three characters at a time using the table
    /// loaded from `data/rna_codon_table.txt`.
    ///
    /// Trailing characters that do not fill a complete three-character codon
    /// are ignored. The table values are appended verbatim, one per codon.
    ///
    /// # Panics
    ///
    /// Panics if a codon is not present in the table (the message names the
    /// offending codon), or if the table file cannot be read.
    pub fn translate(&self) -> Protein {
        let codon_to_protein_map = read_codon_table("data/rna_codon_table.txt");
        let bases: Vec<char> = self.seq.chars().collect();
        let mut protein = String::with_capacity(bases.len() / 3);

        for triplet in bases.chunks_exact(3) {
            let codon: String = triplet.iter().collect();
            match codon_to_protein_map.get(&codon) {
                Some(amino_acid) => protein.push_str(amino_acid),
                None => panic!("codon {:?} is missing from the codon table", codon),
            }
        }

        Protein { seq: protein }
    }
}

In [5]:
/// Reads a FASTA file into a map from record identifier to its DNA sequence.
///
/// A line starting with `>` begins a new record; its identifier is the rest of
/// that line with the leading `>` characters removed. Every following
/// non-header line is appended to the current record's sequence, so sequences
/// split over several lines are joined.
///
/// Sequence lines are trimmed of surrounding whitespace, and lines that are
/// empty after trimming are skipped. This prevents stray blank lines from
/// creating empty records or embedding whitespace inside a sequence. Sequence
/// lines that appear before any header are stored under the empty identifier.
/// A header that is not followed by any sequence line produces no entry.
///
/// # Panics
///
/// Panics with "Invalid filepath" if `file_path` cannot be opened, and panics
/// if a line cannot be read from the file.
fn read_fasta(file_path: &str) -> HashMap<String, DNASeq> {
    let file = File::open(file_path).expect("Invalid filepath");
    let reader = BufReader::new(file);

    let mut data: HashMap<String, DNASeq> = HashMap::new();
    let mut seq_id = String::new();

    for line in reader.lines() {
        let line = line.unwrap();
        if line.starts_with('>') {
            seq_id = line.trim_start_matches('>').to_string();
            continue;
        }

        let residues = line.trim();
        if residues.is_empty() {
            // Blank separator lines carry no sequence data.
            continue;
        }

        data.entry(seq_id.clone())
            .or_insert_with(|| DNASeq { seq: String::new() })
            .push_str(residues);
    }

    data
}

In [16]:
// Load every record of the Rosalind ORF input.
let data: HashMap<String, DNASeq> = read_fasta("data/rosalind_orf.txt");

for forward in data.values() {
    // Candidate proteins can be read from either strand, so transcribe both
    // the sequence as given and its reverse complement.
    let reverse = forward.reverse().complement();
    let forward_rna = forward.transcribe();
    let reverse_rna = reverse.transcribe();

    // A set, because the same protein string can be found in several frames.
    let mut orf_set = HashSet::new();

    // Three reading frames per strand, six in total, handled by one loop.
    for frame in forward_rna.shift().into_iter().chain(reverse_rna.shift()) {
        let protein = frame.translate();

        // Each 'M' opens a candidate that runs up to, but not including, the
        // first '*' after it. A start with no later '*' yields nothing.
        for (start, _) in protein.seq.match_indices('M') {
            if let Some(len) = protein.seq[start..].find('*') {
                orf_set.insert(protein.seq[start..start + len].to_string());
            }
        }
    }

    for orf in &orf_set {
        println!("{orf}");
    }
}

MGPCLRLVYQGVSPGVSGHSYQRAGSRQRYRGS
MYAVRSCRIYQCLVFHEQRVRFESP
MKFDQGSSFVYCQPSRSRWLFRAPFCRSTSPDSAVGCPPSGRSGHLPQGTRPDTLTAGKDPFTAGHTLKTLGCGIDQAKTIFSRMRR
MVICVV
MYRVNHPVAGLRGYVKAAVFCLYKGCIRNVVLESCVRLGFPPTQAVRRQRDLLI
MRR
MHGWGFVNRV
MC
MLYSKVASV
MGT
MCFRSRKRRMGT
MFQPF
MTCCEWVLACG
MNPRT
MHPL
MRQLLTAYMVICVV
MTVQCIV
MRE
MYPSFACATESTFL
MR
MNTQVRFRRAETSDSIYESPAVHISRKVQMNPRT


()