# Edit Distance Alignment

Link: https://rosalind.info/problems/edta/

In [2]:
:dep ndarray = { version = "0.15.6" }

In [3]:
use std::fs::File;
use std::io::{BufReader, BufRead};
use std::collections::HashMap;
use ndarray::prelude::*;
use std::cmp::Ordering;
use std::cmp;

In [48]:
/// One cell of the edit-distance table, linked back to the cell it was derived from.
///
/// Equality and ordering for this type are implemented by hand further down in
/// this file and look only at `value`; the other fields are ignored when two
/// traces are compared.
#[derive(Debug, Clone, Eq)] 
struct Trace {
    /// Edit-distance score stored in this cell.
    value: i32,
    /// Characters (first sequence, second sequence) this cell was built from.
    /// `' '` is used as a placeholder where a sequence contributes no character.
    amino_acids: (char, char),
    /// Which neighbouring cell this one was derived from, if any.
    direction: Option<Direction>,
    /// Owned copy of the predecessor cell (which in turn owns its own predecessor).
    from: Option<Box<Trace>>,
}

/// Neighbouring table cell that a [`Trace`] was derived from.
#[derive(Debug, Clone, Eq, PartialEq)]
enum Direction {
    /// Same row, previous column.
    Left,
    /// Previous row, same column.
    Top,
    /// Previous row and previous column.
    Diagonal
}

impl PartialEq for Trace {
    fn eq(&self, other: &Self) -> bool {
        self.value == other.value
    }
}

/// Partial ordering by score; always `Some`, because it defers to the total
/// ordering defined by the `Ord` implementation for `Trace`.
impl PartialOrd for Trace {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

/// Total ordering by score only.
impl Ord for Trace {
    fn cmp(&self, other: &Self) -> Ordering {
        let (lhs, rhs) = (self.value, other.value);
        i32::cmp(&lhs, &rhs)
    }
}

impl Trace {
    
    fn new(value: i32, amino_acids: (char, char), direction: Option<Direction>, from: Option<Box<Trace>>) -> Trace {
        Trace {value, amino_acids, direction, from}
    }
    
    fn get_value(&self) -> i32 {
        self.value
    }
    
    fn get_direction(&self) -> Option<Direction> {
        self.direction.clone()
    }
    
    fn get_from(&self) -> Option<Box<Trace>> {
        self.from.clone()
    }
    
    fn get_amino_acids(&self) -> (char, char) {
        self.amino_acids
    }
    
}

/// A protein sequence held as plain text.
#[derive(Debug)] 
pub struct Protein {
    /// The sequence characters, concatenated without separators.
    seq: String,
}

impl Protein {
    
    fn len(&self) -> i32 {
        self.seq.len() as i32
    }
    
    fn push_str(&mut self, s: &str) {
        self.seq.push_str(s);
    }
    
    fn align(seq_1: &mut String, seq_2: &mut String, trace_object: &Trace) -> () {
        let (amino_acid_1, amino_acid_2) = trace_object.get_amino_acids();
        if (amino_acid_1 != ' ') & (amino_acid_2 != ' ') {
            match trace_object.get_direction() {
            Some(Direction::Left) => {
                seq_1.insert_str(0, "-");
                seq_2.insert_str(0, &amino_acid_2.to_string());
            },
            Some(Direction::Top) => {
                seq_1.insert_str(0, &amino_acid_1.to_string());
                seq_2.insert_str(0, "-");
            },
            _ => {
                seq_1.insert_str(0, &amino_acid_1.to_string());
                seq_2.insert_str(0, &amino_acid_2.to_string());
            },
        }
    }
    }
    
    fn optimal_alignment(&self, other: &Self) -> (i32, String, String) {
        let mut seq_1 = "".to_string();
        let mut seq_2 = "".to_string();
        let matrix = self.edit_distance_matrix(other);
        let mut trace_object = matrix[(self.len() as usize, other.len() as usize)].clone();
        //println!("{:#?}", trace_object);
        let edit_distance = trace_object.get_value();
        Self::align(&mut seq_1, &mut seq_2, &trace_object);
        while Option::is_some(&trace_object.get_from()) {
            trace_object = *trace_object.get_from().unwrap();
            Self::align(&mut seq_1, &mut seq_2, &trace_object);
        }
        
        (edit_distance, seq_1, seq_2)
    }
    
    fn edit_distance_matrix(&self, other: &Self) -> Array2<Trace> {
        let mut seq_1 = self.seq.clone();
        seq_1.insert(0, ' ');
        let mut seq_2 = other.seq.clone();
        seq_2.insert(0, ' ');
        let start = Trace {value: 0, amino_acids: (' ', ' '), direction: None, from: None};
        let vec_for_array_initialization = vec![start; seq_1.len() * seq_2.len()];
        let mut matrix = Array::from_shape_vec((seq_1.len(), seq_2.len()), vec_for_array_initialization).unwrap();
        for (i, x) in seq_1.chars().enumerate() {
            for (j, y) in seq_2.chars().enumerate() {
                let trace_object;
                let direction;
                if (x == ' ') & (y != ' ') {
                    direction = None;
                    trace_object = Trace::new(j as i32, (x, y), direction, None);
                    matrix[(i, j)] = trace_object;
                } else if (y == ' ') & (x != ' ') {
                    direction = None;
                    trace_object = Trace::new(i as i32, (x, y), direction, None);
                    matrix[(i, j)] = trace_object;
                } else {
                    if (x == ' ') & (y == ' ') { continue; }
                    else {
                        let left = &matrix[(i, j-1)];
                        let top = &matrix[(i-1, j)];
                        let diagonal = &matrix[(i-1, j-1)];
                        let mut neighbors = vec![left, top, diagonal];
                        neighbors.sort();
                        if left == neighbors[0] {
                            direction = Some(Direction::Left);
                        } else if top == neighbors[0] {
                            direction = Some(Direction::Top);
                        } else {
                            direction = Some(Direction::Diagonal);
                        }
                        if x == y {
                            let trace_object = Trace::new(diagonal.get_value(), (x, y), Some(Direction::Diagonal), Some(Box::new(diagonal.clone())));
                            matrix[(i, j)] = trace_object;
                        } else {
                            let trace_object = Trace::new(neighbors[0].get_value()+1, (x, y), direction, Some(Box::new(neighbors[0].clone())));
                            matrix[(i, j)] = trace_object;
                        } 
                    }
                }
            }
        }
        matrix
    }
    
    
}

In [49]:
/// Reads a FASTA file into a map from record id to [`Protein`].
///
/// A line starting with `>` begins a new record; its id is the rest of the
/// line with all leading `>` characters removed. Every other line is appended
/// to the sequence of the most recent id (the empty id if no header has been
/// seen yet).
///
/// # Panics
///
/// Panics with "Invalid filepath" if the file cannot be opened, and panics if
/// a line cannot be read.
fn read_fasta(file_path: &str) -> HashMap<String, Protein> {
    let mut records: HashMap<String, Protein> = HashMap::new();
    let handle = File::open(file_path).expect("Invalid filepath");

    let mut current_id = String::new();
    for row in BufReader::new(handle).lines() {
        let row = row.unwrap();
        match row.starts_with('>') {
            true => {
                current_id = row.trim_start_matches('>').to_string();
            }
            false => {
                records
                    .entry(current_id.clone())
                    .or_insert_with(|| Protein { seq: String::new() })
                    .push_str(&row);
            }
        }
    }

    records
}

In [39]:
// Parse the FASTA file and keep only the sequences. `HashMap` iteration order
// is unspecified, so the order of the proteins in `data` is arbitrary.
let data: Vec<Protein> = read_fasta("data/rosalind_edta.txt")
    .into_values()
    .collect();

In [50]:
// Small worked example: align "PRETTY" against "PRTTEIN" and display the
// (edit distance, aligned first sequence, aligned second sequence) tuple.
Protein{seq:"PRETTY".to_string()}.optimal_alignment(&Protein{seq:"PRTTEIN".to_string()})

(4, "PRETTY--", "PR-TTEIN")