## IBS 4

n-gram Probability

Code created by <b>Dharshan Kumar K S</b>

<pre>
<u>n-grams:</u>

n-gram (in the context of Sequencing) means a sequence of ‘n’ kmers

Using these n-grams, we are finding the probability of occurrences of next k-mer in the nucleotide sequence
•	In bi-gram model, the probability of occurrence of a certain bi-gram over all the bi-grams will be calculated. 
(Prediction of a new kmer based on the last kmer)
•	In tri-gram model, the probability of occurrence of a certain tri-gram over all the tri-grams will be calculated. 
(Prediction of a new kmer based on the last 2 kmers)
•	In n-gram model, the probability of occurrence of a certain n-gram over all the n-grams will be calculated. 
(Prediction of a new kmer based on the last n-1 kmers)
</pre>

#### Generating n-grams of a list

In [1]:
//Reading the fasta file
var fs = require('fs')
fs.readFile('MN908947.txt', 'utf-8',(err,text) => {
    if (err) throw err;
    data=text;
})

In [2]:
//Splitting the different lines of sequences into different lists
var mseqnp  = data.split('\n');
console.log("Total number of MERS sequences in the dataset :",mseqnp.length-1)
console.log()
console.log(mseqnp)

Total number of MERS sequences in the dataset : 428

[ 'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA',
  'CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC',
  'TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG',
  'TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC',
  'CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC',
  'GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG',
  'CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT',
  'GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC',
  'GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT',
  'TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA',
  'GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG',
  'TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG',
  'CCCTGATGGCTACCCTCTTGAGTGCATTAAAG

### Generating k-mers

In [3]:
//Creating k-mers of sequences based on the input size k
function creating_kmers(k,seq){
    var tot=seq.length
    var l=0
    var kmers = new Array()
    while(l < tot){
        kmers.push(seq.substr(l,k))
        l = l+k+1
    }
    return kmers
}

In [4]:
//Creating k-mers of each sequence and storing it in a list
kmers = new Array()
no_of_kmers = new Array()
for(i=0;i<mseqnp.length;i++){
    seq=mseqnp[i]
    kmers.push(creating_kmers(5,seq))
    no_of_kmers.push(creating_kmers(5,seq).length)
}

mers_tokenized = kmers
console.log("Total number of MERS 5-mers : ",kmers.length*no_of_kmers[0])
console.log()
console.log(mers_tokenized)

Total number of MERS 5-mers :  5148

[ [ 'ATTAA',
    'GGTTT',
    'TACCT',
    'CCCAG',
    'TAACA',
    'ACCAA',
    'CAACT',
    'TCGAT',
    'TCTTG',
    'AGATC',
    'GTTCT',
    'TAAA' ],
  [ 'CGAAC',
    'TTAAA',
    'TCTGT',
    'TGGCT',
    'TCACT',
    'GGCTG',
    'ATGCT',
    'AGTGC',
    'CTCAC',
    'CAGTA',
    'AATTA',
    'TAAC' ],
  [ 'TAATT',
    'CTGTC',
    'TTGAC',
    'GGACA',
    'GAGTA',
    'CTCGT',
    'TATCT',
    'CTGCA',
    'GCTGC',
    'TACGG',
    'TTCGT',
    'CGTG' ],
  [ 'TTGCA',
    'CCGAT',
    'ATCAG',
    'ACATC',
    'AGGTT',
    'CGTCC',
    'GGTGT',
    'ACCGA',
    'AGGTA',
    'GATGG',
    'GAGCC',
    'TGTC' ],
  [ 'CCTGG',
    'TTCAA',
    'GAGAA',
    'ACACA',
    'GTCCA',
    'CTCAG',
    'TTGCC',
    'GTTTT',
    'CAGGT',
    'CGCGA',
    'GTGCT',
    'GTAC' ],
  [ 'GTGGC',
    'TTGGA',
    'ACTCC',
    'TGGAG',
    'AGGTC',
    'TATCA',
    'AGGCA',
    'GTCAA',
    'ATCTT',
    'AAGAT',
    'GCACT',
    'GTGG' ],
  [ 'CTTAG',
    'AGA

### Generating n-grams (unigram, bigram and trigram)

In [5]:
//Generalised function to generate n-grams of k-mers based on the input n
//Input-> List of kmers; Output-> List of ngrams
function n_grams(words_list, n=1){
    ngrams_list = new Array()
    for(num=0; num<words_list.length; num++){
        new_list = words_list
        ngram = (new_list.slice(num,num+n))
        ngrams_list.push(ngram)
    }
    return ngrams_list
}

In [6]:
//Input-> List of all the Lists of kmers; Output-> List of all ngrams
//Input-> List(List(kmers of seq1), List(kmers of seq2),..., List(kmers of seqn))
//Output-> List(ngrams of all kmer Lists)
function n_gram_generator(mers_tokenized,n=1){
    //Generating n-gram for each sequences
    ngram_list = new Array()
    for(i=0; i<mers_tokenized.length; i++){
        input_list = mers_tokenized[i]
        ngram_list.push(n_grams(input_list, n))
    }

    //Appending all the n-grams into a single Array
    ngram_lists = new Array()
    for(i=0; i<ngram_list.length; i++){
        s = ngram_list[i]
        for(j=0; j<s.length; j++){
            g = s[j]
            ngram_lists.push(g)
        }
    }
    return ngram_lists
}

In [7]:
//Function to remove ngrams  that don't form a complete ngram
//If n=3, but few ngrams will be formed as unigram or bigram
//To remove those ngrams less than the size of n
function ngram_remover(ngrams_list,n=1){
    new_ngrams_list = new Array()
    for(i=0; i<ngrams_list.length; i++){
        sub_gram = ngrams_list[i]
        if(sub_gram.length == n ){
            new_ngrams_list.push(sub_gram)
        }
    }
    return new_ngrams_list
}

In [8]:
//Unigrams of the sequence
unigrams = n_gram_generator(mers_tokenized,1)
console.log(unigrams.slice(0,30))

[ [ 'ATTAA' ],
  [ 'GGTTT' ],
  [ 'TACCT' ],
  [ 'CCCAG' ],
  [ 'TAACA' ],
  [ 'ACCAA' ],
  [ 'CAACT' ],
  [ 'TCGAT' ],
  [ 'TCTTG' ],
  [ 'AGATC' ],
  [ 'GTTCT' ],
  [ 'TAAA' ],
  [ 'CGAAC' ],
  [ 'TTAAA' ],
  [ 'TCTGT' ],
  [ 'TGGCT' ],
  [ 'TCACT' ],
  [ 'GGCTG' ],
  [ 'ATGCT' ],
  [ 'AGTGC' ],
  [ 'CTCAC' ],
  [ 'CAGTA' ],
  [ 'AATTA' ],
  [ 'TAAC' ],
  [ 'TAATT' ],
  [ 'CTGTC' ],
  [ 'TTGAC' ],
  [ 'GGACA' ],
  [ 'GAGTA' ],
  [ 'CTCGT' ] ]


In [9]:
//Bigrams of the sequence
bigrams = n_gram_generator(mers_tokenized,2)
new_bigrams = ngram_remover(bigrams,2)
console.log(new_bigrams.slice(0,30))

[ [ 'ATTAA', 'GGTTT' ],
  [ 'GGTTT', 'TACCT' ],
  [ 'TACCT', 'CCCAG' ],
  [ 'CCCAG', 'TAACA' ],
  [ 'TAACA', 'ACCAA' ],
  [ 'ACCAA', 'CAACT' ],
  [ 'CAACT', 'TCGAT' ],
  [ 'TCGAT', 'TCTTG' ],
  [ 'TCTTG', 'AGATC' ],
  [ 'AGATC', 'GTTCT' ],
  [ 'GTTCT', 'TAAA' ],
  [ 'CGAAC', 'TTAAA' ],
  [ 'TTAAA', 'TCTGT' ],
  [ 'TCTGT', 'TGGCT' ],
  [ 'TGGCT', 'TCACT' ],
  [ 'TCACT', 'GGCTG' ],
  [ 'GGCTG', 'ATGCT' ],
  [ 'ATGCT', 'AGTGC' ],
  [ 'AGTGC', 'CTCAC' ],
  [ 'CTCAC', 'CAGTA' ],
  [ 'CAGTA', 'AATTA' ],
  [ 'AATTA', 'TAAC' ],
  [ 'TAATT', 'CTGTC' ],
  [ 'CTGTC', 'TTGAC' ],
  [ 'TTGAC', 'GGACA' ],
  [ 'GGACA', 'GAGTA' ],
  [ 'GAGTA', 'CTCGT' ],
  [ 'CTCGT', 'TATCT' ],
  [ 'TATCT', 'CTGCA' ],
  [ 'CTGCA', 'GCTGC' ] ]


In [10]:
//Trigrams of the sequence
trigrams = n_gram_generator(mers_tokenized,3)
new_trigrams = ngram_remover(trigrams,3)
console.log(new_trigrams.slice(0,30))

[ [ 'ATTAA', 'GGTTT', 'TACCT' ],
  [ 'GGTTT', 'TACCT', 'CCCAG' ],
  [ 'TACCT', 'CCCAG', 'TAACA' ],
  [ 'CCCAG', 'TAACA', 'ACCAA' ],
  [ 'TAACA', 'ACCAA', 'CAACT' ],
  [ 'ACCAA', 'CAACT', 'TCGAT' ],
  [ 'CAACT', 'TCGAT', 'TCTTG' ],
  [ 'TCGAT', 'TCTTG', 'AGATC' ],
  [ 'TCTTG', 'AGATC', 'GTTCT' ],
  [ 'AGATC', 'GTTCT', 'TAAA' ],
  [ 'CGAAC', 'TTAAA', 'TCTGT' ],
  [ 'TTAAA', 'TCTGT', 'TGGCT' ],
  [ 'TCTGT', 'TGGCT', 'TCACT' ],
  [ 'TGGCT', 'TCACT', 'GGCTG' ],
  [ 'TCACT', 'GGCTG', 'ATGCT' ],
  [ 'GGCTG', 'ATGCT', 'AGTGC' ],
  [ 'ATGCT', 'AGTGC', 'CTCAC' ],
  [ 'AGTGC', 'CTCAC', 'CAGTA' ],
  [ 'CTCAC', 'CAGTA', 'AATTA' ],
  [ 'CAGTA', 'AATTA', 'TAAC' ],
  [ 'TAATT', 'CTGTC', 'TTGAC' ],
  [ 'CTGTC', 'TTGAC', 'GGACA' ],
  [ 'TTGAC', 'GGACA', 'GAGTA' ],
  [ 'GGACA', 'GAGTA', 'CTCGT' ],
  [ 'GAGTA', 'CTCGT', 'TATCT' ],
  [ 'CTCGT', 'TATCT', 'CTGCA' ],
  [ 'TATCT', 'CTGCA', 'GCTGC' ],
  [ 'CTGCA', 'GCTGC', 'TACGG' ],
  [ 'GCTGC', 'TACGG', 'TTCGT' ],
  [ 'TACGG', 'TTCGT', 'CGTG' ] ]


## Probabilities

In [11]:
//Function to extract all the unique elements from the given input list
//Removes duplicate elements
function extract_unique(ngrams_list){
    ngrams_list.sort()
    unique = new Array()
    for(i=0; i<(ngrams_list.length-1); i++){
        if(ngrams_list[i].toString() != ngrams_list[i+1].toString()){   
            unique.push(ngrams_list[i])
        }
    }
    return unique
}

In [12]:
//Function to extract the count of the unique elements from the given input list
//Count the no. of occurance of the elements in the list
function extract_count(ngrams_list){
    ngrams_list.sort()
    unique = extract_unique(ngrams_list)
    count_all = new Array()
    for(i=0; i<unique.length; i++){
        variable = unique[i]
        count = 0
        for(j=0; j<ngrams_list.length-1; j++){
            if(variable.toString() == ngrams_list[j].toString()){
                count+=1
            }
        }
        count_all.push(count)
    }
    return count_all
}

In [13]:
//Function to combine the unique elements and their count into a dictionary
function extract_unique_with_count(ngrams_list){
    unique = extract_unique(ngrams_list)
    count_all = extract_count(ngrams_list)
    n_count =  count_all.reduce(function(result, field, index) {
      result[unique[index]] = field;
      return result;
    }, {})
    
    return n_count
}

### Unigram Probability

In [14]:
uni_count_all = extract_count(unigrams)
uni_count = extract_unique_with_count(unigrams)
console.log(uni_count)

{ A: 1,
  AAAA: 6,
  AAAAA: 16,
  AAAAC: 13,
  AAAAG: 8,
  AAAAT: 11,
  AAAC: 2,
  AAACA: 14,
  AAACC: 1,
  AAACG: 2,
  AAACT: 6,
  AAAG: 4,
  AAAGA: 12,
  AAAGC: 5,
  AAAGG: 2,
  AAAGT: 13,
  AAAT: 3,
  AAATA: 6,
  AAATC: 8,
  AAATG: 12,
  AAATT: 18,
  AACA: 4,
  AACAA: 15,
  AACAC: 11,
  AACAG: 14,
  AACAT: 11,
  AACC: 2,
  AACCA: 11,
  AACCC: 4,
  AACCG: 1,
  AACCT: 8,
  AACG: 1,
  AACGA: 1,
  AACGC: 1,
  AACGG: 1,
  AACGT: 3,
  AACT: 2,
  AACTA: 5,
  AACTC: 6,
  AACTG: 11,
  AACTT: 10,
  AAGA: 2,
  AAGAA: 12,
  AAGAC: 4,
  AAGAG: 10,
  AAGAT: 9,
  AAGCA: 1,
  AAGCC: 4,
  AAGCG: 1,
  AAGCT: 7,
  AAGGA: 9,
  AAGGC: 4,
  AAGGG: 5,
  AAGGT: 3,
  AAGT: 1,
  AAGTA: 2,
  AAGTC: 3,
  AAGTG: 8,
  AAGTT: 5,
  AATA: 1,
  AATAA: 9,
  AATAC: 5,
  AATAG: 6,
  AATAT: 2,
  AATCA: 8,
  AATCC: 5,
  AATCT: 5,
  AATG: 1,
  AATGA: 10,
  AATGC: 7,
  AATGG: 12,
  AATGT: 15,
  AATT: 2,
  AATTA: 17,
  AATTC: 10,
  AATTG: 6,
  AATTT: 10,
  ACAA: 1,
  ACAAA: 13,
  ACAAC: 14,
  ACAAG: 3,
  ACAAT: 12,
  ACAC: 

In [15]:
//No. of times element 'TGATG' is present in unigrams list
uni_count['TGATG']

21

In [16]:
//Total no. of all the unigrams
uni_count_all.reduce((a, b) => a + b, 0)

5116

In [17]:
//Probability of choosing unigram 'TGATG' from the all the unigrams
uni_count['TGATG']/uni_count_all.reduce((a, b) => a + b, 0)

0.004104769351055512

### Bigram Probability

In [18]:
bi_count_all = extract_count(new_bigrams)
bi_count = extract_unique_with_count(new_bigrams)
console.log(bi_count)

{ 'AAAAA,A': 1,
  'AAAAA,AAAA': 1,
  'AAAAA,AAAAA': 2,
  'AAAAA,ACAG': 1,
  'AAAAA,AGAAA': 1,
  'AAAAA,AGTAA': 1,
  'AAAAA,AGTAC': 1,
  'AAAAA,ATTTG': 1,
  'AAAAA,CTTAG': 1,
  'AAAAA,GACTG': 1,
  'AAAAA,TAATT': 1,
  'AAAAA,TACAG': 1,
  'AAAAA,TGATT': 1,
  'AAAAA,TTATC': 1,
  'AAAAA,TTTC': 1,
  'AAAAC,AACA': 1,
  'AAAAC,AATAC': 1,
  'AAAAC,ACAGA': 1,
  'AAAAC,AGTTG': 1,
  'AAAAC,ATTTC': 1,
  'AAAAC,CAAAC': 1,
  'AAAAC,CCCAA': 1,
  'AAAAC,CTAGT': 1,
  'AAAAC,CTTGC': 1,
  'AAAAC,GATGG': 1,
  'AAAAC,GGGTA': 1,
  'AAAAC,TACTG': 1,
  'AAAAC,TTTTA': 1,
  'AAAAG,AATCT': 1,
  'AAAAG,ACAAA': 1,
  'AAAAG,ACACT': 1,
  'AAAAG,CTGTG': 1,
  'AAAAG,CTTCT': 1,
  'AAAAG,GCCAA': 1,
  'AAAAG,GCCTT': 1,
  'AAAAG,TGATG': 1,
  'AAAAT,ACTCT': 1,
  'AAAAT,AGCAC': 1,
  'AAAAT,AGCGA': 1,
  'AAAAT,CAAGA': 1,
  'AAAAT,CAGTT': 1,
  'AAAAT,GAAGA': 1,
  'AAAAT,GTTGA': 1,
  'AAAAT,TCTCT': 1,
  'AAAAT,TCTG': 1,
  'AAAAT,TGACT': 1,
  'AAAAT,TTTTA': 1,
  'AAACA,ACAAC': 1,
  'AAACA,AGCAG': 1,
  'AAACA,AGTGT': 1,
  'AAACA,

In [19]:
//No. of times element 'AATCA,GACTA' is present in unigrams list
bi_count['AATCA,GACTA']

2

In [20]:
//Total no. of all the unigrams
bi_count_all.reduce((a, b) => a + b, 0)

4698

In [21]:
//Probability of choosing unigram 'AATCA,GACTA' from the all the unigrams
bi_count['AATCA,GACTA']/bi_count_all.reduce((a, b) => a + b, 0)

0.0004257130693912303

### Trigram Probability

In [22]:
tri_count_all = extract_count(new_trigrams)
tri_count = extract_unique_with_count(new_trigrams)
console.log(tri_count)

{ 'AAAAA,AAAAA,A': 1,
  'AAAAA,AAAAA,AAAA': 1,
  'AAAAA,AGAAA,TCAAC': 1,
  'AAAAA,AGTAA,GTACA': 1,
  'AAAAA,AGTAC,ATTCT': 1,
  'AAAAA,ATTTG,TGATG': 1,
  'AAAAA,CTTAG,GAATT': 1,
  'AAAAA,GACTG,TATG': 1,
  'AAAAA,TAATT,TTGTC': 1,
  'AAAAA,TACAG,AGAGG': 1,
  'AAAAA,TGATT,CCAAC': 1,
  'AAAAA,TTATC,AAGTC': 1,
  'AAAAC,AATAC,TAGTG': 1,
  'AAAAC,ACAGA,TTGTT': 1,
  'AAAAC,AGTTG,AACAT': 1,
  'AAAAC,ATTTC,TAACA': 1,
  'AAAAC,CAAAC,CGTCC': 1,
  'AAAAC,CCCAA,AAGTT': 1,
  'AAAAC,CTAGT,GCAAC': 1,
  'AAAAC,CTTGC,ACTCA': 1,
  'AAAAC,GATGG,ACACT': 1,
  'AAAAC,GGGTA,TTTC': 1,
  'AAAAC,TACTG,CAATG': 1,
  'AAAAC,TTTTA,GTTTT': 1,
  'AAAAG,AATCT,TCAAA': 1,
  'AAAAG,ACAAA,AGAAG': 1,
  'AAAAG,ACACT,TCCTC': 1,
  'AAAAG,CTGTG,TGTAT': 1,
  'AAAAG,CTTCT,CGCAG': 1,
  'AAAAG,GCCAA,AGACA': 1,
  'AAAAG,GCCTT,TACAT': 1,
  'AAAAG,TGATG,TGTTG': 1,
  'AAAAT,ACTCT,AAGAG': 1,
  'AAAAT,AGCAC,TTTAA': 1,
  'AAAAT,AGCGA,ATGCA': 1,
  'AAAAT,CAAGA,GGTGT': 1,
  'AAAAT,CAGTT,CTTAC': 1,
  'AAAAT,GAAGA,TTATT': 1,
  'AAAAT,GTTGA,GAGC

In [23]:
//No. of times element 'TTGCT,CAATT,TGTAC' is present in unigrams list
tri_count['TTGCT,CAATT,TGTAC']

1

In [24]:
//Total no. of all the unigrams
tri_count_all.reduce((a, b) => a + b, 0)

4270

In [25]:
//Probability of choosing unigram 'TTGCT,CAATT,TGTAC' from the all the unigrams
tri_count['TTGCT,CAATT,TGTAC']/tri_count_all.reduce((a, b) => a + b, 0)

0.000234192037470726