---
description: We define all the useful functions to predict the score from a given
  sequence
output-file: predictions.html
title: predictions

---

In [1]:
#| hide
%load_ext autoreload
%autoreload 2

In [2]:
#| default_exp predictions

In [3]:
#| export
import pickle
import sklearn
import numpy as np
import pandas as pd
from dgrec.example_data import get_example_data_dir
import os
from dgrec import encoding

In [4]:
# | export

def _load_pickled_model(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only trusted files
    # (here: the ones found in the example data directory) should be loaded.
    with open(path,"rb") as model_file:
        return pickle.load(model_file)

# Directory holding the pickled models loaded below.
data_path=get_example_data_dir()

# Model used by `score` / `score_list` through `predict_proba`.
model_name='e_TRSp_classifier.pkl'
model_path=os.path.join(data_path,model_name)
model_Sp=_load_pickled_model(model_path)

# Second model, used through `predict_proba` when two features are requested.
model_name2='e_TRSpAvd_classifier.pkl'
model_path2=os.path.join(data_path,model_name2)
model_Avd_Sp=_load_pickled_model(model_path2)

# Model used by `DGR_percentage` / `DGR_percentage_list` through `predict`.
model_name_whole='whole_model.pkl'
model_path_whole=os.path.join(data_path,model_name_whole)
model_whole=_load_pickled_model(model_path_whole)

In [5]:
# | export

def score(TR_seq:str #A string of the TR DNA sequence
,features=1 #Number of features to score with: 1 (default) uses `model_Sp` only, 2 uses `model_Sp` and `model_Avd_Sp`
         ):

    """Calculates the predicted score of a given TR sequence (1 = perfect TR and 0 = crappy TR).

    With `features=1` a single float is returned. With `features=2` a list of two
    floats is returned: the `model_Sp` score followed by the `model_Avd_Sp` score.
    Scores are rounded to two decimals.

    Raises `ValueError` when `features` is neither 1 nor 2.
    """
    # Reject unsupported values up front; previously they fell through both
    # branches and failed on an unbound local at the return statement.
    if features not in (1,2):
        raise ValueError("features must be 1 or 2, got "+repr(features))

    encoded_TR=encoding.encode_tr_list([TR_seq],features)
    first_row=encoded_TR[0]  # only one sequence was encoded
    if features == 1:
        return np.round(model_Sp.predict_proba([first_row])[:,1],decimals=2).item()

    # Two features: the first encoded value goes to model_Sp, the second to model_Avd_Sp.
    sp_score=np.round(model_Sp.predict_proba([first_row[:1]])[:,1],decimals=2).item()
    avd_score=np.round(model_Avd_Sp.predict_proba([first_row[1:2]])[:,1],decimals=2).item()
    return [sp_score,avd_score]

In [6]:
# Two example TR sequences, named "bad" and "good" in this notebook.
TR_bad='TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG'
TR_good='AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT'

# Score each one with the default single-feature model.
print('TR bad score =', score(TR_bad))
print('TR good score =', score(TR_good))

TR bad score = 0.23
TR good score = 0.84


In [7]:
# | export
def score_list(TR_seq_list:list, #A list of strings of TRs DNA sequences
TR_name_list:list, #A list of strings of TRs names
features=1 #The number of features to use
              ):
    """Scores every TR of the list and returns the results as a dataframe.

    The dataframe always holds `TR_Name` and `TR_Seq`. With `features==1` it adds a
    `TR_Score` column; for any other value it adds `TR_Score_Sp` and `TR_Score_Avd`.
    Scores are rounded to two decimals.
    """
    encoded=encoding.encode_tr_list(TR_seq_list,features)

    # Columns shared by both output layouts; insertion order is the column order.
    columns={'TR_Name':TR_name_list,'TR_Seq':TR_seq_list}

    if features==1:
        columns['TR_Score']=np.round(model_Sp.predict_proba(encoded)[:,1],decimals=2)
    else:
        # First encoded column feeds model_Sp, second one feeds model_Avd_Sp.
        columns['TR_Score_Sp']=np.round(model_Sp.predict_proba(encoded[:,:1])[:,1],decimals=2)
        columns['TR_Score_Avd']=np.round(model_Avd_Sp.predict_proba(encoded[:,1:2])[:,1],decimals=2)

    return pd.DataFrame(columns)

In [8]:
# Eight example TR sequences, named "bad" in this notebook.
TR_bad=[
    'TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG',
    'AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGGACAAAGGTCGTGATTTCGCTA',
    'GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGCTGATCGCGCGTAAGAAGGACTGGG',
    'CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCG',
    'ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAAGGATCTGATTATCAAGCTGCCGAA',
    'AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGG',
    'CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTTCTAAGCGCGTGATTCTGGCGG',
    'ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAAGCCGATCCGTGAGCAGGCGGA',
]

# Names are TR_bad_1 ... TR_bad_8, one per sequence; single-feature scoring.
score_list(TR_bad, ['TR_bad_'+str(idx) for idx, _ in enumerate(TR_bad, start=1)])

Unnamed: 0,TR_Name,TR_Seq,TR_Score
0,TR_bad_1,TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACG...,0.23
1,TR_bad_2,AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGT...,0.05
2,TR_bad_3,GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGC...,0.0
3,TR_bad_4,CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGT...,0.0
4,TR_bad_5,ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAA...,0.01
5,TR_bad_6,AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGC...,0.08
6,TR_bad_7,CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGT...,0.06
7,TR_bad_8,ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGA...,0.12


In [9]:
# Eight example TR sequences, named "good" in this notebook.
TR_good=[
    'AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT',
    'TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATGGCGAAATTCGTAAACGCCCTCTGA',
    'ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCGGTTTCTCTAAGGAGTCCATTCTGC',
    'GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTACTCTGTTCTGGTGGTCGCCAAGGTC',
    'AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAA',
    'GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTCCTGTACCTGGCCTCGCACTACGAG',
    'CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTT',
    'CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAA',
]

# Names are TR_good_1 ... TR_good_8, one per sequence; single-feature scoring.
score_list(TR_good, ['TR_good_'+str(idx) for idx, _ in enumerate(TR_good, start=1)])

Unnamed: 0,TR_Name,TR_Seq,TR_Score
0,TR_good_1,AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAA...,0.84
1,TR_good_2,TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATG...,0.82
2,TR_good_3,ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCG...,0.76
3,TR_good_4,GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTAC...,0.74
4,TR_good_5,AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTT...,0.83
5,TR_good_6,GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTC...,0.55
6,TR_good_7,CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGA...,0.81
7,TR_good_8,CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTC...,0.81


In [10]:
# Eight example TR sequences, named "bad" in this notebook.
TR_bad=[
    'TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG',
    'AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGGACAAAGGTCGTGATTTCGCTA',
    'GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGCTGATCGCGCGTAAGAAGGACTGGG',
    'CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCG',
    'ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAAGGATCTGATTATCAAGCTGCCGAA',
    'AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGG',
    'CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTTCTAAGCGCGTGATTCTGGCGG',
    'ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAAGCCGATCCGTGAGCAGGCGGA',
]

# Same sequences and names as the single-feature example, scored with two features.
score_list(TR_bad, ['TR_bad_'+str(idx) for idx, _ in enumerate(TR_bad, start=1)], features=2)

Unnamed: 0,TR_Name,TR_Seq,TR_Score_Sp,TR_Score_Avd
0,TR_bad_1,TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACG...,0.23,0.63
1,TR_bad_2,AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGT...,0.05,0.42
2,TR_bad_3,GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGC...,0.0,0.3
3,TR_bad_4,CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGT...,0.0,0.51
4,TR_bad_5,ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAA...,0.01,0.59
5,TR_bad_6,AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGC...,0.08,0.54
6,TR_bad_7,CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGT...,0.06,0.29
7,TR_bad_8,ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGA...,0.12,0.04


In [11]:
# Eight example TR sequences, named "good" in this notebook.
TR_good=[
    'AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT',
    'TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATGGCGAAATTCGTAAACGCCCTCTGA',
    'ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCGGTTTCTCTAAGGAGTCCATTCTGC',
    'GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTACTCTGTTCTGGTGGTCGCCAAGGTC',
    'AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAA',
    'GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTCCTGTACCTGGCCTCGCACTACGAG',
    'CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTT',
    'CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAA',
]

# Same sequences and names as the single-feature example, scored with two features.
score_list(TR_good, ['TR_good_'+str(idx) for idx, _ in enumerate(TR_good, start=1)], features=2)

Unnamed: 0,TR_Name,TR_Seq,TR_Score_Sp,TR_Score_Avd
0,TR_good_1,AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAA...,0.84,0.8
1,TR_good_2,TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATG...,0.82,0.78
2,TR_good_3,ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCG...,0.76,0.84
3,TR_good_4,GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTAC...,0.74,0.75
4,TR_good_5,AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTT...,0.83,0.88
5,TR_good_6,GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTC...,0.55,0.58
6,TR_good_7,CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGA...,0.81,0.34
7,TR_good_8,CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTC...,0.81,0.82


In [None]:
# | export

def DGR_percentage(TR_seq:str #A string of the TR DNA sequence
         ):

    """Predicts the DGR mutagenesis percentage of one TR sequence (100 = perfect TR and 0 = crappy TR).

    The sequence is encoded with two features and passed to `model_whole`; the model
    output is used as a base-10 exponent, so the returned value is `10**prediction`.
    """
    two_feature_encoding=encoding.encode_tr_list([TR_seq],2)
    # Only one sequence was encoded, so keep the single prediction.
    log10_rate=model_whole.predict(two_feature_encoding)[0]
    return 10**log10_rate

In [None]:
# | export
def DGR_percentage_list(TR_seq_list:list, #A list of strings of TRs DNA sequences
TR_name_list:list, #A list of strings of TRs names
              ):
    """Calculates the predicted DGR mutagenesis percentage for every TR in the list and returns them in a dataframe format.

    The returned dataframe has one row per TR with the columns `TR_Name`, `TR_Seq`
    (the input sequence) and `TR_DGR_Percentage` (`10**prediction` of `model_whole`).
    """

    encoded_TR=encoding.encode_tr_list(TR_seq_list,2)
    # The model output is used as a base-10 exponent, as in `DGR_percentage`.
    rates=model_whole.predict(encoded_TR)
    score_df=pd.DataFrame({
        'TR_Name':TR_name_list,
        # Previously this column was filled with the predictions, so the
        # sequences were missing and the predictions were mislabelled.
        'TR_Seq':TR_seq_list,
        'TR_DGR_Percentage':10**rates
    })
    return score_df

In [None]:
# Eight example TR sequences, named "bad" in this notebook.
TR_bad=[
    'TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG',
    'AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGGACAAAGGTCGTGATTTCGCTA',
    'GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGCTGATCGCGCGTAAGAAGGACTGGG',
    'CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCG',
    'ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAAGGATCTGATTATCAAGCTGCCGAA',
    'AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGG',
    'CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTTCTAAGCGCGTGATTCTGGCGG',
    'ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAAGCCGATCCGTGAGCAGGCGGA',
]

# Names are TR_bad_1 ... TR_bad_8, one per sequence.
DGR_percentage_list(TR_bad, ['TR_bad_'+str(idx) for idx, _ in enumerate(TR_bad, start=1)])

In [None]:
# Eight example TR sequences, named "good" in this notebook.
TR_good=[
    'AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT',
    'TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATGGCGAAATTCGTAAACGCCCTCTGA',
    'ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCGGTTTCTCTAAGGAGTCCATTCTGC',
    'GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTACTCTGTTCTGGTGGTCGCCAAGGTC',
    'AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAA',
    'GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTCCTGTACCTGGCCTCGCACTACGAG',
    'CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTT',
    'CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAA',
]

# Names are TR_good_1 ... TR_good_8, one per sequence.
DGR_percentage_list(TR_good, ['TR_good_'+str(idx) for idx, _ in enumerate(TR_good, start=1)])

In [None]:
#| hide
# Run nbdev's export step for the notebooks of this project.
import nbdev

nbdev.nbdev_export()