---
description: We define all the useful functions to predict the score from a given
  sequence
output-file: predictions.html
title: predictions_bis

---

In [6]:
#| default_exp predictions_bis

In [2]:
#| export
import pickle
import sklearn
import dgrec
import numpy as np
import pandas as pd
from dgrec.example_data import get_example_data_dir
import os

In [3]:
# Locate the pickled classifier inside the dgrec example-data directory.
data_path=get_example_data_dir()
model_name='e_TRSp_classifier.pkl'
model_path=os.path.join(data_path,model_name)
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
# The context manager guarantees the file handle is closed once the model is loaded.
with open(model_path,"rb") as model_file:
    model=pickle.load(model_file)

In [4]:
# | export

def score(TR_seq:str, # A string of the TR DNA sequence
          model # Classifier exposing `predict_proba`; column 1 is used as the score
          ):
    """Return the predicted score of a single TR sequence, rounded to 2 decimals (1 = perfect TR, 0 = poor TR)"""
    # encode_tr_list works on a batch, so wrap the sequence and keep its only entry
    features=dgrec.encoding_bis.encode_tr_list([TR_seq])[0]
    # Second column of predict_proba is taken as the score
    positive_proba=model.predict_proba([features])[:,1]
    return np.round(positive_proba,decimals=2).item()

In [5]:
# Two example TR sequences: one labelled "bad" and one labelled "good".
TR_bad='TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG'
TR_good='AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT'

# Score each sequence with the loaded model and print the results.
print('TR bad score =',score(TR_bad,model))
print('TR good score =',score(TR_good,model))

AttributeError: module 'dgrec' has no attribute 'encoding'

In [None]:
# | export
def score_list(TR_seq_list:list, # A list of strings of TR DNA sequences
               TR_name_list:list, # A list of strings of TR names, one per sequence
               model # Classifier exposing `predict_proba`; column 1 is used as the score
               ):
    """Calculates the score for every TR in the list and returns them in a dataframe format

    The returned DataFrame has one row per TR and the columns `TR_Name`, `TR_Seq`
    and `TR_Score` (column 1 of `model.predict_proba`, rounded to 2 decimals).
    Raises `ValueError` when the two input lists differ in length.
    """
    # Fail early with a clear message instead of after encoding and prediction.
    if len(TR_seq_list)!=len(TR_name_list):
        raise ValueError(
            'TR_seq_list and TR_name_list must have the same length '
            f'(got {len(TR_seq_list)} and {len(TR_name_list)})'
        )

    # Nothing to score: return an empty frame with the expected columns
    # rather than handing an empty batch to the encoder and the model.
    if len(TR_seq_list)==0:
        return pd.DataFrame({'TR_Name':[],'TR_Seq':[],'TR_Score':[]})

    # Use the same encoder module as `score` (`dgrec.encoding_bis`).
    encoded_TR=dgrec.encoding_bis.encode_tr_list(TR_seq_list)
    # Named `scores` so the local does not shadow the `score` function.
    scores=np.round(model.predict_proba(encoded_TR)[:,1],decimals=2)
    return pd.DataFrame({
        'TR_Name':TR_name_list,
        'TR_Seq':TR_seq_list,
        'TR_Score':scores
    })

In [None]:
# Batch of TR sequences labelled "bad" (presumably expected to score low).
TR_bad=[
    'TTAGCGAATGGCGAAATTCGTAAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGG',
    'AAACGCCCTCTGATCGAAACCAACGGCGAAACGGGTGAGATCGTGTGGGACAAAGGTCGTGATTTCGCTA',
    'GGTTTCTCTAAGGAGTCCATTCTGCCGAAGCGCAACTCCGACAAGCTGATCGCGCGTAAGAAGGACTGGG',
    'CAAGCTGATCGCGCGTAAGAAGGACTGGGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCG',
    'ACCCGATTGACTTCCTCGAGGCGAAGGGGTACAAGGAGGTGAAGAAGGATCTGATTATCAAGCTGCCGAA',
    'AGTACTCCCTGTTCGAGCTGGAGAATGGTCGTAAGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGG',
    'CAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTTCTAAGCGCGTGATTCTGGCGG',
    'ACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAAGCCGATCCGTGAGCAGGCGGA',
]

# Number the names from the list itself (TR_bad_1 ... TR_bad_N) so the name
# count always matches the sequence count, even if the list above is edited.
score_list(TR_bad,['TR_bad_'+str(k) for k,_ in enumerate(TR_bad,start=1)],model)

In [None]:
# Batch of TR sequences labelled "good" (presumably expected to score high).
TR_good=[
    'AAATGATCGCCAAATCTGAACAGGAAATTGGCAAAGCAACCGCTAAATACTTTTTCTACTCAAACATTAT',
    'TCAAACATTATGAATTTCTTCAAAACCGAAATCACCTTAGCGAATGGCGAAATTCGTAAACGCCCTCTGA',
    'ATGCCTCAAGTAAACATCGTTAAAAAGACTGAGGTGCAGACTGGCGGTTTCTCTAAGGAGTCCATTCTGC',
    'GGATCCGAAGAAGTACGGTGGCTTCGATTCTCCGACCGTGGCGTACTCTGTTCTGGTGGTCGCCAAGGTC',
    'AGCGTATGCTGGCGTCTGCGGGTGAGCTGCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAA',
    'GCAGAAGGGGAACGAGTTGGCCCTTCCGTCCAAGTACGTGAACTTCCTGTACCTGGCCTCGCACTACGAG',
    'CAGAAGCAGCTGTTCGTGGAGCAGCACAAGCACTACCTGGACGAGATTATTGAGCAGATTTCTGAGTTTT',
    'CTAAGCGCGTGATTCTGGCGGACGCGAATCTGGATAAGGTCCTGTCTGCCTACAATAAGCACCGTGATAA',
]

# Number the names from the list itself (TR_good_1 ... TR_good_N) so the name
# count always matches the sequence count, even if the list above is edited.
score_list(TR_good,['TR_good_'+str(k) for k,_ in enumerate(TR_good,start=1)],model)