<a href="https://colab.research.google.com/github/collvey/Biopython/blob/main/Testing_Devil625_Codon_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Functions

In [4]:
!git clone https://github.com/Devil625/Codon_Optimization.git 

fatal: destination path 'Codon_Optimization' already exists and is not an empty directory.


In [5]:
mv ./Codon_Optimization/* ./

mv: cannot stat './Codon_Optimization/*': No such file or directory


In [2]:
import sys

sys.path.insert(0,'/content/Codon_Optimization')

In [3]:
from __future__ import print_function
import time
import sys
import argparse
import random
import torch
import gc
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from utils.metric import get_ner_fmeasure
from model.seqmodel import SeqModel
from utils.data import Data

In [6]:
try:
    import cPickle as pickle
except ImportError:
    import pickle

os.environ["CUDA_VISIBLE_DEVICES"]="0"
seed_num = 42
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)




def data_initialization(data):
    data.initial_feature_alphabets()
    data.build_alphabet(data.train_dir)
    data.build_alphabet(data.dev_dir)
    data.build_alphabet(data.test_dir)
    data.fix_alphabet()


def predict_check(pred_variable, gold_variable, mask_variable):
    """
        input:
            pred_variable (batch_size, sent_len): pred tag result, in numpy format
            gold_variable (batch_size, sent_len): gold result variable
            mask_variable (batch_size, sent_len): mask variable
    """
    pred = pred_variable.cpu().data.numpy()
    gold = gold_variable.cpu().data.numpy()
    mask = mask_variable.cpu().data.numpy()
    overlaped = (pred == gold)
    right_token = np.sum(overlaped * mask)
    total_token = mask.sum()
    # print("right: %s, total: %s"%(right_token, total_token))
    return right_token, total_token


def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet, word_recover):
    """
        input:
            pred_variable (batch_size, sent_len): pred tag result
            gold_variable (batch_size, sent_len): gold result variable
            mask_variable (batch_size, sent_len): mask variable
    """

    pred_variable = pred_variable[word_recover]
    gold_variable = gold_variable[word_recover]
    mask_variable = mask_variable[word_recover]
    batch_size = gold_variable.size(0)
    seq_len = gold_variable.size(1)
    mask = mask_variable.cpu().data.numpy()
    pred_tag = pred_variable.cpu().data.numpy()
    gold_tag = gold_variable.cpu().data.numpy()
    batch_size = mask.shape[0]
    pred_label = []
    gold_label = []
    for idx in range(batch_size):
        pred = [label_alphabet.get_instance(pred_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0]
        gold = [label_alphabet.get_instance(gold_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0]
        assert(len(pred)==len(gold))
        pred_label.append(pred)
        gold_label.append(gold)
    return pred_label, gold_label


def recover_nbest_label(pred_variable, mask_variable, label_alphabet, word_recover):
    """
        input:
            pred_variable (batch_size, sent_len, nbest): pred tag result
            mask_variable (batch_size, sent_len): mask variable
            word_recover (batch_size)
        output:
            nbest_pred_label list: [batch_size, nbest, each_seq_len]
    """
    # exit(0)
    pred_variable = pred_variable[word_recover]
    mask_variable = mask_variable[word_recover]
    batch_size = pred_variable.size(0)
    seq_len = pred_variable.size(1)
    nbest = pred_variable.size(2)
    mask = mask_variable.cpu().data.numpy()
    pred_tag = pred_variable.cpu().data.numpy()
    batch_size = mask.shape[0]
    pred_label = []
    for idx in range(batch_size):
        pred = []
        for idz in range(nbest):
            each_pred = [label_alphabet.get_instance(pred_tag[idx][idy][idz]) for idy in range(seq_len) if mask[idx][idy] != 0]
            pred.append(each_pred)
        pred_label.append(pred)
    return pred_label



# def save_data_setting(data, save_file):
#     new_data = copy.deepcopy(data)
#     ## remove input instances
#     new_data.train_texts = []
#     new_data.dev_texts = []
#     new_data.test_texts = []
#     new_data.raw_texts = []

#     new_data.train_Ids = []
#     new_data.dev_Ids = []
#     new_data.test_Ids = []
#     new_data.raw_Ids = []
#     ## save data settings
#     with open(save_file, 'w') as fp:
#         pickle.dump(new_data, fp)
#     print("Data setting saved to file:",save_file)


# def load_data_setting(save_file):
#     with open(save_file, 'r') as fp:
#         data = pickle.load(fp)
#     print("Data setting loaded from file: ", save_file)
#     data.show_data_summary()
#     return data

def lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr/(1+decay_rate*epoch)
    print(" Learning rate is setted as:", lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer



def evaluate(data, model, name, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    ## set model in eval model
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num//batch_size+1
    for batch_id in range(total_batch):
        start = batch_id*batch_size
        end = (batch_id+1)*batch_size
        if end > train_num:
            end =  train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu, True)
        if nbest:
            scores, nbest_tag_seq = model.decode_nbest(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest)
            nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            ## select the best sequence to evalurate
            tag_seq = nbest_tag_seq[:,:,0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
        # print("tag:",tag_seq)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances)/decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    if nbest:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores


def batchify_with_label(input_batch_list, gpu, volatile_flag=False):
    """
        input: list of words, chars and labels, various length. [[words,chars, labels],[words,chars,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            chars: char ids for on sentences, various length. (batch_size, sent_len, each_word_length)
        output:
            zero padding for word and char, with their batch length
            word_seq_tensor: (batch_size, max_sent_len) Variable
            word_seq_lengths: (batch_size,1) Tensor
            char_seq_tensor: (batch_size*max_sent_len, max_word_len) Variable
            char_seq_lengths: (batch_size*max_sent_len,1) Tensor
            char_seq_recover: (batch_size*max_sent_len,1)  recover char sequence order
            label_seq_tensor: (batch_size, max_sent_len)
            mask: (batch_size, max_sent_len)
    """
    batch_size = len(input_batch_list)
    words = [sent[0] for sent in input_batch_list]
    features = [np.asarray(sent[1]) for sent in input_batch_list]
    feature_num = len(features[0][0])
    chars = [sent[2] for sent in input_batch_list]
    labels = [sent[3] for sent in input_batch_list]
    word_seq_lengths = torch.LongTensor(list(map(len, words)))
    max_seq_len = word_seq_lengths.max()
    with torch.no_grad():
        word_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len))).long()
        label_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len))).long()
    feature_seq_tensors = []
    for idx in range(feature_num):
        feature_seq_tensors.append(autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile =  volatile_flag).long())
    with torch.no_grad():
        mask = autograd.Variable(torch.zeros((batch_size, max_seq_len))).byte()
    for idx, (seq, label, seqlen) in enumerate(zip(words, labels, word_seq_lengths)):
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx, :seqlen] = torch.LongTensor(label)
        mask[idx, :seqlen] = torch.Tensor([1]*seqlen)
        for idy in range(feature_num):
            feature_seq_tensors[idy][idx,:seqlen] = torch.LongTensor(features[idx][:,idy])
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]
    for idx in range(feature_num):
        feature_seq_tensors[idx] = feature_seq_tensors[idx][word_perm_idx]

    label_seq_tensor = label_seq_tensor[word_perm_idx]
    mask = mask[word_perm_idx]
    ### deal with char
    # pad_chars (batch_size, max_seq_len)
    pad_chars = [chars[idx] + [[0]] * (max_seq_len-len(chars[idx])) for idx in range(len(chars))]
    length_list = [list(map(len, pad_char)) for pad_char in pad_chars]
    max_word_len = max(map(max, length_list))
    with torch.no_grad():
        char_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len, max_word_len))).long()
    char_seq_lengths = torch.LongTensor(length_list)
    for idx, (seq, seqlen) in enumerate(zip(pad_chars, char_seq_lengths)):
        for idy, (word, wordlen) in enumerate(zip(seq, seqlen)):
            # print len(word), wordlen
            char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word)

    char_seq_tensor = char_seq_tensor[word_perm_idx].view(batch_size*max_seq_len,-1)
    char_seq_lengths = char_seq_lengths[word_perm_idx].view(batch_size*max_seq_len,)
    char_seq_lengths, char_perm_idx = char_seq_lengths.sort(0, descending=True)
    char_seq_tensor = char_seq_tensor[char_perm_idx]
    _, char_seq_recover = char_perm_idx.sort(0, descending=False)
    _, word_seq_recover = word_perm_idx.sort(0, descending=False)
    if gpu:
        word_seq_tensor = word_seq_tensor.cuda()
        for idx in range(feature_num):
            feature_seq_tensors[idx] = feature_seq_tensors[idx].cuda()
        word_seq_lengths = word_seq_lengths.cuda()
        word_seq_recover = word_seq_recover.cuda()
        label_seq_tensor = label_seq_tensor.cuda()
        char_seq_tensor = char_seq_tensor.cuda()
        char_seq_recover = char_seq_recover.cuda()
        mask = mask.cuda()
    return word_seq_tensor,feature_seq_tensors, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask


def train(data):
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir +".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s"%(data.optimizer))
        exit(1)
    best_dev = -10
    # data.HP_iteration = 1
    ## start training
    # for idx in range(data.HP_iteration):
    for idx in range(500):
        epoch_start = time.time()
        temp_start = epoch_start
        print(f"Epoch: {idx} / {500}")
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train model
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num//batch_size+1
        for batch_id in range(total_batch):
            print(f"Batch id: {batch_id} / {total_batch}")
            start = batch_id*batch_size
            end = (batch_id+1)*batch_size
            if end >train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            #sample_loss += loss.data[0]
            #total_loss += loss.data[0]
            sample_loss += loss.item()
            total_loss += loss.item()
            if end%500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))

        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        print("totalloss:", total_loss)
        with open('./sample_data/loss','a') as f:
              f.write(str(total_loss)+'\n')
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        # continue
        speed, acc, p, r, f, _,_ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))

        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir +'.'+ str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _,_ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect()


def load_model_decode(data, name):
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    ## load model need consider if the model trained in GPU and load in CPU, or vice versa
    # if not gpu:
    #     model.load_state_dict(torch.load(model_dir))
    #     # model.load_state_dict(torch.load(model_dir), map_location=lambda storage, loc: storage)
    #     # model = torch.load(model_dir, map_location=lambda storage, loc: storage)
    # else:
    #     model.load_state_dict(torch.load(model_dir))
    #     # model = torch.load(model_dir)
    # model.load_state_dict(torch.load(data.load_model_dir))
    # model = torch.load(data.load_model_dir, map_location='cpu')
    model.load_state_dict(torch.load(data.load_model_dir, map_location='cpu'))

    print("Decode %s data, nbest: %s ..."%(name, data.nbest))
    start_time = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, name, data.nbest)
    end_time = time.time()
    time_cost = end_time - start_time
    if data.seg:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc))
    return pred_results, pred_scores

## Main

In [21]:
# !python main.py --config demo.train.config

In [20]:
# parser = argparse.ArgumentParser(description='Tuning with NCRF++')
# # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
# parser.add_argument('--config',  help='Configuration File' )

# args = parser.parse_args()

data = Data()
data.HP_gpu = torch.cuda.is_available()
data.read_config('demo.train.config')
status = data.status.lower()
print("Seed num:",seed_num)

Seed num: 42


In [23]:
# Executed when status equals 'train'
start=time.time()
print("MODEL: train")
data_initialization(data)
data.generate_instance('train')
data.generate_instance('dev')
data.generate_instance('test')
data.build_pretrain_emb()

MODEL: train
Load pretrained word embedding, norm: False, dir: sample_data/20coilanino.vector
Embedding:
     pretrain word:21, prefect match:21, case_match:0, oov:1, oov%:0.043478260869565216


In [30]:
data.show_data_summary()

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DATA SUMMARY START:
 I/O:
     Tag          scheme: NoSeg
     MAX SENTENCE LENGTH: 500
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Word  alphabet size: 23
     Char  alphabet size: 23
     Label alphabet size: 28
     Word embedding  dir: sample_data/20coilanino.vector
     Char embedding  dir: None
     Word embedding size: 21
     Char embedding size: 30
     Norm   word     emb: False
     Norm   char     emb: False
     Train  file directory: sample_data/cboxtrain.txt
     Dev    file directory: sample_data/cboxdev.txt
     Test   file directory: sample_data/cboxtest.txt
     Raw    file directory: None
     Dset   file directory: None
     Model  file directory: sample_data/lstmcrf
     Loadmodel   directory: None
     Decode file directory: None
     Train instance number: 4133
     Dev   instance number: 352
     Test  instance number: 421
     Raw   instanc

In [None]:
# Model Training
train(data)
endt=time.time()
print(endt-start)

In [7]:
data = Data()
data.HP_gpu = torch.cuda.is_available()
data.read_config('demo.decode.config')
status = data.status.lower()
print("Seed num:",seed_num)

Seed num: 42


In [8]:
# Executed when status equals 'decode'
print("MODEL: decode")
data.load(data.dset_dir)
data.read_config('demo.decode.config')
print(data.raw_dir)
# exit(0)
data.show_data_summary()
data.generate_instance('raw')
print("nbest: %s"%(data.nbest))
decode_results, pred_scores = load_model_decode(data, 'raw')
if data.nbest:
    data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
else:
    data.write_decoded_results(decode_results, 'raw')

MODEL: decode
sample_data/CPA.txt
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DATA SUMMARY START:
 I/O:
     Tag          scheme: NoSeg
     MAX SENTENCE LENGTH: 500
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Word  alphabet size: 23
     Char  alphabet size: 23
     Label alphabet size: 26
     Word embedding  dir: sample_data/20coilanino.vector
     Char embedding  dir: None
     Word embedding size: 21
     Char embedding size: 30
     Norm   word     emb: False
     Norm   char     emb: False
     Train  file directory: sample_data/cboxtrain.txt
     Dev    file directory: sample_data/cboxdev.txt
     Test   file directory: sample_data/cboxtest.txt
     Raw    file directory: sample_data/CPA.txt
     Dset   file directory: sample_data/lstmcrf.dset
     Model  file directory: sample_data/lstmcrf
     Loadmodel   directory: sample_data/lstmcrf.0.model
     Decode file directory: sample_data/CPA.out
     Tra

RuntimeError: ignored

In [9]:
protein = 'MAHHHHHHHHHHGALEVLFQGPGDPTVFHKRYLKKIRDLGEGHFGKVSLYCYDPTNDGTGEMVAVKALKADAGPQHRSGWKQEIDILRTLYHEHIIKYKGCCEDAGAASLQLVMEYVPLGSLRDYLPRHSIGLAQLLLFAQQICEGMAYLHSQHYIHRDLAARNVLLDNDRLVKIGDFGLAKAVPEGHEYYRVREDGDSPVFWYAPECLKEYKFYYASDVWSFGVTLYELLTHCDSSQSPPTKFLELIGIAQGQMTVLRLTELLERGERLPRPDKCPAEVYHLMKNCWETEASFRPTFENLIPILKTVHEKYQGQAPS'
gene = ''

In [51]:
train_data = readdata('./sample_data/cboxtrain.txt')
print(countgenes(train_data))

test_data = readdata('./sample_data/cboxtest.txt')
print(countgenes(test_data))

dev_data = readdata('./sample_data/cboxdev.txt')
print(countgenes(dev_data))

4705
496
404


In [53]:
def readdata(filepath):
  with open(filepath, 'r') as f:
    data = f.readlines()
  return data

def countgenes(data):
  n = 1
  for line in data:
    if line == '\n':
      n += 1
  return n

In [64]:
cpa = readdata('./sample_data/CPA.txt')
print(countgenes(cpa))

2


In [60]:
seq = []
seq_anno = []

for line in train_data:
  split_line = line.split()
  if len(split_line) > 1:
    seq.append(split_line[0])
    seq_anno.append(split_line[1])
  else:
    seq.append(' ')
    seq_anno.append(' ')

# print(''.join(seq))
# print(''.join(seq_anno))
joined_seq = ''.join(seq)
split_joined_seq = joined_seq.split()

In [61]:
len(split_joined_seq)

4705

In [62]:
split_joined_seq

['MKRISTTITTTITITTGNGAGX',
 'MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVTPNKKANTSSMDYYHLLRHAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGVX',
 'MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAP

In [67]:
seq_count = 0
max_len = 0
for i, seq in enumerate(split_joined_seq):
  # print(len(seq))
  seq_count += len(seq)
  max_len = max(len(seq), max_len)

seq_count / len(split_joined_seq)
print(max_len)

3531


In [68]:
temp_str = 'VTGAAAGDTVTVTLGGATYTATVQANLSWSIDVPAPALQALGNGELTISASVTNSVGNTGNGTREITIDANLPGLRIDTVAGDDVVNIIEHNQALVITGSSSDLAAGSNVTLTINGQTYVAAVLADGSWSVGVPAADVSAWPAGTVTIAASGNTTAGNPVSVTHPVTVDLTAVAVSINAITVDDVINAAEKGAALTLSGSTSGVEAGQTVTVTFGGKTYSATVAANGSWSTSVPAADMAALRDGDASAQASVSNVNGNSATTTHAYSVDASAPTVTINTIAGDDILNAAEAGAALTITGSSTAEAGQTVTVTLNGENYTGTVQTDGSWSVSVPQADVSALTASNYTVNAAVSDKAGNPASVNHNLTVDTSVPVVTINTVAGDDVINATEHAQAQIISGSATGVATGSTVTVTIGTNTFTTVLDASGNWSVGVPASVVSALANGTVTINASVTDAGGNSGSTTHQVTVNTGLPTITFNAISGDNVLNADEKGQPLTISGGSTGLATGAQVTVTLNGHNYSATTDASGNWTLAVPVSDLAALGQANYTVSASATSAAGNTASSQANLLVDSGLPDVTINTVAGDDIINAAEAGADQTISGVVTRAAAGDTVTVTLGGNTYTAQVQPDLSWSVTVPAADLQALGNGDLTITASVTNANGNTGSGTRDITIDANLPGLRVDTVAGDDIVNSIEHGQALVITGGSSGLNAGVPLTITINGTPYSATVQADGSWSVGIPAANVSAWPAGALTVDVAGQSSAGNPVSVSHPFTVDLTAVAISINTVASDDVINAAEKGTDLTLSGSTSGIESGQTVTVTFGGKTYTASVAANGSWSVNVPAADLASLPDGAANVQASVSSASGNSASATHAYSVDASAPTLTINTIASDDILNAAEAGSPLTISGTSTAETGQTVTVTLNGATYTGNVQAGGSWSVSVPSSALGALSASNYTVSATVNDKAGNPGSASHNLAVDTTAPVLTINTVAGDDIINDAEHAQALVISGTSTGGEAGDVVSVVLNGKTYTTTLDASGNWSVGVPAADVAALGSGAQAITASVSDRAGNSDDASRTVTVSLSAPVISINTIAGDDVINATEKGSDLALSGTSDQPAGTTITVTLNGQNYSATTDASGNWSVTVPASAVSALGEATYSVTASVTNAQGNSSTASHNVQVNTALPGVTLNPVATDDIINAAEAGSAQTISGQVTGAAAGSTVTVELGGKTYTAIVQVDLSWNVSVPAADWQALGNGELTVNASVTNAVGNTGSGTRDITIDASLPGLRVDTVAGDDVVNIIEHAQAQVITGSSSGFAAGTALTVVINNQTYAATVLANGSWSVGVPAADVSNWPAGTLNITVSGANSAGTQTSITHPLTVDLTTVAISMNSITSDDVINAAEKGAALTLSGSTSGVEAGQAVTVTFGGKTYTTTVAANGSWSTTVPAADLAALRDGDASAQVRVTNVNGNSATATHEYSVDSAAPTVTINTIASDNIINASEAAAGVTVSGTSTAQTGQMLTVTLNGTNYQTTVQADGSWSLTLPATDLTALANNGYTLTATVSDLAGNPGSASKGVTVDTTAPVISFNTVAGDDVINNVEHTQAQIISGTTTGAVAGDRLVVTIAGQQYVTSTDASGNWSVGVPASVISGLADGTVTISATITDSAGNSSTQTHNVQVNTAAVSLSVSTISGDNIINAAEAGSALTLSGTGTNFATGTVVTVLLNGKGYSATIQSNGSWSVNVPAADVAALSDGTSYTVSASAQDSAGNSATASRSVAVDLTAPVISINTVSTDDRLSAAEQQQPLTLNGSTSAEVGQTVTVTFGGKTYTATVAANGTWALNVPAADLATLGQGAQTITASVNDRAGNPGQATHALTVDTVAPTVTIATVAGDDIINNAEQLAGQTISGTTTAEVGQTVTVTFNGQTWSATVGSGGSWSVFIPAQQFAGLSDGSYTISATVSDQAGNPGSASRGVTLNGDVPTVTINTFAGDDVVNAAEHGSSLVISGTTTAPVGQTLTLTLNGKTYTTTVQTGGSWSYTLGSADVTALADGNAYVINASVSNAIGNTGSSNHTITVDLSAPAMGINIDSLQADTGLSASDFITSVSPVVVNGSLTATLASNETAQISIDGGVTWTTLTVTGTTWRYNDSRTLTDGNYLYQVRVIDVAGNVGATDSQNVVIDTTAPDPAVKTIAISAITTDTGLITNDFVTSDTTLAVSGTLGATLSAGEFAQISIDGGTTWQNLSVSGLTWSYLDGRTLTDGNYNYQVRVIDTAGNIGATASQIVTVDTTAPLASKTIAIASISDDTGLSSSDFVTRDTTLTVRGTLGAALAADERAQISLDGGVTWTTLTVIGTSWSYADGRTLTDGTWNYTVRVVDLAGNVGQTATQNVVVDTTSPEAAKSITITGISDDTGASSSDFITSDTTLTVRGVLGAALGANEFAQISTDNGATWVNVTLAEDGLNWSYVDGRTLTNGTTTWQVRVVDLAGNVGATGSQSAQIDTVNPAQVLTIASISTDTGSSATDFITSDTTLTLTGSLGAGLASGEVAQISLDSGATWITLTTNGTQWTYTDSRTLTDGSYVYQVRVLDLAGNTGPVVSKTVVVDTINPTATPGIVSYTDDVGQRQGTLSSSQATDDTTPLLNGVLSAPLASGEVVYLYRNGLLLGAVTMVGALNWTYSDSGLVSGAYTYSARVVDLAGNITSSSDFVLTVDTSIPTTLAQITSQTTRDTTPIISGVITAALASGQYVEVVINGKTYTSEPGGAVVVDPAHNTWYVQLPDTDALTVSATAYTVTAQVKSSAGNGNNANISNGTVTVNAAIDYTPTWTTASKTTAWGLTYGLDSHGMWTVLANQQVMQSTDPLTWSKTALTLYQSGNNYATSSIADYDRNGTGDLFITRDDYGTGYINGFTNNGDGTFSSAIQVTVGTLTWYGSIVAFDKEGDGYLDFWIGDAGGPDSNTFLWNNAGTLVGNSTTSNSGGSATVGGAVTGYLSLNEGSGVDLNNDGRIDLVQHTYNLNNYYTLSSLINQGNGTFVWGQNTTNTFLSGTGSGAMSSSVSMTWADFDGDGDMDLFLPASQGRANYGSLLFNTNGVLGSPVAVGVTATTYASQFSLAVDWNHDGLMDIARIAQTGQSYLYTNVGGASNWTQSALGGSQSGTTSGVAAMDYDWDGAVDVLVTKQSGSVFLIRNTNTVSYGTSLHLRITDPNGINVYYGNTVKLYNSAGVLVATQIINPQSGMGVNDTSALVNFYGLNAGETYNAVLIKSTGTNASNIDQTVNTSWGGLQATDATHAYDLSAEAGTASNNGKFVGTGYNDTFFATAGTDTYDGSGGWVYSSGTGTWLANGGMDVVDFRLSTVGVTANLSSTAAQATGFNTSTFTNIEGISGSNFNDILTGSSGDNQLEGRGGNDTLNIGNGGHDTLLYKLLNASDATGGNGSDVVNGFTVGTWEGTADTDRIDIRELLQGSGYTGNGKASYVNGVATLDAQAGNIGDFVKVTQSGSDTIVQIDRDGTGGTFATTNVVTLTGVHTDLATLLANHQLMVVX'
len(temp_str)

3531

In [26]:
tyk2_aa = readdata('./sample_data/tyk2_aa.txt')
print(''.join(tyk2_aa))
tyk2_aa_seq = ''.join(''.join(tyk2_aa).split())

MPLRHWGMARGSKPVGDGAQPMAAMGGLKVLLHWAGPGGGEPWVTFSESSLTAEEVCIH
IAHKVGITPPCFNLFALFDAQAQVWLPPNHILEIPRDASLMLYFR
IRFYFRNWHGMNPREPAVYRCGPPGTEASSDQTAQGMQLLDPASFEYLFEQGKHEFVND
VASLWELSTEEEIHHFKNESLGMAFLHLCHLALRHGIPLEEVAKKTSFKDCIPRSFRRH
IRQHSALTRLRLRNVFRRFLRDFQPGRLSQQMVMVKYLATLERLAPRFGTERVPVCHLR
LLAQAEGEPCYIRDSGVAPTDPGPESAAGPPTHEVLVTGTGGIQWWPVEEEVNKEEGSS
GSSGRNPQASLFGKKAKAHKAFGQPADRPREPLWAYFCDFRDITHVVLKEHCVSIHRQD
NKCLELSLPSRAAALSFVSLVDGYFRLTADSSHYLCHEVAPPRLVMSIRDGIHGPLLEP
FVQAKLRPEDGLYLIHWSTSHPYRLILTVAQRSQAPDGMQSLRLRKFPIEQQDGAFVLE
GWGRSFPSVRELGAALQGCLLRAGDDCFSLRRCCLPQPGETSNLIIMRGARASPRTLNL
SQLSFHRVDQKEITQLSHLGQGTRTNVYEGRLRVEGSGDPEEGKMDDEDPLVPGRDRGQ
ELRVVLKVLDPSHHDIALAFYETASLMSQVSHTHLAFVHGVCVRGPENSMVTEYVEHGP
LDVWLRRERGHVPMAWKMVVAQQLASALSYLENKNLVHGNVCGRNILLARLGLAEGTSP
FIKLSDPGVGLGALSREERVERIPWLAPECLPGGANSLSTAMDKWGFGATLLEICFDGE
APLQSRSPSEKEHFYQRQHRLPEPSCPQLATLTSQCLTYEPTQRPSFRTILRDLTRVQP
HNLADVLTVNRDSPAVGPTTFHKRYLKKIRDLGEGHFGKVSLYCYDPTNDGTGEMVAVK
ALKADCGPQHRSGWKQEIDILRTLYHEHIIKYKGCCEDQGEKSLQLVMEYVPLG

In [41]:
tyk2_dna = readdata('./sample_data/tyk2_dna.txt')
print(''.join(tyk2_dna))
tyk2_dna_seq = ''.join(''.join(tyk2_dna).split())[307:3868]

GACGCGGGCGCGGAAGGAGCGCGGCCGGAGGTCCTCAGGAAGAAGCCGCGGGGACTGGCT
GCGCTTGACAGGCTGCACTTGGATGGGAGCACCTGGTGCCTCGGGACTGCTCCGATGCCC
GGGTCTGTGCTGAATGTGTAATATGCGGAACTATATTGAAACATTACAACCATCTTTTGA
TGGCAACACCCTGAGGACCTCCCTTTTCCAGATGGGGAAACTGAGGCCCAGAATTGCTAA
GTGGCTTGCTTGAGTTGACACAGGGAGCTCCAGGACTCACCCTCAGCTGAGCCACCTGCC
GGGAGCATGCCTCTGCGCCACTGGGGGATGGCCAGGGGCAGTAAGCCCGTTGGGGATGGA
GCCCAGCCCATGGCTGCCATGGGAGGCCTGAAGGTGCTTCTGCACTGGGCTGGTCCAGGC
GGCGGGGAGCCCTGGGTCACTTTCAGTGAGTCATCGCTGACAGCTGAGGAAGTCTGCATC
CACATTGCACATAAAGTTGGTATCACTCCTCCTTGCTTCAATCTCTTTGCCCTCTTCGAT
GCTCAGGCCCAAGTCTGGTTGCCCCCAAACCACATCCTAGAGATCCCCAGAGATGCAAGC
CTGATGCTATATTTCCGCATAAGGTTTTATTTCCGGAACTGGCATGGCATGAATCCTCGG
GAACCGGCTGTGTACCGTTGTGGGCCCCCAGGAACCGAGGCATCCTCAGATCAGACAGCA
CAGGGGATGCAACTCCTGGACCCAGCCTCATTTGAGTACCTCTTTGAGCAGGGCAAGCAT
GAGTTTGTGAATGACGTGGCATCACTGTGGGAGCTGTCGACCGAGGAGGAGATCCACCAC
TTTAAGAATGAGAGCCTGGGCATGGCCTTTCTGCACCTCTGTCACCTCGCTCTCCGCCAT
GGCATCCCCCTGGAGGAGGTGGCCAAGAAGACCAGCTTCAAGGACTGCATCCCGCGCTCC
TTCCGCCGGCATATCCGGCAGCAC

In [42]:
tyk2_dna_seq

'TGCCTCTGCGCCACTGGGGGATGGCCAGGGGCAGTAAGCCCGTTGGGGATGGAGCCCAGCCCATGGCTGCCATGGGAGGCCTGAAGGTGCTTCTGCACTGGGCTGGTCCAGGCGGCGGGGAGCCCTGGGTCACTTTCAGTGAGTCATCGCTGACAGCTGAGGAAGTCTGCATCCACATTGCACATAAAGTTGGTATCACTCCTCCTTGCTTCAATCTCTTTGCCCTCTTCGATGCTCAGGCCCAAGTCTGGTTGCCCCCAAACCACATCCTAGAGATCCCCAGAGATGCAAGCCTGATGCTATATTTCCGCATAAGGTTTTATTTCCGGAACTGGCATGGCATGAATCCTCGGGAACCGGCTGTGTACCGTTGTGGGCCCCCAGGAACCGAGGCATCCTCAGATCAGACAGCACAGGGGATGCAACTCCTGGACCCAGCCTCATTTGAGTACCTCTTTGAGCAGGGCAAGCATGAGTTTGTGAATGACGTGGCATCACTGTGGGAGCTGTCGACCGAGGAGGAGATCCACCACTTTAAGAATGAGAGCCTGGGCATGGCCTTTCTGCACCTCTGTCACCTCGCTCTCCGCCATGGCATCCCCCTGGAGGAGGTGGCCAAGAAGACCAGCTTCAAGGACTGCATCCCGCGCTCCTTCCGCCGGCATATCCGGCAGCACAGCGCCCTGACCCGGCTGCGCCTTCGGAACGTCTTCCGCAGGTTCCTGCGGGACTTCCAGCCGGGCCGACTCTCCCAGCAGATGGTCATGGTCAAATACCTAGCCACACTCGAGCGGCTGGCACCCCGCTTCGGCACAGAGCGTGTGCCCGTGTGCCACCTGAGGCTGCTGGCCCAGGCCGAGGGGGAGCCCTGCTACATCCGGGACAGTGGGGTGGCCCCTACAGACCCTGGCCCTGAGTCTGCTGCTGGGCCCCCAACCCACGAGGTGCTGGTGACAGGCACTGGTGGCATCCAGTGGTGGCCAGTAGAGGAGGAGGTGA

In [29]:
tyk2_aa_seq

'MPLRHWGMARGSKPVGDGAQPMAAMGGLKVLLHWAGPGGGEPWVTFSESSLTAEEVCIHIAHKVGITPPCFNLFALFDAQAQVWLPPNHILEIPRDASLMLYFRIRFYFRNWHGMNPREPAVYRCGPPGTEASSDQTAQGMQLLDPASFEYLFEQGKHEFVNDVASLWELSTEEEIHHFKNESLGMAFLHLCHLALRHGIPLEEVAKKTSFKDCIPRSFRRHIRQHSALTRLRLRNVFRRFLRDFQPGRLSQQMVMVKYLATLERLAPRFGTERVPVCHLRLLAQAEGEPCYIRDSGVAPTDPGPESAAGPPTHEVLVTGTGGIQWWPVEEEVNKEEGSSGSSGRNPQASLFGKKAKAHKAFGQPADRPREPLWAYFCDFRDITHVVLKEHCVSIHRQDNKCLELSLPSRAAALSFVSLVDGYFRLTADSSHYLCHEVAPPRLVMSIRDGIHGPLLEPFVQAKLRPEDGLYLIHWSTSHPYRLILTVAQRSQAPDGMQSLRLRKFPIEQQDGAFVLEGWGRSFPSVRELGAALQGCLLRAGDDCFSLRRCCLPQPGETSNLIIMRGARASPRTLNLSQLSFHRVDQKEITQLSHLGQGTRTNVYEGRLRVEGSGDPEEGKMDDEDPLVPGRDRGQELRVVLKVLDPSHHDIALAFYETASLMSQVSHTHLAFVHGVCVRGPENSMVTEYVEHGPLDVWLRRERGHVPMAWKMVVAQQLASALSYLENKNLVHGNVCGRNILLARLGLAEGTSPFIKLSDPGVGLGALSREERVERIPWLAPECLPGGANSLSTAMDKWGFGATLLEICFDGEAPLQSRSPSEKEHFYQRQHRLPEPSCPQLATLTSQCLTYEPTQRPSFRTILRDLTRVQPHNLADVLTVNRDSPAVGPTTFHKRYLKKIRDLGEGHFGKVSLYCYDPTNDGTGEMVAVKALKADCGPQHRSGWKQEIDILRTLYHEHIIKYKGCCEDQGEKSLQLVMEYVPLGSLRDYLPRHSIGLAQ

In [32]:
len(tyk2_aa_seq)

1187

In [43]:
len(tyk2_dna_seq)

3561

In [44]:
1187 * 3

3561

In [None]:
tyk2_dna_seq

In [50]:
# f2 = open("./sample_data/tyk2_.txt",'w') #output
# f3 = open("./sample_data/tyk2_dna.txt",'r') #input

seq = tyk2_dna_seq
i = 0
count = 0
err=0
output = []
output_encrypted = []


while i <len(seq):
    s=seq[i]+seq[i+1]+seq[i+2]
    if s == 'TTT':
        o = 'F'
        w='a'
    elif s == 'TTC':
        o = 'F'
        w = 'b'
    elif s == 'TTA':
        o = 'L'
        w = 'c'
    elif s == 'TTG':
        o = 'L'
        w = 'd'
    elif s == 'TCT':
        o = 'S'
        w = 'b'
    elif s == 'TCC':
        o = 'S'
        w = 'f'
    elif s == 'TCA':
        o = 'S'
        w = 'h'
    elif s == 'TCG':
        o = 'S'
        w = 'g'
    elif s == 'TAT':
        o = 'Y'
        w = 'c'
    elif s == 'TAC':
        o = 'Y'
        w = 'h'
    elif s == 'TAA':
        o = 'X'
        w = 'w'
    elif s == 'TAG':
        o = 'X'
        w = 'w'
    elif s == 'TGT':
        o = 'C'
        w = 'd'
    elif s == 'TGC':
        o = 'C'
        w = 'g'
    elif s == 'TGA':
        o = 'X'
        w = 'w'
    elif s == 'TGG':
        o = 'W'
        w = 'k'
    elif s == 'CTT':
        o = 'L'
        w = 'b'
    elif s == 'CTC':
        o = 'L'
        w = 'f'
    elif s == 'CTA':
        o = 'L'
        w = 'h'
    elif s == 'CTG':
        o = 'L'
        w = 'g'
    elif s == 'CCT':
        o = 'P'
        w = 'f'
    elif s == 'CCC':
        o = 'P'
        w = 'l'
    elif s == 'CCA':
        o = 'P'
        w = 'm'
    elif s == 'CCG':
        o = 'P'
        w = 'n'
    elif s == 'CAT':
        o = 'H'
        w = 'h'
    elif s == 'CAC':
        o = 'H'
        w = 'm'
    elif s == 'CAA':
        o = 'Q'
        w = 'o'
    elif s == 'CAG':
        o = 'Q'
        w = 'r'
    elif s == 'CGT':
        o = 'R'
        w = 'g'
    elif s == 'CGC':
        o = 'R'
        w = 'n'
    elif s == 'CGA':
        o = 'R'
        w = 'r'
    elif s == 'CGG':
        o = 'R'
        w = 's'
    elif s == 'ATT':
        o = 'I'
        w = 'c'
    elif s == 'ATC':
        o = 'I'
        w = 'h'
    elif s == 'ATA':
        o = 'I'
        w = 'i'
    elif s == 'ATG':
        o = 'M'
        w = 'j'
    elif s == 'ACT':
        o = 'T'
        w = 'h'
    elif s == 'ACC':
        o = 'T'
        w = 'm'
    elif s == 'ACA':
        o = 'T'
        w = 'o'
    elif s == 'ACG':
        o = 'T'
        w = 'r'
    elif s == 'AAT':
        o = 'N'
        w = 'i'
    elif s == 'AAC':
        o = 'N'
        w = 'o'
    elif s == 'AAG':
        o = 'K'
        w = 'u'
    elif s == 'AAA':
        o = 'K'
        w = 't'
    elif s == 'AGT':
        o = 'S'
        w = 'j'
    elif s == 'AGC':
        o = 'S'
        w = 'r'
    elif s == 'AGA':
        o = 'R'
        w = 'u'
    elif s == 'AGG':
        o = 'R'
        w = 'q'
    elif s == 'GTT':
        o = 'V'
        w = 'd'
    elif s == 'GTC':
        o = 'V'
        w = 'g'
    elif s == 'GTA':
        o = 'V'
        w = 'j'
    elif s == 'GTG':
        o = 'V'
        w = 'k'
    elif s == 'GCT':
        o = 'A'
        w = 'g'
    elif s == 'GCC':
        o = 'A'
        w = 'n'
    elif s == 'GCA':
        o = 'A'
        w = 'r'
    elif s == 'GCG':
        o = 'A'
        w = 's'
    elif s == 'GAT':
        o = 'D'
        w = 'j'
    elif s == 'GAC':
        o = 'D'
        w = 'r'
    elif s == 'GAA':
        o = 'E'
        w = 'u'
    elif s == 'GAG':
        o = 'E'
        w = 'q'
    elif s == 'GGT':
        o = 'G'
        w = 'k'
    elif s == 'GGC':
        o = 'G'
        w = 's'
    elif s == 'GGA':
        o = 'G'
        w = 'q'
    elif s == 'GGG':
        o = 'G'
        w = 'p'
    else:
        err = err+1
    parsed = o + " " + w
    encrypted = o + " " + 'a'
    output.append(parsed)
    output_encrypted.append(encrypted)
    i=i+3

with open('./sample_data/tyk2_parsed.txt', 'w') as f:
  f.writelines('\n'.join(output))

with open('./sample_data/tyk2_encrypted.txt', 'w') as f:
  f.writelines('\n'.join(output_encrypted))
