In [None]:
!pip install esm
!pip install biopython

Collecting esm
  Downloading esm-3.2.0-py3-none-any.whl.metadata (17 kB)
Collecting torchtext (from esm)
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting transformers<4.48.2 (from esm)
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting biotite>=1.0.0 (from esm)
  Downloading biotite-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting msgpack-numpy (from esm)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting biopython (from esm)
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting brotli (from esm)
  Downloading Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting zstd (from esm)
  Downloading zstd-1.5.6.7-cp311-c

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from esm import pretrained
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
from itertools import islice

from datasets import load_dataset

# How many records to pull from the stream. Named so the sample size can be
# changed in one place (e.g. lowered for a quick smoke test).
NUM_SEQUENCES = 25_000

# Open the UniRef50 train split in streaming mode, so records are read
# lazily as they are iterated.
streamed_dataset = load_dataset("bloyal/uniref50", split="train", streaming=True)

# Materialise only the first NUM_SEQUENCES records into an in-memory list;
# islice stops consuming the stream once that many have been taken.
dataset = list(islice(streamed_dataset, NUM_SEQUENCES))

Resolving data files:   0%|          | 0/40 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Paths shared by the cells below.
output_dir = "data"
fasta_path = "input.fasta"

# Create the output folder up front; exist_ok makes re-running this cell safe.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
from transformers import EsmModel, EsmTokenizer

# Checkpoint used for both the tokenizer and the encoder.
model_name = "facebook/esm2_t6_8M_UR50D"

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = EsmTokenizer.from_pretrained(model_name)

# Loading this checkpoint warns that the pooler weights are newly
# initialised. The embedding loop in this notebook reads only
# `last_hidden_state`, so the pooler output is not used there.
model = EsmModel.from_pretrained(model_name)

# eval() disables dropout for inference. Assigning the result (instead of
# leaving a bare expression as the last line) keeps the cell from printing
# the full module repr.
model = model.eval().to(device)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 320, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1026, 320, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-5): 6 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=320, out_features=320, bias=True)
            (key): Linear(in_features=320, out_features=320, bias=True)
            (value): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=320, out_features=320, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
    

In [None]:
# Embed every record in `dataset` with the ESM-2 encoder.
#
# For each record this cell:
#   1. saves the per-residue hidden states to
#      data/token_representations/<name>.pt,
#   2. appends a (name, sequence, path) row to `reps_data`,
#   3. appends a row of Biopython ProteinAnalysis descriptors to
#      `master_data`, unless ProteinAnalysis raises (the recorded run shows
#      this happening for sequences containing the ambiguous residue 'X').
# Both tables are written to CSV once the loop has finished.

# Output locations. token_reps_dir resolves to "data/token_representations".
output_dir = "data/"
token_reps_dir = os.path.join(output_dir, "token_representations")
os.makedirs(token_reps_dir, exist_ok=True)  # also creates output_dir

reps_data = []
master_data = []

# Run on whatever device the model already lives on instead of assuming CUDA.
model_device = model.device

for entry in tqdm(dataset):
    # Extract UniRef name (e.g. UniRef50_A0A1H6RCR4)
    name = entry["ids"].split()[0]

    # Flatten multi-line FASTA to a single sequence string, dropping header lines
    lines = entry["text"].splitlines()
    seq = "".join(line.strip() for line in lines if not line.startswith(">"))

    if not seq:
        continue

    # Tokenize and run the model. max_length counts the special tokens the
    # tokenizer adds, so longer sequences are truncated to fit.
    inputs = tokenizer(seq, return_tensors="pt", truncation=True, max_length=1022)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Keep residue positions only. Row 0 is the leading special token and the
    # last row is the trailing end-of-sequence token. When the sequence was
    # truncated, len(seq) + 1 would reach that trailing token, so the slice is
    # capped at the number of residue rows the model actually produced.
    last_hidden = outputs.last_hidden_state[0]
    n_residues = min(len(seq), last_hidden.shape[0] - 2)
    # copy=True gives the slice its own storage even when the model runs on
    # the CPU; torch.save would otherwise serialise the whole backing tensor
    # of a view.
    token_reps = last_hidden[1:n_residues + 1].to("cpu", copy=True)

    # Save as data/token_representations/UniRef50_XXXX.pt
    pt_filename = f"{name}.pt"
    rel_path = os.path.join(token_reps_dir, pt_filename)
    torch.save(token_reps, rel_path)

    # The relative path is what the CSVs reference
    reps_data.append([name, seq, rel_path])

    try:
        analysis = ProteinAnalysis(seq)
        # "Length" is the full sequence length; for truncated sequences the
        # saved tensor has fewer rows than this.
        # NOTE(review): flexibility() is stored as returned; confirm that
        # downstream readers of the CSV expect that representation.
        master_data.append([
            name, seq, rel_path, name, "", len(seq),
            token_reps.shape[1], analysis.molecular_weight(), analysis.aromaticity(),
            analysis.instability_index(), analysis.flexibility(),
            analysis.gravy(), analysis.isoelectric_point(), analysis.charge_at_pH(7.0)
        ])
    except Exception as e:
        # Best effort: the embedding and its reps_data row are already saved;
        # only the descriptor row is skipped. tqdm.write prints the message
        # without breaking the progress bar.
        tqdm.write(f"Skipping {name} due to ProteinAnalysis error: {e}")
        continue

# Save CSVs
pd.DataFrame(
    reps_data, columns=["Name", "Sequence", "Token Representations"]
).to_csv(os.path.join(output_dir, "reps_paths.csv"), index=False)
pd.DataFrame(master_data, columns=[
    "Name", "Sequence", "Token Representations", "ID", "Description", "Length",
    "Num_Features", "Molecular Weight", "Aromaticity", "Instability Index",
    "Flexibility", "GRAVY", "Isoelectric Point", "Charge at pH:7.0"
]).to_csv(os.path.join(output_dir, "master_tokens_dataframe.csv"), index=False)


  0%|          | 8/25000 [00:02<1:21:48,  5.09it/s]

Skipping UniRef50_K7G060 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 18/25000 [00:04<1:17:14,  5.39it/s]

Skipping UniRef50_UPI00244459C2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 22/25000 [00:05<1:17:42,  5.36it/s]

Skipping UniRef50_G1P5X9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 25/25000 [00:05<59:10,  7.03it/s]  

Skipping UniRef50_L5K2L4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 29/25000 [00:06<1:07:56,  6.13it/s]

Skipping UniRef50_A0A2I4CMN8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 32/25000 [00:06<1:12:21,  5.75it/s]

Skipping UniRef50_A0A8J7NXK8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 34/25000 [00:06<1:06:13,  6.28it/s]

Skipping UniRef50_A0A2I4CVD6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 43/25000 [00:08<1:06:48,  6.23it/s]

Skipping UniRef50_UPI001864B27E due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 48/25000 [00:09<1:41:34,  4.09it/s]

Skipping UniRef50_A0A6J2PCY7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 58/25000 [00:12<1:25:09,  4.88it/s]

Skipping UniRef50_UPI001643D7A3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 61/25000 [00:12<1:12:48,  5.71it/s]

Skipping UniRef50_UPI00155EA062 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 69/25000 [00:13<1:08:02,  6.11it/s]

Skipping UniRef50_UPI0024919CF4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 83/25000 [00:15<1:05:27,  6.34it/s]

Skipping UniRef50_UPI001130E85F due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 91/25000 [00:17<1:03:02,  6.59it/s]

Skipping UniRef50_A0A9D2Y405 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  0%|          | 116/25000 [00:20<49:07,  8.44it/s]

Skipping UniRef50_UPI001D06D21B due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 142/25000 [00:26<56:39,  7.31it/s]  

Skipping UniRef50_A0A2I0MII3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 151/25000 [00:27<57:00,  7.26it/s]

Skipping UniRef50_UPI001BE4D5DC due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 155/25000 [00:27<49:10,  8.42it/s]

Skipping UniRef50_UPI00203562A8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 158/25000 [00:28<48:13,  8.59it/s]

Skipping UniRef50_A0A1S6Q8L9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI000F08E794 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 178/25000 [00:30<50:24,  8.21it/s]

Skipping UniRef50_UPI0022E16D95 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 188/25000 [00:32<55:27,  7.46it/s]

Skipping UniRef50_G0MXP9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 198/25000 [00:33<57:30,  7.19it/s]

Skipping UniRef50_A0A913XPM5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 205/25000 [00:34<53:38,  7.70it/s]

Skipping UniRef50_UPI001CF73CE5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 222/25000 [00:37<1:21:25,  5.07it/s]

Skipping UniRef50_UPI001C716072 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 226/25000 [00:38<1:03:40,  6.49it/s]

Skipping UniRef50_UPI000B8E250A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 246/25000 [00:40<54:26,  7.58it/s]

Skipping UniRef50_A0A6P8IAI0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 248/25000 [00:41<52:55,  7.79it/s]

Skipping UniRef50_A0A6P4EYV9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 255/25000 [00:42<53:51,  7.66it/s]

Skipping UniRef50_UPI001F037830 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 260/25000 [00:42<53:46,  7.67it/s]

Skipping UniRef50_UPI0021496CA3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 293/25000 [00:46<40:59, 10.05it/s]

Skipping UniRef50_UPI0021C86AC4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 303/25000 [00:47<39:47, 10.35it/s]

Skipping UniRef50_A0A833W593 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|          | 307/25000 [00:47<37:20, 11.02it/s]

Skipping UniRef50_UPI001C890AC7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 315/25000 [00:48<37:47, 10.89it/s]

Skipping UniRef50_UPI0014591907 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 324/25000 [00:49<1:08:48,  5.98it/s]

Skipping UniRef50_A0A8K0CF94 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 330/25000 [00:51<1:19:09,  5.19it/s]

Skipping UniRef50_UPI000C2546D7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 341/25000 [00:53<56:16,  7.30it/s]

Skipping UniRef50_UPI0008F9B8CB due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 350/25000 [00:54<46:53,  8.76it/s]

Skipping UniRef50_A0A6J3LFW1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 354/25000 [00:54<39:43, 10.34it/s]

Skipping UniRef50_A0A357BHU0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  1%|▏         | 365/25000 [00:55<50:20,  8.16it/s]

Skipping UniRef50_A0A6J3KK64 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 381/25000 [00:58<49:34,  8.28it/s]

Skipping UniRef50_UPI000B8E9EE2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI00193D70CD due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 386/25000 [00:58<50:59,  8.04it/s]

Skipping UniRef50_UPI000E6E3136 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 389/25000 [00:59<49:21,  8.31it/s]

Skipping UniRef50_UPI0024946E24 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 397/25000 [01:00<49:19,  8.31it/s]

Skipping UniRef50_A0A8N4L651 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 401/25000 [01:00<42:41,  9.60it/s]

Skipping UniRef50_A0A383WN25 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 408/25000 [01:01<38:58, 10.52it/s]

Skipping UniRef50_UPI001D07D43B due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 417/25000 [01:02<50:13,  8.16it/s]

Skipping UniRef50_H3DPY8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 437/25000 [01:08<1:11:20,  5.74it/s]

Skipping UniRef50_A0A9C6SXT0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 441/25000 [01:09<58:00,  7.06it/s]

Skipping UniRef50_A0A815Q749 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 443/25000 [01:09<52:17,  7.83it/s]

Skipping UniRef50_UPI000C6CD619 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 456/25000 [01:10<49:49,  8.21it/s]

Skipping UniRef50_UPI00067B9069 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 459/25000 [01:11<45:08,  9.06it/s]

Skipping UniRef50_UPI0022EC4958 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 461/25000 [01:11<46:30,  8.79it/s]

Skipping UniRef50_UPI00244DE9E0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 469/25000 [01:12<43:34,  9.38it/s]

Skipping UniRef50_UPI00193E5D7D due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI002379A0F6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A834MUH9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 482/25000 [01:13<47:05,  8.68it/s]

Skipping UniRef50_UPI00083C4E40 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 490/25000 [01:14<47:36,  8.58it/s]

Skipping UniRef50_A0A6J0BQK6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 496/25000 [01:15<45:49,  8.91it/s]

Skipping UniRef50_A0A7I0ZYW4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 498/25000 [01:15<45:38,  8.95it/s]

Skipping UniRef50_UPI0007674B8B due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 502/25000 [01:16<46:43,  8.74it/s]

Skipping UniRef50_A0A8B8D257 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 504/25000 [01:16<50:54,  8.02it/s]

Skipping UniRef50_A0A8R2NTY6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 515/25000 [01:18<1:00:30,  6.74it/s]

Skipping UniRef50_A0A9C6TYK9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI0012FEEE69 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 520/25000 [01:19<1:08:02,  6.00it/s]

Skipping UniRef50_A0A913YCT5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 543/25000 [01:22<47:44,  8.54it/s]

Skipping UniRef50_UPI001CC289FD due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 555/25000 [01:23<46:06,  8.84it/s]

Skipping UniRef50_UPI002165F7C1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  2%|▏         | 614/25000 [01:30<1:02:02,  6.55it/s]

Skipping UniRef50_UPI0020D0411D due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 631/25000 [01:33<1:03:18,  6.42it/s]

Skipping UniRef50_UPI00143D178F due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 649/25000 [01:35<45:21,  8.95it/s]

Skipping UniRef50_UPI0010FADC28 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 652/25000 [01:36<44:32,  9.11it/s]

Skipping UniRef50_UPI000D6298A0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 662/25000 [01:37<43:26,  9.34it/s]

Skipping UniRef50_A0A6P8XWM1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 669/25000 [01:37<44:19,  9.15it/s]

Skipping UniRef50_UPI00083DB3F2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 677/25000 [01:38<42:38,  9.51it/s]

Skipping UniRef50_UPI000CEB30AE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 680/25000 [01:39<42:53,  9.45it/s]

Skipping UniRef50_A0A0L7LUF1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 683/25000 [01:39<42:44,  9.48it/s]

Skipping UniRef50_A0A6I8TVF9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 697/25000 [01:40<39:26, 10.27it/s]

Skipping UniRef50_UPI001E1D09C9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 709/25000 [01:42<42:52,  9.44it/s]

Skipping UniRef50_A0A814VI71 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 717/25000 [01:43<43:54,  9.22it/s]

Skipping UniRef50_A0A7F5RIX7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 729/25000 [01:44<1:05:41,  6.16it/s]

Skipping UniRef50_A0A6J2YYV3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 757/25000 [01:48<43:24,  9.31it/s]

Skipping UniRef50_A0A7M7QVF9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 800/25000 [01:53<43:54,  9.19it/s]

Skipping UniRef50_UPI0010FB14F7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 817/25000 [01:55<37:38, 10.71it/s]

Skipping UniRef50_UPI001569AC86 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 845/25000 [01:59<1:01:36,  6.53it/s]

Skipping UniRef50_UPI0013DDAC61 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  3%|▎         | 864/25000 [02:01<39:26, 10.20it/s]

Skipping UniRef50_UPI0023789547 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▎         | 910/25000 [02:07<41:35,  9.65it/s]

Skipping UniRef50_UPI0010A20A6E due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▎         | 914/25000 [02:07<41:16,  9.72it/s]

Skipping UniRef50_A0A6P8HDK5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 960/25000 [02:14<56:12,  7.13it/s]  

Skipping UniRef50_A0A8I6TM87 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 965/25000 [02:14<34:54, 11.47it/s]

Skipping UniRef50_C3Z8P2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 973/25000 [02:15<41:54,  9.56it/s]

Skipping UniRef50_UPI001CF22F47 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 990/25000 [02:17<43:02,  9.30it/s]

Skipping UniRef50_A0A8I6SGK9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1022/25000 [02:21<44:06,  9.06it/s]

Skipping UniRef50_A0A4S2KS19 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1027/25000 [02:21<41:01,  9.74it/s]

Skipping UniRef50_A0A6P7Z8W5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1032/25000 [02:22<40:47,  9.79it/s]

Skipping UniRef50_I6MZ87 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1037/25000 [02:22<40:14,  9.93it/s]

Skipping UniRef50_UPI001470C628 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1063/25000 [02:27<39:49, 10.02it/s]

Skipping UniRef50_A0A8J6HHT3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  4%|▍         | 1085/25000 [02:30<53:28,  7.45it/s]

Skipping UniRef50_UPI0009E5E2B1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▍         | 1134/25000 [02:35<42:26,  9.37it/s]

Skipping UniRef50_A0A1Q3HH56 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▍         | 1138/25000 [02:36<40:38,  9.78it/s]

Skipping UniRef50_A0A8J5ZL40 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▍         | 1219/25000 [02:49<1:37:48,  4.05it/s]

Skipping UniRef50_UPI0014430854 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▍         | 1234/25000 [02:52<40:29,  9.78it/s]

Skipping UniRef50_UPI00189713F5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▍         | 1242/25000 [02:53<37:01, 10.69it/s]

Skipping UniRef50_A0A6P7GF15 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▌         | 1309/25000 [03:03<1:27:49,  4.50it/s]

Skipping UniRef50_A0A1B0BHC7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▌         | 1317/25000 [03:06<1:21:24,  4.85it/s]

Skipping UniRef50_A0A817WHA6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  5%|▌         | 1357/25000 [03:12<53:35,  7.35it/s]

Skipping UniRef50_A0A817C6P0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1380/25000 [03:18<41:33,  9.47it/s]

Skipping UniRef50_A0A2D6VJ48 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1395/25000 [03:21<45:21,  8.68it/s]

Skipping UniRef50_UPI0018A7C2DE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1419/25000 [03:25<37:38, 10.44it/s]

Skipping UniRef50_A0A818I3J3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1479/25000 [03:38<47:13,  8.30it/s]

Skipping UniRef50_E3LUM9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001F5C0978 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1487/25000 [03:40<2:21:13,  2.77it/s]

Skipping UniRef50_A0A6P8J3K3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1492/25000 [03:41<1:28:47,  4.41it/s]

Skipping UniRef50_A0A0B7K3X6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1495/25000 [03:41<59:16,  6.61it/s]  

Skipping UniRef50_UPI0022EC2DD4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1524/25000 [03:48<44:05,  8.87it/s]

Skipping UniRef50_A0A1Q3HR82 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1527/25000 [03:48<42:34,  9.19it/s]

Skipping UniRef50_A0A1T4QC30 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  6%|▌         | 1561/25000 [03:55<37:25, 10.44it/s]

Skipping UniRef50_A0A814W4L8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1642/25000 [04:15<2:24:26,  2.70it/s]

Skipping UniRef50_UPI001CF58545 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1675/25000 [04:23<1:02:19,  6.24it/s]

Skipping UniRef50_UPI001300FACB due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1696/25000 [04:26<45:57,  8.45it/s]

Skipping UniRef50_UPI0005CF0034 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1747/25000 [04:42<4:13:29,  1.53it/s]

Skipping UniRef50_A0A3M8W9B9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1774/25000 [04:47<45:13,  8.56it/s]

Skipping UniRef50_UPI000D7276EF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  7%|▋         | 1844/25000 [05:05<3:07:12,  2.06it/s]

Skipping UniRef50_UPI00196576ED due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  8%|▊         | 1974/25000 [05:41<4:02:23,  1.58it/s]

Skipping UniRef50_UPI001955AF97 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  8%|▊         | 2004/25000 [05:48<59:28,  6.44it/s]  

Skipping UniRef50_A0A6P4Z7X0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  8%|▊         | 2032/25000 [05:54<1:53:39,  3.37it/s]

Skipping UniRef50_A0A834HZK0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001CF74E40 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  8%|▊         | 2060/25000 [06:01<1:07:54,  5.63it/s]

Skipping UniRef50_A0A7M3P2V9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▊         | 2130/25000 [06:25<2:16:15,  2.80it/s]

Skipping UniRef50_A0A0L0MYI0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▊         | 2135/25000 [06:25<1:03:13,  6.03it/s]

Skipping UniRef50_UPI001A7E18AA due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▊         | 2139/25000 [06:27<3:28:58,  1.82it/s]

Skipping UniRef50_L9JUT9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▊         | 2187/25000 [06:46<1:25:10,  4.46it/s]

Skipping UniRef50_UPI0009E21D27 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2206/25000 [06:55<2:25:49,  2.61it/s]

Skipping UniRef50_UPI00083C0204 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2210/25000 [06:56<1:20:56,  4.69it/s]

Skipping UniRef50_UPI000C6D2ADC due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_F4WE47 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2226/25000 [07:02<1:22:40,  4.59it/s]

Skipping UniRef50_A0A8K0GFM5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2231/25000 [07:04<2:17:59,  2.75it/s]

Skipping UniRef50_A0A913WYD5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2243/25000 [07:07<1:28:35,  4.28it/s]

Skipping UniRef50_UPI001B88C6AF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2244/25000 [07:08<2:00:31,  3.15it/s]

Skipping UniRef50_UPI001CFAC5AE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2264/25000 [07:16<4:26:01,  1.42it/s]

Skipping UniRef50_A0A0L8IAK7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2297/25000 [07:27<1:08:54,  5.49it/s]

Skipping UniRef50_UPI002378EA92 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2310/25000 [07:28<37:53,  9.98it/s]

Skipping UniRef50_A0A2E7EGR8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2359/25000 [07:41<1:48:33,  3.48it/s]

Skipping UniRef50_UPI00174E004A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


  9%|▉         | 2363/25000 [07:42<1:16:09,  4.95it/s]

Skipping UniRef50_A0A090M772 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2379/25000 [09:16<63:10:52, 10.05s/it]

Skipping UniRef50_A0A2I0KDF8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2387/25000 [10:53<56:53:39,  9.06s/it]

Skipping UniRef50_A0A2C6KZC2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2391/25000 [11:13<41:27:32,  6.60s/it]

Skipping UniRef50_A0A913YK56 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2405/25000 [15:58<143:07:19, 22.80s/it]

Skipping UniRef50_UPI001640FFE7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2432/25000 [16:02<37:28, 10.04it/s]

Skipping UniRef50_UPI0022B14398 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2458/25000 [16:07<52:54,  7.10it/s]  

Skipping UniRef50_UPI0012938F3D due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001747B062 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2478/25000 [16:08<30:38, 12.25it/s]

Skipping UniRef50_A0A2C6KIT0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|▉         | 2482/25000 [16:09<30:25, 12.33it/s]

Skipping UniRef50_UPI0011E9DDCD due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2504/25000 [16:11<31:15, 11.99it/s]

Skipping UniRef50_UPI00193EC7D3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2522/25000 [16:13<43:13,  8.67it/s]

Skipping UniRef50_A0A814VIW3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2528/25000 [16:13<46:26,  8.07it/s]

Skipping UniRef50_UPI00083CF26B due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2546/25000 [16:17<40:36,  9.21it/s]

Skipping UniRef50_A0A1B3SNY6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001BEABFEE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2562/25000 [16:19<31:15, 11.97it/s]

Skipping UniRef50_A0A074ZTN8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2600/25000 [16:22<30:46, 12.13it/s]

Skipping UniRef50_UPI00083C2150 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2602/25000 [16:22<30:42, 12.16it/s]

Skipping UniRef50_A0A918KBD4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI000F7D4C50 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 10%|█         | 2622/25000 [16:24<31:19, 11.91it/s]

Skipping UniRef50_UPI001176BFB0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2643/25000 [16:27<47:02,  7.92it/s]

Skipping UniRef50_A0A2P9GRV6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2645/25000 [16:27<46:17,  8.05it/s]

Skipping UniRef50_A0A6J0C6T2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2668/25000 [16:29<40:46,  9.13it/s]

Skipping UniRef50_A0A7R5KPG4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2703/25000 [16:33<29:13, 12.72it/s]

Skipping UniRef50_A0A2A5AI72 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2726/25000 [16:36<34:14, 10.84it/s]

Skipping UniRef50_UPI000A1D09B1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2746/25000 [16:38<38:29,  9.64it/s]

Skipping UniRef50_A0A9Q9YRX1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2749/25000 [16:40<2:32:10,  2.44it/s]

Skipping UniRef50_W3VCZ3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2758/25000 [16:42<51:49,  7.15it/s]  

Skipping UniRef50_UPI002264C7DB due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2764/25000 [16:42<39:09,  9.46it/s]

Skipping UniRef50_UPI002434C197 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2774/25000 [16:43<32:13, 11.50it/s]

Skipping UniRef50_A0A1Q3HSB7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2776/25000 [16:43<31:05, 11.91it/s]

Skipping UniRef50_A0A7W5DYB3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2792/25000 [16:45<42:17,  8.75it/s]

Skipping UniRef50_A0A814IQ99 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█         | 2808/25000 [16:46<30:11, 12.25it/s]

Skipping UniRef50_A0A913YU04 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 11%|█▏        | 2829/25000 [16:48<27:05, 13.64it/s]

Skipping UniRef50_A0A814XNW6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 2898/25000 [16:56<29:10, 12.63it/s]

Skipping UniRef50_UPI0010BDD81E due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 2937/25000 [17:02<1:20:52,  4.55it/s]

Skipping UniRef50_UPI00234E9155 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 2949/25000 [17:03<35:37, 10.32it/s]

Skipping UniRef50_UPI000F092CAF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 3010/25000 [17:12<1:50:37,  3.31it/s]

Skipping UniRef50_UPI001E1CB16F due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 3025/25000 [17:13<33:58, 10.78it/s]

Skipping UniRef50_UPI002548CAD5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 3059/25000 [17:16<28:40, 12.75it/s]

Skipping UniRef50_UPI001EEB5939 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▏        | 3119/25000 [17:24<29:55, 12.19it/s]

Skipping UniRef50_UPI001CF749D6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 12%|█▎        | 3125/25000 [17:24<28:18, 12.88it/s]

Skipping UniRef50_A0A6P8IIQ5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A3Q0IN39 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3146/25000 [17:26<26:33, 13.72it/s]

Skipping UniRef50_A0A2D7H9J3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3184/25000 [17:30<1:09:56,  5.20it/s]

Skipping UniRef50_A0A9W6ZRQ0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3215/25000 [17:33<29:19, 12.38it/s]

Skipping UniRef50_A0A2E4YVJ6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3240/25000 [17:38<27:49, 13.03it/s]

Skipping UniRef50_A0A834IL42 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001BFCFBD7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3248/25000 [17:38<29:15, 12.39it/s]

Skipping UniRef50_A0A1I8P2P9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3311/25000 [17:45<42:34,  8.49it/s]

Skipping UniRef50_A0A6P8VLA0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 13%|█▎        | 3314/25000 [17:45<36:19,  9.95it/s]

Skipping UniRef50_A0A397GK35 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▎        | 3396/25000 [17:54<25:06, 14.34it/s]

Skipping UniRef50_UPI00234E32ED due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A840B0F6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▎        | 3414/25000 [17:58<33:22, 10.78it/s]

Skipping UniRef50_UPI00235AAE89 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▎        | 3418/25000 [17:58<31:59, 11.25it/s]

Skipping UniRef50_UPI0021C6CBBC due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▎        | 3428/25000 [17:59<29:48, 12.06it/s]

Skipping UniRef50_A0A815FFV0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3442/25000 [18:00<35:29, 10.12it/s]

Skipping UniRef50_UPI002403A8AC due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3481/25000 [18:03<27:29, 13.05it/s]

Skipping UniRef50_A0A3N0Y4D8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3509/25000 [18:06<27:41, 12.94it/s]

Skipping UniRef50_A0A1Q9EH80 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3524/25000 [18:09<2:03:21,  2.90it/s]

Skipping UniRef50_UPI0007B850A2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3525/25000 [18:10<2:33:31,  2.33it/s]

Skipping UniRef50_UPI000782273A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3540/25000 [18:12<34:55, 10.24it/s]

Skipping UniRef50_UPI000B457A43 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI00146E8E65 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3554/25000 [18:13<32:30, 11.00it/s]

Skipping UniRef50_A0A2I0JZ66 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3571/25000 [18:16<36:01,  9.92it/s]

Skipping UniRef50_N6U3Z2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 14%|█▍        | 3613/25000 [18:23<33:22, 10.68it/s]

Skipping UniRef50_UPI0023B8B695 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▍        | 3670/25000 [18:27<24:33, 14.48it/s]

Skipping UniRef50_UPI001864DAD5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▍        | 3719/25000 [18:35<42:05,  8.43it/s]

Skipping UniRef50_A0A816SK52 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3754/25000 [18:38<26:47, 13.22it/s]

Skipping UniRef50_UPI0022420EFC due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3774/25000 [18:43<31:11, 11.34it/s]

Skipping UniRef50_A0A4Q7JLR3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3784/25000 [18:43<27:41, 12.77it/s]

Skipping UniRef50_A0A4Q7JMT6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3786/25000 [18:43<27:05, 13.05it/s]

Skipping UniRef50_A0A2E7PW77 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3793/25000 [18:46<1:31:38,  3.86it/s]

Skipping UniRef50_A0A814BFQ5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3843/25000 [18:53<30:29, 11.56it/s]

Skipping UniRef50_A0A1Q9CHN3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 15%|█▌        | 3847/25000 [18:54<29:35, 11.92it/s]

Skipping UniRef50_UPI0024B4CA01 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3882/25000 [18:59<32:38, 10.78it/s]

Skipping UniRef50_A0A7E4W4F4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A2E2C1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3904/25000 [19:03<33:21, 10.54it/s]

Skipping UniRef50_UPI002223227D due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3932/25000 [19:08<31:16, 11.23it/s]

Skipping UniRef50_UPI001E1CD444 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3958/25000 [19:12<42:38,  8.22it/s]

Skipping UniRef50_A0A2E3H6V5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3961/25000 [19:12<37:14,  9.41it/s]

Skipping UniRef50_A0A6P8I808 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_D5GN99 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3971/25000 [19:14<39:19,  8.91it/s]

Skipping UniRef50_UPI00226445AF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 3979/25000 [19:17<1:24:35,  4.14it/s]

Skipping UniRef50_A0A2E5LU35 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4001/25000 [19:20<1:38:42,  3.55it/s]

Skipping UniRef50_UPI00145822C8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4006/25000 [19:21<1:08:53,  5.08it/s]

Skipping UniRef50_A0A0Q4LN61 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001CA87F49 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4015/25000 [19:21<29:27, 11.87it/s]

Skipping UniRef50_A0A0N8WHY8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4029/25000 [19:23<28:47, 12.14it/s]

Skipping UniRef50_A0A814LRF8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI002544A7BF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4037/25000 [19:25<1:03:36,  5.49it/s]

Skipping UniRef50_A0A3N0Y4E8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4045/25000 [19:26<35:13,  9.91it/s]

Skipping UniRef50_A0A814LHL2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4059/25000 [19:27<28:43, 12.15it/s]

Skipping UniRef50_A0A1Z8VFW6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▌        | 4061/25000 [19:27<27:40, 12.61it/s]

Skipping UniRef50_A0A8S1KZP9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▋        | 4069/25000 [19:28<26:50, 13.00it/s]

Skipping UniRef50_A0A8B8CRU4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▋        | 4087/25000 [19:32<37:12,  9.37it/s]

Skipping UniRef50_A0A8N5F0Q8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 16%|█▋        | 4099/25000 [19:33<30:46, 11.32it/s]

Skipping UniRef50_A0A8B8CGQ7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4138/25000 [19:40<36:48,  9.45it/s]

Skipping UniRef50_UPI001CF246BF due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4164/25000 [19:44<43:13,  8.03it/s]

Skipping UniRef50_A0A2J7ZUW5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4178/25000 [19:48<59:42,  5.81it/s]  

Skipping UniRef50_UPI0018E2C29F due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4186/25000 [19:48<33:39, 10.30it/s]

Skipping UniRef50_A0A8R1HYK5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4195/25000 [19:50<1:47:56,  3.21it/s]

Skipping UniRef50_UPI00146F116B due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4223/25000 [19:53<25:13, 13.72it/s]

Skipping UniRef50_UPI00273B474A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4231/25000 [19:54<26:15, 13.18it/s]

Skipping UniRef50_A0A6H5K0L4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4268/25000 [19:58<25:50, 13.37it/s]

Skipping UniRef50_A0A0S7Z2A6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4272/25000 [19:58<26:30, 13.04it/s]

Skipping UniRef50_A0A212D0V4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4281/25000 [20:02<1:22:54,  4.17it/s]

Skipping UniRef50_UPI002158A4E3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 17%|█▋        | 4320/25000 [20:07<27:13, 12.66it/s]

Skipping UniRef50_A0A2I0JUM4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A2E3L5R5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4384/25000 [20:17<28:25, 12.08it/s]

Skipping UniRef50_A0A2W1BNE8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4403/25000 [20:23<45:43,  7.51it/s]

Skipping UniRef50_A0A836EZ19 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4413/25000 [20:24<31:18, 10.96it/s]

Skipping UniRef50_UPI0013C45282 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4460/25000 [20:33<36:12,  9.46it/s]

Skipping UniRef50_A0A7I0ZTF4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4497/25000 [20:38<28:31, 11.98it/s]

Skipping UniRef50_A0A2D5Y8W9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI00174D66BE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4499/25000 [20:39<1:22:27,  4.14it/s]

Skipping UniRef50_A0A814WZ62 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4526/25000 [20:46<2:46:36,  2.05it/s]

Skipping UniRef50_UPI0023501412 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4538/25000 [20:47<41:18,  8.26it/s]

Skipping UniRef50_UPI001EE596B5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4588/25000 [20:55<1:09:41,  4.88it/s]

Skipping UniRef50_UPI00067DA091 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4589/25000 [20:56<1:43:03,  3.30it/s]

Skipping UniRef50_UPI001AE4A6FE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 18%|█▊        | 4606/25000 [20:58<25:19, 13.42it/s]

Skipping UniRef50_UPI000F676DC4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▊        | 4649/25000 [21:05<59:17,  5.72it/s]  

Skipping UniRef50_A0A6P8QWY4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▊        | 4656/25000 [21:06<40:20,  8.40it/s]

Skipping UniRef50_A0A2E6LH85 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▊        | 4658/25000 [21:06<35:55,  9.44it/s]

Skipping UniRef50_A0A2I0HSV6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▊        | 4668/25000 [21:09<2:19:20,  2.43it/s]

Skipping UniRef50_UPI000B7D9673 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4719/25000 [21:22<1:49:56,  3.07it/s]

Skipping UniRef50_A0A6P8H4W4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4729/25000 [21:22<37:31,  9.00it/s]

Skipping UniRef50_A0A0K0PDD1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4751/25000 [21:24<33:13, 10.16it/s]

Skipping UniRef50_UPI002443E628 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4765/25000 [21:28<44:10,  7.63it/s]

Skipping UniRef50_A0A2A5AIY5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4773/25000 [21:29<32:18, 10.44it/s]

Skipping UniRef50_A0A6B7HGV7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI000D72D31C due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4796/25000 [21:33<29:36, 11.38it/s]

Skipping UniRef50_A0A814BCZ8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4844/25000 [21:43<40:33,  8.28it/s]

Skipping UniRef50_A0A8B7WX62 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4848/25000 [21:44<37:04,  9.06it/s]

Skipping UniRef50_UPI0004D04517 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4849/25000 [21:44<42:20,  7.93it/s]

Skipping UniRef50_A0A9D2Z0P5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4860/25000 [21:47<44:01,  7.63it/s]

Skipping UniRef50_L5LN39 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4866/25000 [21:47<31:20, 10.71it/s]

Skipping UniRef50_A0A2E4ZUD2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A814HZ30 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 19%|█▉        | 4872/25000 [21:48<35:11,  9.53it/s]

Skipping UniRef50_A0A8D0JTI7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4881/25000 [21:52<2:00:11,  2.79it/s]

Skipping UniRef50_UPI001425A330 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4924/25000 [21:59<1:25:12,  3.93it/s]

Skipping UniRef50_A0A177Y6E0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4930/25000 [22:03<2:11:16,  2.55it/s]

Skipping UniRef50_UPI0023A90170 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4953/25000 [22:07<1:33:55,  3.56it/s]

Skipping UniRef50_A0A8T0DTJ6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4975/25000 [22:11<1:17:09,  4.33it/s]

Skipping UniRef50_A0A4S2KPJ3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|█▉        | 4987/25000 [22:12<30:23, 10.97it/s]

Skipping UniRef50_A0A1V2IN36 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5012/25000 [22:18<35:21,  9.42it/s]

Skipping UniRef50_A0A817TX69 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5029/25000 [22:22<43:12,  7.70it/s]

Skipping UniRef50_A0A8S1XJJ5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5039/25000 [22:23<33:37,  9.90it/s]

Skipping UniRef50_A0A812ZVG7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5041/25000 [22:23<58:02,  5.73it/s]

Skipping UniRef50_UPI001CE1EB51 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5064/25000 [22:27<26:31, 12.53it/s]

Skipping UniRef50_A0A4S2JUD3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5079/25000 [22:31<1:36:55,  3.43it/s]

Skipping UniRef50_A0A1Z9AUD8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 20%|██        | 5116/25000 [22:37<27:50, 11.90it/s]

Skipping UniRef50_A0A814PWL9 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5142/25000 [22:42<30:46, 10.75it/s]

Skipping UniRef50_A0A6P3Y6J6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI000FFDAD25 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5146/25000 [22:43<35:34,  9.30it/s]

Skipping UniRef50_UPI001E1C5245 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5162/25000 [22:48<43:11,  7.65it/s]

Skipping UniRef50_UPI0024464A07 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_UPI001C0908C1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_L9LCN4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5189/25000 [22:53<29:59, 11.01it/s]

Skipping UniRef50_A0A6P6DQU7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5219/25000 [22:57<29:10, 11.30it/s]

Skipping UniRef50_A0A6J2V6Z7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5225/25000 [22:58<33:30,  9.84it/s]

Skipping UniRef50_A0A6H5J981 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5239/25000 [23:02<1:42:55,  3.20it/s]

Skipping UniRef50_C3XRB2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5282/25000 [23:09<1:47:46,  3.05it/s]

Skipping UniRef50_A0A815FI60 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██        | 5295/25000 [23:13<45:58,  7.14it/s]

Skipping UniRef50_A0A9J7EP48 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██▏       | 5329/25000 [23:20<2:40:14,  2.05it/s]

Skipping UniRef50_A0A2I4CRW1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██▏       | 5339/25000 [23:23<50:06,  6.54it/s]  

Skipping UniRef50_A0A2I0IZ70 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██▏       | 5355/25000 [23:27<1:15:55,  4.31it/s]

Skipping UniRef50_A0A674KF94 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 21%|██▏       | 5363/25000 [23:27<36:50,  8.88it/s]

Skipping UniRef50_A0A2I0J7E0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5379/25000 [23:33<1:23:40,  3.91it/s]

Skipping UniRef50_A0A913Y9B5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5454/25000 [23:48<35:10,  9.26it/s]

Skipping UniRef50_A0A9B0UBH8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5498/25000 [23:57<26:17, 12.36it/s]

Skipping UniRef50_Q6VT95 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A2E6WA18 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5513/25000 [24:02<59:12,  5.48it/s]  

Skipping UniRef50_A0A0L0CHI6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A1I8CUS4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5517/25000 [24:02<41:43,  7.78it/s]

Skipping UniRef50_A0A699GH69 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5521/25000 [24:02<33:11,  9.78it/s]

Skipping UniRef50_UPI000811724A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5530/25000 [24:05<1:28:59,  3.65it/s]

Skipping UniRef50_A0A4S2KQT3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 22%|██▏       | 5574/25000 [24:16<3:00:54,  1.79it/s]

Skipping UniRef50_A0A1Y5LM59 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5627/25000 [24:27<35:42,  9.04it/s]

Skipping UniRef50_A0A914CIL8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5656/25000 [24:34<52:02,  6.19it/s]

Skipping UniRef50_A0A341BXP8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5682/25000 [24:42<2:21:11,  2.28it/s]

Skipping UniRef50_A0A4S2KA88 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5696/25000 [24:44<1:17:39,  4.14it/s]

Skipping UniRef50_A0A6P8I2Z0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5721/25000 [24:53<2:11:00,  2.45it/s]

Skipping UniRef50_UPI002452E7C8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5733/25000 [24:54<1:03:29,  5.06it/s]

Skipping UniRef50_A0A2I0J829 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5739/25000 [24:57<1:26:50,  3.70it/s]

Skipping UniRef50_UPI001786BF4C due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5791/25000 [25:10<1:44:33,  3.06it/s]

Skipping UniRef50_A0A7M5V3U3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5816/25000 [25:17<2:01:58,  2.62it/s]

Skipping UniRef50_A0A8C4PEP8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5837/25000 [25:22<54:23,  5.87it/s]  

Skipping UniRef50_A0A2K6G2G8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5856/25000 [25:24<34:53,  9.14it/s]

Skipping UniRef50_A0A8S1TEI5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 23%|██▎       | 5867/25000 [25:28<55:11,  5.78it/s]  

Skipping UniRef50_A0A171EBL6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▎       | 5904/25000 [25:36<44:34,  7.14it/s]

Skipping UniRef50_A0A663MLB6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▎       | 5912/25000 [25:37<35:03,  9.07it/s]

Skipping UniRef50_A0A016U4J2 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▎       | 5915/25000 [25:37<53:59,  5.89it/s]

Skipping UniRef50_Q4RE89 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▎       | 5926/25000 [25:42<1:08:58,  4.61it/s]

Skipping UniRef50_A0A357BMC7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 5938/25000 [25:46<2:46:27,  1.91it/s]

Skipping UniRef50_A0A6P7ZPT3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 5954/25000 [25:50<39:41,  8.00it/s]

Skipping UniRef50_A0A1S3F5A6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 5963/25000 [25:51<36:25,  8.71it/s]

Skipping UniRef50_UPI0023066CCA due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 5967/25000 [25:51<34:44,  9.13it/s]

Skipping UniRef50_A0A2E8GNA3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 5973/25000 [25:53<1:07:44,  4.68it/s]

Skipping UniRef50_A0A345GNZ4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 6006/25000 [26:02<2:38:12,  2.00it/s]

Skipping UniRef50_UPI000C6D93F7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 6028/25000 [26:07<48:41,  6.49it/s]  

Skipping UniRef50_Q4S6Y5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 6065/25000 [26:17<44:34,  7.08it/s]

Skipping UniRef50_A0A2E3EKV3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 6074/25000 [26:18<38:38,  8.16it/s]

Skipping UniRef50_UPI001CC82850 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 24%|██▍       | 6094/25000 [26:25<1:44:46,  3.01it/s]

Skipping UniRef50_A0A2Y9IFC6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6126/25000 [26:33<1:14:59,  4.19it/s]

Skipping UniRef50_A0A2Y9G3B7 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6128/25000 [26:34<54:48,  5.74it/s]  

Skipping UniRef50_A0A2I4CIU3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein
Skipping UniRef50_A0A3S4BTG6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6132/25000 [26:34<42:26,  7.41it/s]

Skipping UniRef50_A0A060W9W1 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6177/25000 [26:44<2:18:43,  2.26it/s]

Skipping UniRef50_UPI000B50CCE8 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6215/25000 [26:54<42:07,  7.43it/s]

Skipping UniRef50_UPI0004BD92A4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6230/25000 [26:59<50:49,  6.16it/s]

Skipping UniRef50_A0A5P9K4W0 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▍       | 6245/25000 [27:02<41:08,  7.60it/s]

Skipping UniRef50_O96554 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▌       | 6255/25000 [27:06<1:34:50,  3.29it/s]

Skipping UniRef50_A0A0K0G019 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▌       | 6298/25000 [27:18<1:08:25,  4.56it/s]

Skipping UniRef50_G3RAL6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▌       | 6308/25000 [27:23<1:12:20,  4.31it/s]

Skipping UniRef50_A0A914V448 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 25%|██▌       | 6368/25000 [27:44<1:32:33,  3.36it/s]

Skipping UniRef50_A0A8S1GMY3 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6375/25000 [27:45<1:41:12,  3.07it/s]

Skipping UniRef50_UPI001C2EB754 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6392/25000 [27:51<2:59:21,  1.73it/s]

Skipping UniRef50_UPI00073FBBB4 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6426/25000 [28:03<1:10:39,  4.38it/s]

Skipping UniRef50_A0A913XZ57 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6462/25000 [28:18<5:33:34,  1.08s/it]

Skipping UniRef50_A0A0P4X051 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6470/25000 [28:19<1:13:29,  4.20it/s]

Skipping UniRef50_UPI000D6262EE due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6481/25000 [28:22<41:03,  7.52it/s]

Skipping UniRef50_UPI0021E296F6 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6483/25000 [28:22<37:58,  8.13it/s]

Skipping UniRef50_A0A011PZ21 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6504/25000 [28:32<1:50:46,  2.78it/s]

Skipping UniRef50_A0A2E4WQ50 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6517/25000 [28:36<1:57:30,  2.62it/s]

Skipping UniRef50_UPI0018A766D5 due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6525/25000 [28:37<50:04,  6.15it/s]

Skipping UniRef50_UPI0011765B6A due to ProteinAnalysis error: ''X'' is not a valid unambiguous letter for protein


 26%|██▌       | 6556/25000 [28:50<2:26:29,  2.10it/s]

In [None]:
# Peek at the first loaded record to see which fields each entry exposes.
first_record = dataset[0]
print(first_record.keys())


dict_keys(['ids', 'text'])


In [None]:
!zip -r dataset25k.zip data/


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: data/token_representations/UniRef50_A0A369SCI2.pt (deflated 7%)
  adding: data/token_representations/UniRef50_H3D3Z4.pt (deflated 7%)
  adding: data/token_representations/UniRef50_UPI0022B6AE3B.pt (deflated 7%)
  adding: data/token_representations/UniRef50_A0A9Q9XUU9.pt (deflated 7%)
  adding: data/token_representations/UniRef50_A0A977KQ20.pt (deflated 7%)
  adding: data/token_representations/UniRef50_B4N0D6.pt (deflated 7%)
  adding: data/token_representations/UniRef50_U2HBY5.pt (deflated 7%)
  adding: data/token_representations/UniRef50_UPI0025FFCD96.pt (deflated 7%)
  adding: data/token_representations/UniRef50_A0A2U1U5Y8.pt (deflated 8%)
  adding: data/token_representations/UniRef50_K8FEQ1.pt (deflated 7%)
  adding: data/token_representations/UniRef50_A0A0G4HRG7.pt (deflated 7%)
  adding: data/token_representations/UniRef50_UPI0023B7D512.pt (deflated 7%)
  adding: data/token_representations/UniRef50_UPI00187

In [None]:
# Attach Google Drive to the Colab filesystem so results can be copied to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir -p "/content/drive/My Drive/ColabBackups"
!cp -r data/ "/content/drive/My Drive/ColabBackups/"

         35.92G 100%    1.21MB/s    7:52:30 (xfr#25002, to-chk=0/25004)
