# Prepare Datasets for Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2022-10-05  


## Dataset 1. Enzyme / Non-enzyme Dataset
The enzyme dataset consists of two parts: <u>a training set</u> and <u>a testing set</u>.   
The training set is taken from the Feb-2018 snapshot and ***excludes*** items that were <u>deleted</u> or whose <u>sequences changed</u> in the Feb-2022 snapshot.    
The training set consists of ***469,134*** records, of which ***222,567*** are enzymes and ***246,567*** are non-enzymes.   
The testing set is taken from the Feb-2022 snapshot and excludes items that already appeared in the Feb-2018 snapshot.   
The testing set consists of ***10,614*** records, of which ***5,111*** are enzymes and ***5,503*** are non-enzymes.   
Unlike previous works, we did not filter any sequences by length or homology, in order to make the data more inclusive. Each sequence is labelled 1 for enzyme and 0 for non-enzyme.   

## Dataset2. Enzyme Function Quantity Dataset
The enzyme quantity dataset contains only enzyme data, ***222,567*** records in total. The function quantity ranges from 1 to 8.

## Dataset 3: EC Dataset

The EC dataset consists of 227,678 enzyme records, of which 222,567 form the training set and the remaining 5,111 form the testing set, covering 6,031 EC numbers. As of Feb 2022, ***compared with [ExplorEnz](https://www.enzyme-database.org/stats.php) CURRENT EC = 6674***, there are still 643 EC numbers that the model cannot handle in the benchmark. Thus, we exclude the sequences with these 267 EC numbers from the evaluation process. However, this problem can be resolved in the production scenario because we use the entire data from Swiss-Prot. The EC coverage is currently 6,031 and can be extended automatically, because the model is retrained whenever Swiss-Prot publishes a new release (every 8 weeks). 

## 1. Import packages

In [5]:
import numpy as np
import pandas as pd
import sys,os
from tqdm import tqdm
import config as cfg
from functools import reduce

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import funclib
from tools import minitools as mtool
# from tools import embedding_esm as esmebd
from tools import embdding_onehot as onehotebd

from pandarallel import pandarallel # import pandarallel
# Initialise pandarallel with its progress bar switched off.
pandarallel.initialize(progress_bar=False)


# Load the IPython autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
# Print the current working directory followed by a backslash.
print(f'{os.getcwd()}\\')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
c:\Users\Erfan\Downloads\Saeedeh\ECRECer\


## 2. Download raw data from UniProt

> If this is the first run, please uncomment the cell below.

In [6]:
# Fetch every Swiss-Prot snapshot archive referenced by the project config.
# Each entry pairs a source URL with the local file it is saved to; the
# trailing labels follow the date encoded in the config constant names.
sprot_snapshot_downloads = (
    (cfg.URL_SPROT_SNAP201802, cfg.FILE_SPROT_SNAP201802),  # snapshot 2018-02
    (cfg.URL_SPROT_SNAP201902, cfg.FILE_SPROT_SNAP201902),  # snapshot 2019-02
    (cfg.URL_SPROT_SNAP202006, cfg.FILE_SPROT_SNAP202006),  # snapshot 2020-06
    (cfg.URL_SPROT_SNAP202102, cfg.FILE_SPROT_SNAP202102),  # snapshot 2021-02
    (cfg.URL_SPROT_SNAP202202, cfg.FILE_SPROT_SNAP202202),  # snapshot 2022-02
)

for sprot_snapshot_url, sprot_snapshot_file in sprot_snapshot_downloads:
    ftool.wget(download_url=sprot_snapshot_url, save_file=sprot_snapshot_file)

wget not installed
wget not installed
wget not installed
wget not installed
wget not installed


## 3. Extract records from rawdata

In [26]:
# Build the shell commands that unpack each snapshot archive.
# For every snapshot, three commands are queued in this order:
#   1. extract the archive into the UniProt working directory,
#   2. rename the extracted .dat.gz dump to a per-year file name,
#   3. delete the extracted FASTA / varsplic / XML files.
# NOTE: `move` and `del /Q /F` are Windows cmd built-ins.
sprot_snapshot_archives = (
    (cfg.FILE_SPROT_SNAP201802, 2018),
    (cfg.FILE_SPROT_SNAP201902, 2019),
    (cfg.FILE_SPROT_SNAP202006, 2020),
    (cfg.FILE_SPROT_SNAP202102, 2021),
    (cfg.FILE_SPROT_SNAP202202, 2022),
)

cmd_array = []
for sprot_archive, sprot_year in sprot_snapshot_archives:
    cmd_array.extend((
        f'tar -zxvf {sprot_archive} -C {cfg.DIR_UNIPROT}',
        f'move {cfg.DIR_UNIPROT}uniprot_sprot.dat.gz {cfg.DIR_UNIPROT}sprot{sprot_year}.data.gz',
        f'del /Q /F {cfg.DIR_UNIPROT}uniprot_sprot.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot_varsplic.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot.xml.gz',
    ))

# Run the commands one after another; the cell displays their exit codes.
list(map(os.system, cmd_array))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [27]:
# Run the extraction task on each renamed snapshot dump, writing one TSV per year.
for sprot_dump_year in (2018, 2019, 2020, 2021, 2022):
    exactec.run_exact_task(
        infile=f'{cfg.DIR_UNIPROT}sprot{sprot_dump_year}.data.gz',
        outfile=f'{cfg.DIR_UNIPROT}sprot{sprot_dump_year}.tsv',
    )


0it [00:00, ?it/s][A
401it [00:00, 4009.76it/s][A
802it [00:00, 2782.48it/s][A
1108it [00:00, 2884.55it/s][A
1513it [00:00, 3275.23it/s][A
1966it [00:00, 3676.25it/s][A
2347it [00:00, 3121.22it/s][A
2679it [00:00, 2963.02it/s][A
2989it [00:00, 2952.16it/s][A
3294it [00:01, 2780.10it/s][A
3579it [00:01, 2754.92it/s][A
3859it [00:01, 2557.23it/s][A
4120it [00:01, 2097.88it/s][A
4344it [00:01, 1968.62it/s][A
4556it [00:01, 2004.39it/s][A
4765it [00:01, 1971.13it/s][A
5106it [00:01, 2345.93it/s][A
5451it [00:02, 2638.61it/s][A
5767it [00:02, 2776.13it/s][A
6053it [00:02, 2784.21it/s][A
6337it [00:02, 2626.52it/s][A
6605it [00:02, 2247.51it/s][A
6860it [00:02, 2318.09it/s][A
7102it [00:02, 2161.91it/s][A
7327it [00:02, 2124.28it/s][A
7754it [00:02, 2684.16it/s][A
8240it [00:03, 3270.32it/s][A
8580it [00:03, 3216.50it/s][A
8911it [00:03, 2930.46it/s][A
9215it [00:03, 2761.44it/s][A
9500it [00:03, 2699.81it/s][A
9778it [00:03, 2706.79it/s][A
10053it [00:03, 23

144453it [00:56, 2689.04it/s][A
144748it [00:56, 2758.11it/s][A
145229it [00:57, 3366.28it/s][A
145568it [00:57, 3131.16it/s][A
145886it [00:57, 2669.28it/s][A
146261it [00:57, 2945.24it/s][A
146574it [00:57, 2986.70it/s][A
146884it [00:57, 2898.06it/s][A
147323it [00:57, 3298.56it/s][A
147662it [00:57, 3164.16it/s][A
147986it [00:58, 2971.46it/s][A
148290it [00:58, 2919.17it/s][A
148587it [00:58, 2683.88it/s][A
148861it [00:58, 2453.42it/s][A
149144it [00:58, 2542.84it/s][A
149404it [00:58, 2498.20it/s][A
149658it [00:58, 2417.79it/s][A
149903it [00:58, 2350.00it/s][A
150187it [00:58, 2477.05it/s][A
150437it [00:59, 2256.00it/s][A
150765it [00:59, 2528.98it/s][A
151025it [00:59, 2486.85it/s][A
151319it [00:59, 2605.45it/s][A
151584it [00:59, 2603.32it/s][A
151912it [00:59, 2789.58it/s][A
152194it [00:59, 2766.29it/s][A
152474it [00:59, 2768.06it/s][A
152753it [00:59, 2516.50it/s][A
153010it [01:00, 2291.72it/s][A
153246it [01:00, 2172.80it/s][A
153513it [

295218it [01:52, 2140.70it/s][A
295438it [01:52, 2099.52it/s][A
295758it [01:52, 2402.34it/s][A
296004it [01:52, 2320.34it/s][A
296240it [01:53, 2113.32it/s][A
296457it [01:53, 2041.66it/s][A
296699it [01:53, 2137.05it/s][A
297006it [01:53, 2393.72it/s][A
297307it [01:53, 2560.47it/s][A
297623it [01:53, 2724.62it/s][A
297937it [01:53, 2836.85it/s][A
298286it [01:53, 3019.01it/s][A
298591it [01:53, 2900.23it/s][A
298884it [01:53, 2867.65it/s][A
299186it [01:54, 2911.48it/s][A
299479it [01:54, 2883.27it/s][A
299769it [01:54, 2613.74it/s][A
300227it [01:54, 3143.47it/s][A
300634it [01:54, 3403.21it/s][A
300982it [01:54, 3222.56it/s][A
301312it [01:54, 2876.55it/s][A
301611it [01:54, 2803.77it/s][A
301899it [01:55, 2560.62it/s][A
302205it [01:55, 2687.64it/s][A
302503it [01:55, 2758.27it/s][A
302785it [01:55, 2672.48it/s][A
303057it [01:55, 2656.34it/s][A
303332it [01:55, 2675.06it/s][A
303602it [01:55, 2622.39it/s][A
303946it [01:55, 2847.05it/s][A
304233it [

469356it [02:48, 2245.07it/s][A
469617it [02:48, 2342.76it/s][A
469854it [02:48, 2350.62it/s][A
470102it [02:48, 2388.40it/s][A
470435it [02:49, 2657.81it/s][A
470780it [02:49, 2891.86it/s][A
471071it [02:49, 2830.07it/s][A
471383it [02:49, 2914.92it/s][A
471708it [02:49, 3013.43it/s][A
472011it [02:49, 2670.39it/s][A
472286it [02:49, 2507.56it/s][A
472544it [02:49, 2513.47it/s][A
472859it [02:49, 2688.12it/s][A
473181it [02:50, 2829.95it/s][A
473516it [02:50, 2978.84it/s][A
473818it [02:50, 2898.25it/s][A
474111it [02:50, 2530.21it/s][A
474374it [02:50, 2398.84it/s][A
474622it [02:50, 2407.39it/s][A
474939it [02:50, 2606.43it/s][A
475206it [02:50, 2595.25it/s][A
475470it [02:50, 2564.49it/s][A
475730it [02:51, 2397.69it/s][A
475974it [02:51, 2243.31it/s][A
476203it [02:51, 2249.77it/s][A
476532it [02:51, 2529.57it/s][A
476790it [02:51, 2246.10it/s][A
477036it [02:51, 2302.18it/s][A
477287it [02:51, 2352.48it/s][A
477602it [02:51, 2568.41it/s][A
477939it [

finished use time 194.625 s



0it [00:00, ?it/s][A
410it [00:00, 4099.78it/s][A
820it [00:00, 2743.52it/s][A
1169it [00:00, 3001.46it/s][A
1658it [00:00, 3641.21it/s][A
2046it [00:00, 3523.87it/s][A
2414it [00:00, 3089.26it/s][A
2740it [00:00, 3036.13it/s][A
3055it [00:00, 3050.51it/s][A
3368it [00:01, 2826.73it/s][A
3658it [00:01, 2701.81it/s][A
3933it [00:01, 2518.97it/s][A
4189it [00:01, 2047.03it/s][A
4409it [00:01, 2003.68it/s][A
4619it [00:01, 1993.22it/s][A
4825it [00:01, 1945.97it/s][A
5149it [00:01, 2276.26it/s][A
5468it [00:02, 2523.12it/s][A
5770it [00:02, 2654.11it/s][A
6043it [00:02, 2660.56it/s][A
6314it [00:02, 2600.02it/s][A
6578it [00:02, 2235.17it/s][A
6839it [00:02, 2326.10it/s][A
7081it [00:02, 2236.94it/s][A
7312it [00:02, 2197.60it/s][A
7736it [00:02, 2744.78it/s][A
8205it [00:03, 3286.74it/s][A
8552it [00:03, 3330.16it/s][A
8893it [00:03, 2952.54it/s][A
9201it [00:03, 2828.26it/s][A
9493it [00:03, 2675.36it/s][A
9804it [00:03, 2788.85it/s][A
10090it [00:03, 24

149990it [00:56, 2427.92it/s][A
150235it [00:57, 2350.01it/s][A
150501it [00:57, 2435.23it/s][A
150759it [00:57, 2469.10it/s][A
151074it [00:57, 2664.24it/s][A
151343it [00:57, 2633.48it/s][A
151608it [00:57, 2600.54it/s][A
151869it [00:57, 2602.93it/s][A
152138it [00:57, 2621.12it/s][A
152411it [00:57, 2653.18it/s][A
152691it [00:57, 2688.70it/s][A
152961it [00:58, 2614.36it/s][A
153224it [00:58, 2327.27it/s][A
153463it [00:58, 2120.77it/s][A
153682it [00:58, 2023.48it/s][A
153924it [00:58, 2120.74it/s][A
154141it [00:58, 2078.01it/s][A
154428it [00:58, 2294.05it/s][A
154720it [00:58, 2462.68it/s][A
155026it [00:59, 2625.69it/s][A
155293it [00:59, 2600.91it/s][A
155556it [00:59, 2514.75it/s][A
155810it [00:59, 2493.53it/s][A
156075it [00:59, 2531.17it/s][A
156330it [00:59, 2444.43it/s][A
156576it [00:59, 2400.92it/s][A
156819it [00:59, 2402.36it/s][A
157060it [00:59, 2330.44it/s][A
157361it [00:59, 2517.54it/s][A
157615it [01:00, 2516.53it/s][A
157868it [

300331it [01:52, 2956.49it/s][A
300631it [01:52, 2836.00it/s][A
300928it [01:52, 2874.34it/s][A
301219it [01:52, 2884.68it/s][A
301510it [01:52, 2709.41it/s][A
301785it [01:52, 2646.40it/s][A
302376it [01:52, 3564.46it/s][A
302741it [01:53, 3278.80it/s][A
303079it [01:53, 2842.32it/s][A
303379it [01:53, 2692.67it/s][A
303659it [01:53, 2395.38it/s][A
303949it [01:53, 2510.56it/s][A
304248it [01:53, 2632.63it/s][A
304521it [01:53, 2575.90it/s][A
304786it [01:53, 2495.00it/s][A
305040it [01:54, 2435.01it/s][A
305287it [01:54, 2437.92it/s][A
305533it [01:54, 2430.26it/s][A
305846it [01:54, 2629.00it/s][A
306119it [01:54, 2650.81it/s][A
306468it [01:54, 2887.35it/s][A
306759it [01:54, 2741.09it/s][A
307036it [01:54, 2331.64it/s][A
307290it [01:54, 2379.12it/s][A
307597it [01:55, 2563.64it/s][A
307862it [01:55, 2452.20it/s][A
308114it [01:55, 2046.78it/s][A
308334it [01:55, 1849.90it/s][A
308554it [01:55, 1932.43it/s][A
308758it [01:55, 1843.46it/s][A
308955it [

472611it [02:48, 2385.99it/s][A
472853it [02:48, 2361.78it/s][A
473099it [02:48, 2390.10it/s][A
473418it [02:48, 2615.97it/s][A
473766it [02:48, 2861.45it/s][A
474054it [02:48, 2753.49it/s][A
474352it [02:49, 2818.49it/s][A
474688it [02:49, 2976.27it/s][A
474988it [02:49, 2648.49it/s][A
475261it [02:49, 2420.96it/s][A
475511it [02:49, 2349.75it/s][A
475811it [02:49, 2514.53it/s][A
476116it [02:49, 2653.46it/s][A
476436it [02:49, 2806.48it/s][A
476730it [02:49, 2836.54it/s][A
477018it [02:50, 2415.84it/s][A
477273it [02:50, 2341.30it/s][A
477516it [02:50, 2181.87it/s][A
477812it [02:50, 2375.31it/s][A
478125it [02:50, 2570.36it/s][A
478398it [02:50, 2614.38it/s][A
478666it [02:50, 2596.28it/s][A
478930it [02:50, 2390.79it/s][A
479175it [02:50, 2208.87it/s][A
479454it [02:51, 2355.30it/s][A
479746it [02:51, 2502.25it/s][A
480002it [02:51, 2231.49it/s][A
480267it [02:51, 2338.91it/s][A
480509it [02:51, 2260.27it/s][A
480830it [02:51, 2517.48it/s][A
481164it [

finished use time 194.828 s



0it [00:00, ?it/s][A
378it [00:00, 3742.80it/s][A
753it [00:00, 3094.37it/s][A
1076it [00:00, 3137.02it/s][A
1524it [00:00, 3617.50it/s][A
1922it [00:00, 3729.23it/s][A
2300it [00:00, 2966.45it/s][A
2621it [00:00, 2804.50it/s][A
2918it [00:00, 2658.90it/s][A
3195it [00:01, 2378.48it/s][A
3443it [00:01, 2368.44it/s][A
3687it [00:01, 2225.92it/s][A
3915it [00:01, 2070.15it/s][A
4126it [00:01, 1741.43it/s][A
4309it [00:01, 1549.81it/s][A
4501it [00:01, 1633.38it/s][A
4673it [00:02, 1552.30it/s][A
4843it [00:02, 1584.78it/s][A
5103it [00:02, 1843.78it/s][A
5430it [00:02, 2223.13it/s][A
5716it [00:02, 2393.49it/s][A
5989it [00:02, 2481.91it/s][A
6255it [00:02, 2525.89it/s][A
6512it [00:02, 2268.32it/s][A
6747it [00:02, 2113.12it/s][A
6999it [00:03, 2214.28it/s][A
7227it [00:03, 2089.78it/s][A
7441it [00:03, 2097.63it/s][A
7882it [00:03, 2727.58it/s][A
8368it [00:03, 3319.56it/s][A
8709it [00:03, 3234.25it/s][A
9039it [00:03, 2827.90it/s][A
9335it [00:03, 259

137735it [00:57, 2864.40it/s][A
138115it [00:57, 3113.44it/s][A
138432it [00:58, 2833.95it/s][A
138724it [00:58, 2811.87it/s][A
139050it [00:58, 2930.76it/s][A
139348it [00:58, 2879.97it/s][A
139640it [00:58, 2455.76it/s][A
139898it [00:58, 2456.07it/s][A
140153it [00:58, 2454.80it/s][A
140405it [00:58, 2439.58it/s][A
140654it [00:58, 2446.77it/s][A
140902it [00:59, 2388.58it/s][A
141143it [00:59, 2284.99it/s][A
141432it [00:59, 2452.84it/s][A
141738it [00:59, 2617.93it/s][A
142003it [00:59, 2444.28it/s][A
142271it [00:59, 2502.62it/s][A
142525it [00:59, 2471.26it/s][A
142775it [00:59, 2339.66it/s][A
143012it [00:59, 2201.40it/s][A
143235it [01:00, 2134.34it/s][A
143451it [01:00, 2095.72it/s][A
143668it [01:00, 2110.37it/s][A
143880it [01:00, 2060.80it/s][A
144103it [01:00, 2108.50it/s][A
144380it [01:00, 2291.87it/s][A
144626it [01:00, 2340.46it/s][A
144862it [01:00, 2260.28it/s][A
145090it [01:00, 2042.77it/s][A
145311it [01:01, 2087.95it/s][A
145524it [

274370it [01:53, 2494.59it/s][A
274639it [01:54, 2542.86it/s][A
274897it [01:54, 2350.49it/s][A
275137it [01:54, 2223.61it/s][A
275364it [01:54, 2043.32it/s][A
275607it [01:54, 2143.79it/s][A
275827it [01:54, 2141.78it/s][A
276084it [01:54, 2260.43it/s][A
276318it [01:54, 2283.01it/s][A
276557it [01:54, 2313.87it/s][A
276791it [01:55, 1895.52it/s][A
276994it [01:55, 1863.57it/s][A
277199it [01:55, 1911.95it/s][A
277398it [01:55, 1927.84it/s][A
277596it [01:55, 1905.26it/s][A
277890it [01:55, 2195.42it/s][A
278115it [01:55, 2198.19it/s][A
278338it [01:55, 1904.97it/s][A
278672it [01:55, 2281.65it/s][A
279157it [01:56, 2982.30it/s][A
279471it [01:56, 2714.41it/s][A
279758it [01:56, 2433.79it/s][A
280074it [01:56, 2608.99it/s][A
280349it [01:56, 2578.32it/s][A
280619it [01:56, 2610.89it/s][A
280887it [01:56, 2485.81it/s][A
281260it [01:56, 2823.77it/s][A
281550it [01:57, 2466.01it/s][A
281833it [01:57, 2559.47it/s][A
282100it [01:57, 2482.37it/s][A
282452it [

437805it [02:51, 2032.68it/s][A
438094it [02:51, 2271.08it/s][A
438468it [02:51, 2688.39it/s][A
438762it [02:51, 2753.20it/s][A
439041it [02:51, 2670.37it/s][A
439311it [02:51, 2604.56it/s][A
439574it [02:51, 2431.64it/s][A
439880it [02:51, 2604.11it/s][A
440145it [02:51, 2406.94it/s][A
440399it [02:52, 2442.69it/s][A
440648it [02:52, 2026.13it/s][A
440865it [02:52, 1983.45it/s][A
441165it [02:52, 2241.00it/s][A
441445it [02:52, 2383.25it/s][A
441693it [02:52, 2226.52it/s][A
441924it [02:52, 2113.59it/s][A
442142it [02:52, 2024.50it/s][A
442354it [02:52, 2044.40it/s][A
442562it [02:53, 1932.88it/s][A
442808it [02:53, 2074.14it/s][A
443020it [02:53, 1972.09it/s][A
443263it [02:53, 2096.17it/s][A
443477it [02:53, 2028.71it/s][A
443683it [02:53, 1993.02it/s][A
443918it [02:53, 2081.56it/s][A
444134it [02:53, 2097.90it/s][A
444391it [02:53, 2227.60it/s][A
444686it [02:54, 2430.84it/s][A
444931it [02:54, 2016.73it/s][A
445251it [02:54, 2315.34it/s][A
445496it [

finished use time 217.766 s



0it [00:00, ?it/s][A
349it [00:00, 3455.71it/s][A
695it [00:00, 2828.55it/s][A
985it [00:00, 2783.65it/s][A
1298it [00:00, 2908.76it/s][A
1715it [00:00, 3326.75it/s][A
2052it [00:00, 3033.87it/s][A
2362it [00:00, 2448.87it/s][A
2626it [00:00, 2478.56it/s][A
2890it [00:01, 2508.04it/s][A
3151it [00:01, 2343.64it/s][A
3394it [00:01, 2347.87it/s][A
3635it [00:01, 2253.98it/s][A
3865it [00:01, 2127.97it/s][A
4082it [00:01, 1874.25it/s][A
4276it [00:01, 1521.35it/s][A
4477it [00:01, 1630.41it/s][A
4681it [00:02, 1729.43it/s][A
4866it [00:02, 1733.70it/s][A
5130it [00:02, 1969.27it/s][A
5458it [00:02, 2328.81it/s][A
5738it [00:02, 2461.34it/s][A
6013it [00:02, 2537.29it/s][A
6275it [00:02, 2560.99it/s][A
6536it [00:02, 2287.39it/s][A
6773it [00:02, 2130.77it/s][A
7006it [00:03, 2183.04it/s][A
7230it [00:03, 2063.20it/s][A
7441it [00:03, 2016.39it/s][A
7847it [00:03, 2563.67it/s][A
8313it [00:03, 3147.53it/s][A
8680it [00:03, 3287.16it/s][A
9017it [00:03, 2761

134934it [00:57, 2104.96it/s][A
135158it [00:58, 2107.14it/s][A
135441it [00:58, 2297.08it/s][A
135682it [00:58, 2321.60it/s][A
135921it [00:58, 2327.72it/s][A
136254it [00:58, 2612.69it/s][A
136605it [00:58, 2864.02it/s][A
136994it [00:58, 3162.20it/s][A
137314it [00:58, 3118.57it/s][A
137629it [00:58, 2852.71it/s][A
137921it [00:58, 2699.75it/s][A
138290it [00:59, 2959.45it/s][A
138593it [00:59, 2898.73it/s][A
138888it [00:59, 2811.17it/s][A
139214it [00:59, 2927.65it/s][A
139512it [00:59, 2942.40it/s][A
139809it [00:59, 2553.13it/s][A
140075it [00:59, 2307.99it/s][A
140321it [00:59, 2340.68it/s][A
140563it [01:00, 2226.57it/s][A
140854it [01:00, 2399.40it/s][A
141101it [01:00, 2317.58it/s][A
141338it [01:00, 2189.47it/s][A
141639it [01:00, 2408.35it/s][A
141958it [01:00, 2616.83it/s][A
142226it [01:00, 2320.17it/s][A
142506it [01:00, 2439.50it/s][A
142759it [01:00, 2392.76it/s][A
143004it [01:01, 2325.38it/s][A
143241it [01:01, 2131.94it/s][A
143460it [

266730it [01:53, 2618.80it/s][A
267005it [01:53, 2656.17it/s][A
267274it [01:54, 2658.26it/s][A
267543it [01:54, 2439.33it/s][A
267836it [01:54, 2568.10it/s][A
268098it [01:54, 2246.20it/s][A
268333it [01:54, 2177.04it/s][A
268558it [01:54, 2026.31it/s][A
268776it [01:54, 2065.79it/s][A
269020it [01:54, 2166.22it/s][A
269241it [01:55, 1955.84it/s][A
269443it [01:55, 1776.92it/s][A
269711it [01:55, 2004.74it/s][A
270007it [01:55, 2251.98it/s][A
270336it [01:55, 2536.09it/s][A
270656it [01:55, 2715.65it/s][A
270976it [01:55, 2853.96it/s][A
271287it [01:55, 2928.25it/s][A
271585it [01:55, 2835.22it/s][A
271873it [01:56, 2784.54it/s][A
272155it [01:56, 2420.36it/s][A
272407it [01:56, 1984.37it/s][A
272652it [01:56, 2087.84it/s][A
272877it [01:56, 1957.61it/s][A
273103it [01:56, 2027.17it/s][A
273385it [01:56, 2227.51it/s][A
273629it [01:56, 2284.94it/s][A
273878it [01:56, 2335.56it/s][A
274117it [01:57, 2279.79it/s][A
274349it [01:57, 2079.70it/s][A
274613it [

424918it [02:51, 4481.22it/s][A
425371it [02:51, 3801.98it/s][A
425771it [02:51, 3603.50it/s][A
426146it [02:51, 3439.40it/s][A
426500it [02:51, 3465.12it/s][A
426902it [02:51, 3603.96it/s][A
427269it [02:51, 3612.22it/s][A
427635it [02:51, 3535.88it/s][A
427992it [02:51, 3486.49it/s][A
428343it [02:52, 3314.38it/s][A
428678it [02:52, 3191.25it/s][A
429000it [02:52, 3190.40it/s][A
429321it [02:52, 2916.14it/s][A
429618it [02:52, 2718.62it/s][A
429895it [02:52, 2682.78it/s][A
430336it [02:52, 3139.48it/s][A
430760it [02:52, 3435.13it/s][A
431111it [02:52, 3398.53it/s][A
431463it [02:53, 3423.42it/s][A
431825it [02:53, 3480.13it/s][A
432176it [02:53, 3458.57it/s][A
432524it [02:53, 2587.23it/s][A
432816it [02:53, 2109.58it/s][A
433062it [02:53, 1989.89it/s][A
433285it [02:53, 1868.18it/s][A
433488it [02:54, 1760.23it/s][A
433675it [02:54, 1745.63it/s][A
433870it [02:54, 1790.29it/s][A
434060it [02:54, 1813.59it/s][A
434295it [02:54, 1951.13it/s][A
434517it [

finished use time 223.625 s



0it [00:00, ?it/s][A
370it [00:00, 3699.98it/s][A
740it [00:00, 2783.19it/s][A
1032it [00:00, 2803.60it/s][A
1397it [00:00, 3091.93it/s][A
1823it [00:00, 3471.38it/s][A
2178it [00:00, 3002.46it/s][A
2493it [00:00, 2290.63it/s][A
2753it [00:01, 2324.88it/s][A
3007it [00:01, 2360.58it/s][A
3259it [00:01, 2236.45it/s][A
3536it [00:01, 2373.39it/s][A
3784it [00:01, 2201.21it/s][A
4013it [00:01, 2014.81it/s][A
4222it [00:01, 1675.31it/s][A
4402it [00:01, 1460.20it/s][A
4595it [00:02, 1559.52it/s][A
4802it [00:02, 1676.82it/s][A
4981it [00:02, 1619.27it/s][A
5231it [00:02, 1839.75it/s][A
5538it [00:02, 2167.88it/s][A
5787it [00:02, 2250.80it/s][A
6065it [00:02, 2397.76it/s][A
6333it [00:02, 2478.79it/s][A
6586it [00:02, 2293.35it/s][A
6822it [00:03, 1964.35it/s][A
7043it [00:03, 2021.01it/s][A
7255it [00:03, 1969.22it/s][A
7459it [00:03, 1779.62it/s][A
7831it [00:03, 2266.54it/s][A
8239it [00:03, 2740.94it/s][A
8697it [00:03, 3245.51it/s][A
9037it [00:03, 268

129489it [00:58, 2108.37it/s][A
129706it [00:58, 2044.56it/s][A
129979it [00:58, 2219.43it/s][A
130274it [00:58, 2418.90it/s][A
130529it [00:58, 2448.87it/s][A
130927it [00:58, 2877.95it/s][A
131220it [00:59, 2884.67it/s][A
131512it [00:59, 2572.27it/s][A
131778it [00:59, 2582.03it/s][A
132043it [00:59, 2317.92it/s][A
132408it [00:59, 2644.93it/s][A
132683it [00:59, 2141.27it/s][A
132967it [00:59, 2306.03it/s][A
133219it [00:59, 2354.08it/s][A
133469it [01:00, 2037.35it/s][A
133689it [01:00, 1867.86it/s][A
133889it [01:00, 1868.19it/s][A
134199it [01:00, 2171.62it/s][A
134428it [01:00, 2135.33it/s][A
134672it [01:00, 2216.72it/s][A
134940it [01:00, 2338.24it/s][A
135201it [01:00, 2414.97it/s][A
135548it [01:00, 2718.25it/s][A
135825it [01:01, 2196.26it/s][A
136064it [01:01, 2122.35it/s][A
136290it [01:01, 2109.30it/s][A
136549it [01:01, 2228.79it/s][A
136816it [01:01, 2348.37it/s][A
137058it [01:01, 2304.07it/s][A
137370it [01:01, 2532.11it/s][A
137733it [

260910it [01:54, 2401.82it/s][A
261182it [01:54, 2486.75it/s][A
261468it [01:55, 2595.14it/s][A
261775it [01:55, 2726.87it/s][A
262082it [01:55, 2819.79it/s][A
262366it [01:55, 2776.48it/s][A
262645it [01:55, 2670.75it/s][A
262914it [01:55, 2676.27it/s][A
263183it [01:55, 2656.82it/s][A
263467it [01:55, 2710.24it/s][A
263747it [01:55, 2728.89it/s][A
264021it [01:56, 2623.74it/s][A
264285it [01:56, 2583.71it/s][A
264545it [01:56, 2480.85it/s][A
264795it [01:56, 2424.85it/s][A
265039it [01:56, 2362.12it/s][A
265276it [01:56, 2351.29it/s][A
265512it [01:56, 2269.39it/s][A
265740it [01:56, 2078.53it/s][A
265970it [01:56, 2132.51it/s][A
266186it [01:57, 1731.44it/s][A
266373it [01:57, 1502.46it/s][A
266547it [01:57, 1553.08it/s][A
266790it [01:57, 1763.85it/s][A
266979it [01:57, 1755.60it/s][A
267239it [01:57, 1979.84it/s][A
267538it [01:57, 2251.50it/s][A
267883it [01:57, 2581.62it/s][A
268149it [01:57, 2560.06it/s][A
268416it [01:58, 2584.20it/s][A
268685it [

407990it [02:52, 4064.16it/s][A
408443it [02:52, 4201.02it/s][A
408904it [02:52, 4309.88it/s][A
409339it [02:52, 4113.87it/s][A
409815it [02:52, 4299.46it/s][A
410249it [02:52, 3679.64it/s][A
410768it [02:52, 4075.10it/s][A
411274it [02:52, 4331.70it/s][A
411778it [02:53, 4517.06it/s][A
412242it [02:53, 4463.10it/s][A
412697it [02:53, 4412.36it/s][A
413144it [02:53, 4416.01it/s][A
413590it [02:53, 4415.95it/s][A
414066it [02:53, 4503.49it/s][A
414587it [02:53, 4697.10it/s][A
415105it [02:53, 4839.36it/s][A
415624it [02:53, 4929.47it/s][A
416171it [02:54, 5089.49it/s][A
416698it [02:54, 5143.12it/s][A
417229it [02:54, 5177.62it/s][A
417756it [02:54, 5205.13it/s][A
418277it [02:54, 5156.80it/s][A
418794it [02:54, 4968.69it/s][A
419302it [02:54, 5000.95it/s][A
419804it [02:54, 4977.08it/s][A
420303it [02:54, 4908.72it/s][A
420795it [02:54, 4467.53it/s][A
421250it [02:55, 4334.83it/s][A
421689it [02:55, 4101.75it/s][A
422105it [02:55, 3906.41it/s][A
422500it [

556180it [03:47, 3022.88it/s][A
556670it [03:47, 3531.45it/s][A
557032it [03:48, 3446.96it/s][A
557383it [03:48, 3306.93it/s][A
557743it [03:48, 3387.90it/s][A
558086it [03:48, 3288.41it/s][A
558440it [03:48, 3358.83it/s][A
558819it [03:48, 3472.23it/s][A
559169it [03:48, 3382.47it/s][A
559522it [03:48, 3415.28it/s][A
559906it [03:48, 3538.02it/s][A
560310it [03:48, 3674.90it/s][A
560712it [03:49, 3776.21it/s][A
561091it [03:49, 3650.89it/s][A
561458it [03:49, 3583.66it/s][A
561880it [03:49, 3756.60it/s][A
562338it [03:49, 3996.52it/s][A
562740it [03:49, 3899.94it/s][A
563132it [03:49, 3817.13it/s][A
563515it [03:49, 3479.57it/s][A
563869it [03:49, 3394.54it/s][A
564246it [03:50, 3497.28it/s][A
564600it [03:50, 3138.32it/s][A
564978it [03:50, 3308.14it/s][A
565317it [03:50, 2378.07it/s][A
565596it [03:50, 2021.46it/s][A
565833it [03:50, 2014.73it/s][A
566059it [03:51, 1913.46it/s][A
566267it [03:51, 1709.23it/s][A
566451it [03:51, 1597.78it/s][A
566619it [

finished use time 231.781 s





## 4. Load records & Drop Duplicates

In [28]:
def _load_sprot_snapshot(snapshot_year):
    """Load one extracted snapshot table and remove repeated sequences.

    Reads ``sprot<snapshot_year>.tsv`` from the UniProt working directory,
    passes it through ``mtool.convert_DF_dateTime`` (date/time format
    conversion), keeps the first record of every distinct ``seq`` value and
    returns the result with a fresh 0..n-1 index.
    """
    snapshot_table = pd.read_csv(f'{cfg.DIR_UNIPROT}sprot{snapshot_year}.tsv', sep='\t', header=0)
    snapshot_table = mtool.convert_DF_dateTime(inputdf=snapshot_table)
    return (
        snapshot_table
        .drop_duplicates(subset=['seq'], keep='first')
        .reset_index(drop=True)
    )


# Load the data, convert the date format and drop duplicate sequences.
sprot2018 = _load_sprot_snapshot(2018)
sprot2019 = _load_sprot_snapshot(2019)
sprot2020 = _load_sprot_snapshot(2020)
sprot2021 = _load_sprot_snapshot(2021)
sprot2022 = _load_sprot_snapshot(2022)

In [29]:
# Show the first three rows of the 2018 snapshot table.
sprot2018.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154


In [30]:
# Show the first three rows of the 2020 snapshot table.
sprot2020.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P03711,SCAF_LAMBD,True,False,1,3.4.21.-,3,1986-07-21,1986-07-21,2020-12-02,MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDA...,439
1,P01027,CO3_MOUSE,False,False,0,-,0,1986-07-21,2011-07-27,2020-12-02,MGPASGSQLLVLLLLLASSPLALGIPMYSIITPNVLRLESEETIVL...,1663
2,P02706,ASGR1_RAT,False,False,0,-,0,1986-07-21,2007-01-23,2020-12-02,MTKDYQDFQHLDNENDHHQLQRGPPPAPRLLQRLCSGFRLFLLSLG...,284


In [31]:
# Show the first three rows of the 2022 snapshot table.
sprot2022.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P00250,FER_APHSA,False,False,0,-,0,1986-07-21,2007-01-23,2022-05-25,MASYKVTLKTPDGDNVITVPDDEYILDVAEEEGLDLPYSCRAGACS...,97
1,P03420,FUS_HRSVA,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MELLILKANAITTILTAVTFCFASGQNITEEFYQSTCSAVSKGYLS...,574
2,P0ACF7,DBHB_SHIFL,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MNKSQLIDKIAAGADISKAAAGRALDAIIASVTESLKEGDDVALVG...,90


## 5. Preprocessing
### 5.1 format EC

In [41]:
def _count_ec_entries(ec_field):
    """Return the number of comma-separated EC entries; '-' counts as zero."""
    if ec_field == '-':
        return 0
    return len(ec_field.split(','))


# For every snapshot: rewrite `ec_number` through mtool.format_ec and then
# mtool.specific_ecs, and recompute `functionCounts` from the rewritten value.
for sprot_label, sprot_frame in (
    (2018, sprot2018),
    (2019, sprot2019),
    (2020, sprot2020),
    (2021, sprot2021),
    (2022, sprot2022),
):
    sprot_frame['ec_number'] = sprot_frame.ec_number.apply(mtool.format_ec)
    sprot_frame['ec_number'] = sprot_frame.ec_number.apply(mtool.specific_ecs)
    sprot_frame['functionCounts'] = sprot_frame.ec_number.apply(_count_ec_entries)
    print(f'sprot {sprot_label} finished')

sprot 2018 finished
sprot 2019 finished
sprot 2020 finished
sprot 2021 finished
sprot 2022 finished


In [42]:
# Persist each formatted snapshot as a feather file under cfg.DIR_UNIPROT.
for snapshot, feather_suffix in (
    (sprot2018, '/snap201802.feather'),
    (sprot2019, '/snap201902.feather'),
    (sprot2020, '/snap202006.feather'),
    (sprot2021, '/snap202102.feather'),
    (sprot2022, '/snap202202.feather'),
):
    snapshot.to_feather(cfg.DIR_UNIPROT + feather_suffix)

### 5.2 Split Train Test

In [43]:
# Display the 2018 snapshot (pandas truncates the rendered table).
sprot2018

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154
3,P02194,MYG_MACRU,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNIWGKVETDEGGHGKDVLIRLFKGHPETLEKFDK...,154
4,P01915,HB22_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2017-10-25,MVWLPRVPCVAAVILLLTVLSPPVALVRDTRPRFLEYVTSECHFYN...,264
...,...,...,...,...,...,...,...,...,...,...,...,...
469129,Q21221,AHO3_CAEEL,True,False,1,3.1.2.22,4,2018-02-28,2004-11-23,2018-02-28,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,332
469130,Q6QJ72,PDL2_ARATH,True,False,1,4.2.1.96,4,2018-02-28,2004-07-05,2018-02-28,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,187
469131,C0HL68,ES1GA_ODOGR,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GLFSKPAGKGIKNLIPKGVKHIGKEVGKDVIRTGIDVAGCKIKGEC,46
469132,C0HK74,VKT3_HETMG,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GSICLEPKVVGPCTAYFPRFYFDSETGKCTPFIYGGCEGNGNNFET...,56


In [44]:
# Columns kept for every split, selected by position: 0, 2-7 and 10-11.
# In the snapshot tables shown above these are id, isenzyme ... date_integraged,
# seq and seqlength (name and the two update dates are dropped).
_split_columns = np.r_[0, 2:8, 10:12]

# The 2018 snapshot is the training set.
train = sprot2018.iloc[:, _split_columns]


def _unseen_in_train(snapshot):
    """Return the rows of `snapshot` whose sequence does not occur in `train`.

    The result carries only the split columns and a fresh 0..n-1 index.
    """
    candidates = snapshot.iloc[:, _split_columns]
    novel = candidates[~candidates.seq.isin(train.seq)]
    return novel.reset_index(drop=True)


# Each later snapshot becomes a test set once training sequences are removed.
test_2019 = _unseen_in_train(sprot2019)
test_2020 = _unseen_in_train(sprot2020)
test_2021 = _unseen_in_train(sprot2021)
test_2022 = _unseen_in_train(sprot2022)

In [45]:
# Inspect the 2022 test split after sequences seen in training were removed.
test_2022

Unnamed: 0,id,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,seq,seqlength
0,P02812,False,False,0,-,0,1986-07-21,MLLILLSVALLALSSAQNLNEDVSQEESPSLIAGNPQGAPPQGGNK...,416
1,P02883,False,False,0,-,0,1986-07-21,MAATTCFFFLFPFLLLLTLSRAATFEIVNRCSYTVWAAASKGDAAL...,235
2,P01160,False,False,0,-,0,1986-07-21,MSSFSTTTVSFLLLLAFQLLGQTRANPMYNAVSNADLMDFKNLLDH...,151
3,P00780,True,False,1,3.4.21.62,4,1986-07-21,MMRKKSFWLGMLTAFMLVFTMAFSDSASAAQPAKNVEKDYIVGFKS...,379
4,P01523,False,False,0,-,0,1986-07-21,MMSKLGVLLTICLLLFPLTALPMDGDEPANRPVERMQDNISSEQYP...,75
...,...,...,...,...,...,...,...,...,...
12053,A0A2K5TU92,True,True,1,2.4.2.-; 2.3.1.286,4,2022-05-25,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVWQSSH...,355
12054,A0A3R0A696,True,False,1,3.2.1.55,4,2022-05-25,MKHWKKMAASLIAISTMVAVVPTTYAMESEDSQPQTTDTATVQTTK...,1065
12055,Q5ZV91,False,False,0,-,0,2022-05-25,MDEIKKDDELSQWLSTYGTITAERILGRYNISLPQDEILEAINIPS...,294
12056,M1H607,False,False,0,-,0,2022-05-25,MDAIKKKMQAMKLEKDDAMDRADTLEQQNKEANIRAEKAEEEVHNL...,284


### 5.3 Remove changed sequence in test set

In [46]:
# Drop every test record whose accession id also exists in the training set.
# Identical sequences were already excluded when the splits were built, so the
# ids removed here are those whose sequence changed after the 2018 snapshot.
#
# A direct membership test against train.id replaces the former inner merge of
# each test set with train: the same rows are dropped, but no joined frame
# (with every column duplicated) has to be materialised just to read its ids.
test_2019 = test_2019[~test_2019.id.isin(train.id)].reset_index(drop=True)

test_2020 = test_2020[~test_2020.id.isin(train.id)].reset_index(drop=True)

test_2021 = test_2021[~test_2021.id.isin(train.id)].reset_index(drop=True)

test_2022 = test_2022[~test_2022.id.isin(train.id)].reset_index(drop=True)



### 5.4 Trim string

In [47]:
# Strip surrounding whitespace from the EC and sequence columns of every split.
# str(...) coerces any non-string value to text before stripping.
# The chained-assignment warning is silenced because the splits were produced
# by slicing/filtering other frames.
# NOTE(review): parallel_apply is not a stock pandas method -- presumably it is
# provided by pandarallel, initialised elsewhere in the notebook; confirm.
with pd.option_context('mode.chained_assignment', None):
    for split in (train, test_2019, test_2020, test_2021, test_2022):
        split.ec_number = split.ec_number.parallel_apply(lambda ec: str(ec).strip())  # ec trim
        split.seq = split.seq.parallel_apply(lambda residues: str(residues).strip())  # seq trim

### 5.5 Save train test

In [49]:
# Write the training set and the four yearly test sets to feather files.
# NOTE(review): the next section reloads these files via cfg.DIR_DATASETS --
# confirm it resolves to the same directory as cfg.DATADIR + 'datasets/'.
for split, relative_path in (
    (train, 'datasets/train.feather'),
    (test_2019, 'datasets/test_2019.feather'),
    (test_2020, 'datasets/test_2020.feather'),
    (test_2021, 'datasets/test_2021.feather'),
    (test_2022, 'datasets/test_2022.feather'),
):
    split.to_feather(cfg.DATADIR + relative_path)

## 6. Build benchmarking datasets
### 6.1 Task 1 isEnzyme

In [52]:
# Reload the persisted splits, in the order train, 2019, 2020, 2021, 2022.
train, test_2019, test_2020, test_2021, test_2022 = (
    pd.read_feather(cfg.DIR_DATASETS + feather_name)
    for feather_name in (
        'train.feather',
        'test_2019.feather',
        'test_2020.feather',
        'test_2021.feather',
        'test_2022.feather',
    )
)

# Task 1 keeps columns 0, 7 and 1 by position -- id, seq and isenzyme in the
# split tables displayed earlier.
task1_train, task1_test_2019, task1_test_2020, task1_test_2021, task1_test_2022 = (
    split.iloc[:, np.r_[0, 7, 1]]
    for split in (train, test_2019, test_2020, test_2021, test_2022)
)

# (table, feather target, fasta target) for every task-1 dataset.
task1_outputs = (
    (task1_train, cfg.FILE_TASK1_TRAIN, cfg.FILE_TASK1_TRAIN_FASTA),
    (task1_test_2019, cfg.FILE_TASK1_TEST_2019, cfg.FILE_TASK1_TEST_2019_FASTA),
    (task1_test_2020, cfg.FILE_TASK1_TEST_2020, cfg.FILE_TASK1_TEST_2020_FASTA),
    (task1_test_2021, cfg.FILE_TASK1_TEST_2021, cfg.FILE_TASK1_TEST_2021_FASTA),
    (task1_test_2022, cfg.FILE_TASK1_TEST_2022, cfg.FILE_TASK1_TEST_2022_FASTA),
)

# All feather tables are written first, then all fasta files.
for table, feather_path, fasta_path in task1_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task1_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


### 6.2 Task2 Function Counts

In [53]:
def _task2_subset(split):
    """Keep rows with functionCounts > 0 and return columns 0, 7, 3 by position.

    In the split tables displayed earlier those positions are id, seq and
    functionCounts. The index is renumbered from 0 before the column selection.
    """
    with_function = split[split.functionCounts > 0]
    with_function = with_function.reset_index(drop=True)
    return with_function.iloc[:, np.r_[0, 7, 3]]


task2_train = _task2_subset(train)
task2_test_2019 = _task2_subset(test_2019)
task2_test_2020 = _task2_subset(test_2020)
task2_test_2021 = _task2_subset(test_2021)
task2_test_2022 = _task2_subset(test_2022)

# (table, feather target, fasta target) for every task-2 dataset.
task2_outputs = (
    (task2_train, cfg.FILE_TASK2_TRAIN, cfg.FILE_TASK2_TRAIN_FASTA),
    (task2_test_2019, cfg.FILE_TASK2_TEST_2019, cfg.FILE_TASK2_TEST_2019_FASTA),
    (task2_test_2020, cfg.FILE_TASK2_TEST_2020, cfg.FILE_TASK2_TEST_2020_FASTA),
    (task2_test_2021, cfg.FILE_TASK2_TEST_2021, cfg.FILE_TASK2_TEST_2021_FASTA),
    (task2_test_2022, cfg.FILE_TASK2_TEST_2022, cfg.FILE_TASK2_TEST_2022_FASTA),
)

# All feather tables are written first, then all fasta files.
for table, feather_path, fasta_path in task2_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task2_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


### 6.3 Task3 EC Number

In [54]:
def _task3_subset(split):
    """Keep rows with functionCounts > 0 and return columns 0, 7, 4 by position.

    In the split tables displayed earlier those positions are id, seq and
    ec_number. The index is renumbered from 0 before the column selection.
    """
    with_function = split[split.functionCounts > 0]
    with_function = with_function.reset_index(drop=True)
    return with_function.iloc[:, np.r_[0, 7, 4]]


task3_train = _task3_subset(train)
task3_test_2019 = _task3_subset(test_2019)
task3_test_2020 = _task3_subset(test_2020)
task3_test_2021 = _task3_subset(test_2021)
task3_test_2022 = _task3_subset(test_2022)

# (table, feather target, fasta target) for every task-3 dataset.
task3_outputs = (
    (task3_train, cfg.FILE_TASK3_TRAIN, cfg.FILE_TASK3_TRAIN_FASTA),
    (task3_test_2019, cfg.FILE_TASK3_TEST_2019, cfg.FILE_TASK3_TEST_2019_FASTA),
    (task3_test_2020, cfg.FILE_TASK3_TEST_2020, cfg.FILE_TASK3_TEST_2020_FASTA),
    (task3_test_2021, cfg.FILE_TASK3_TEST_2021, cfg.FILE_TASK3_TEST_2021_FASTA),
    (task3_test_2022, cfg.FILE_TASK3_TEST_2022, cfg.FILE_TASK3_TEST_2022_FASTA),
)

# All feather tables are written first, then all fasta files.
for table, feather_path, fasta_path in task3_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task3_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


## 7 Make Feature Bank

### 7.1 ESM embedding 

In [9]:
# Load the five Swiss-Prot snapshots saved earlier in this notebook.
# Paths are built with os.path.join rather than a hard-coded '\\' separator so
# the cell also works on non-Windows systems and agrees with the '/'-joined
# paths used when these files were written.
snap18 = pd.read_feather(os.path.join(cfg.DIR_UNIPROT, 'snap201802.feather'))
snap19 = pd.read_feather(os.path.join(cfg.DIR_UNIPROT, 'snap201902.feather'))
snap20 = pd.read_feather(os.path.join(cfg.DIR_UNIPROT, 'snap202006.feather'))
snap21 = pd.read_feather(os.path.join(cfg.DIR_UNIPROT, 'snap202102.feather'))
snap22 = pd.read_feather(os.path.join(cfg.DIR_UNIPROT, 'snap202202.feather'))

# Merge all snapshots into one (id, seq) table with a single row per id.
# Sorting descending on (id, date_annotation_update) puts the most recently
# annotated record of each id first, which is the one drop_duplicates keeps.
full_snap_data = pd.concat([snap18, snap19, snap20, snap21, snap22], axis=0)
full_snap_data = full_snap_data.sort_values(by=['id', 'date_annotation_update'], ascending=False)
full_snap_data = (
    full_snap_data[['id', 'seq']]
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)


# Load previously computed feature tables so that only new ids are embedded.
# The presence of the ESM layer-0 file is used as the marker for "a feature
# bank already exists"; the other four files are then read unconditionally.
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    feature_onehot = pd.read_feather(cfg.FILE_FEATURE_ONEHOT)
    # Work lists: snapshot records whose id is absent from each feature table.
    needesm = full_snap_data[~full_snap_data.id.isin(feature_esm33.id)]
    needunirep = full_snap_data[~full_snap_data.id.isin(feature_unirep.id)]
    needonehot = full_snap_data[~full_snap_data.id.isin(feature_onehot.id)]
else:
    # First run: nothing is cached. Start from empty tables so the
    # pd.concat([feature_*, new_rows]) merges in the embedding cells work
    # instead of failing with NameError after the embeddings were computed.
    feature_esm0 = pd.DataFrame()
    feature_esm32 = pd.DataFrame()
    feature_esm33 = pd.DataFrame()
    feature_unirep = pd.DataFrame()
    feature_onehot = pd.DataFrame()
    needesm = full_snap_data
    needunirep = full_snap_data
    needonehot = full_snap_data



In [10]:
# Dependency: fair-esm (pip install fair-esm)
from tools import embedding_esm as esmebd

if len(needesm) > 0:
    # Embed every sequence still missing from the ESM feature bank.
    # NOTE(review): the three returned tables are presumably the layer 0 / 32 / 33
    # representations, matching the feature_esm* names -- confirm in embedding_esm.
    tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(
        sequences=needesm,
        model='esm1b_t33_650M_UR50S',
        seqthres=1022,
    )

    def _append_rows(existing, fresh):
        """Stack `fresh` under `existing` and renumber the index from 0."""
        return pd.concat([existing, fresh], axis=0).reset_index(drop=True)

    # Merge the new rows into the existing tables; the numbered prints mark progress.
    print("1")
    feature_esm0 = _append_rows(feature_esm0, tr_rep0)
    print("2")
    feature_esm32 = _append_rows(feature_esm32, tr_rep32)
    print("3")
    feature_esm33 = _append_rows(feature_esm33, tr_rep33)

    # Save the merged tables back to their feather files (steps 4-6).
    for step, feature_table, feather_path in (
        ("4", feature_esm0, cfg.FILE_FEATURE_ESM0),
        ("5", feature_esm32, cfg.FILE_FEATURE_ESM32),
        ("6", feature_esm33, cfg.FILE_FEATURE_ESM33),
    ):
        print(step)
        feature_table.to_feather(feather_path)

Transferred model to GPU


  0%|          | 123/500707 [00:57<65:07:51,  2.13it/s] 


OutOfMemoryError: CUDA out of memory. Tried to allocate 720.00 MiB (GPU 0; 5.00 GiB total capacity; 3.54 GiB already allocated; 0 bytes free; 4.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
import torch

# Report the installed PyTorch build.
print(torch.__version__)

1.13.0+cu117


### 7.2 Unirep

In [4]:
# Compute UniRep features only for records missing from the UniRep table.
if len(needunirep):
    from tools import embedding_unirep as unirep

    # NOTE(review): the meaning of the second argument (40) is not visible
    # here -- check embedding_unirep.getunirep.
    tr_unirep = unirep.getunirep(needunirep, 40)

    # Append the new rows, renumber the index, and persist the merged table.
    feature_unirep = (
        pd.concat([feature_unirep, tr_unirep], axis=0)
        .reset_index(drop=True)
    )
    feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)


100%|██████████| 49/49 [14:24<00:00, 17.65s/it]

length not match





### 7.3 one-hot

In [10]:
# Rebuild the one-hot feature table for every sequence in full_snap_data and
# save it to cfg.FILE_FEATURE_ONEHOT.
# NOTE(review): resetting feature_onehot and needonehot here discards the
# incremental work list prepared when the cached features were loaded --
# confirm that a full recompute is intended.
feature_onehot = pd.DataFrame()
needonehot = full_snap_data
if len(needonehot) > 0:
    # Stored under its own name (previously the copy-pasted `tr_unirep`) so the
    # UniRep embeddings held in the kernel are not silently overwritten.
    tr_onehot = onehotebd.get_onehot(sequences=needonehot, padding=True, padding_window=1500)
    feature_onehot = pd.concat([feature_onehot, tr_onehot], axis=0).reset_index(drop=True)
    feature_onehot.to_feather(cfg.FILE_FEATURE_ONEHOT)

In [None]:
import xml.etree.ElementTree as et
from itertools import islice

# Peek at the ExplorEnz XML dump: parse it once and inspect the first children
# of the root element.
# The earlier extra pd.read_xml(...) call was dropped: its result was discarded,
# so it only parsed the whole file a second time.
xtree = et.parse(f'{cfg.DATADIR}explorenz/enzyme-data.xml')
xroot = xtree.getroot()

# Same 14 children the former counter/break loop visited (counter ran 1 -> 15).
for node in islice(xroot, 14):
    print(dir(node))
    print(node.attrib.get('row'))