In [1]:
import pandas as pd
import numpy as np
import os
import sys
import polars as pl
import json

In [2]:
TRAIN_CSV_PATH = '/home/dangnh36/datasets/competitions/leash_belka/raw/train.csv'
TEST_CSV_PATH = '/home/dangnh36/datasets/competitions/leash_belka/raw/test.csv'

In [3]:
df = pl.scan_csv(TRAIN_CSV_PATH)
print(df.columns)
df.head(10).collect()

['id', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles', 'molecule_smiles', 'protein_name', 'binds']


id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
i64,str,str,str,str,str,i64
0,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""BRD4""",0
1,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""HSA""",0
2,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""sEH""",0
3,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.NCc1cccc(Br…","""C#CCOc1ccc(CNc…","""BRD4""",0
4,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.NCc1cccc(Br…","""C#CCOc1ccc(CNc…","""HSA""",0
5,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.NCc1cccc(Br…","""C#CCOc1ccc(CNc…","""sEH""",0
6,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CNc…","""BRD4""",0
7,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CNc…","""HSA""",0
8,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CN)…","""C#CCOc1ccc(CNc…","""sEH""",0
9,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""C=C(C)C(=O)NCC…","""C#CCOc1ccc(CNc…","""BRD4""",0


In [4]:
df = pl.scan_csv(TRAIN_CSV_PATH,
            new_columns = ['id', 'bb1', 'bb2', 'bb3', 'molecule', 'protein', 'binds'],
           dtypes = {
               'bb1': pl.Categorical,
               'bb2': pl.Categorical,
               'bb3': pl.Categorical,
               'molecule': pl.Categorical,
               'protein': pl.Categorical,
               'binds': pl.Int8,
           }).cast({'protein': pl.Enum(['BRD4', 'HSA', 'sEH']), 'binds': pl.Boolean}).collect()
df

id,bb1,bb2,bb3,molecule,protein,binds
i64,cat,cat,cat,cat,enum,bool
0,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""BRD4""",false
1,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""HSA""",false
2,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.Br.NCC1CCCN…","""C#CCOc1ccc(CNc…","""sEH""",false
3,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.NCc1cccc(Br…","""C#CCOc1ccc(CNc…","""BRD4""",false
4,"""C#CC[C@@H](CC(…","""C#CCOc1ccc(CN)…","""Br.NCc1cccc(Br…","""C#CCOc1ccc(CNc…","""HSA""",false
…,…,…,…,…,…,…
295246825,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""Nc1nnn[nH]1""","""[N-]=[N+]=NCCC…","""HSA""",false
295246826,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""Nc1nnn[nH]1""","""[N-]=[N+]=NCCC…","""sEH""",false
295246827,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""Nc1noc2ccc(F)c…","""[N-]=[N+]=NCCC…","""BRD4""",false
295246828,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""Nc1noc2ccc(F)c…","""[N-]=[N+]=NCCC…","""HSA""",false


In [5]:
df.describe()

statistic,id,bb1,bb2,bb3,molecule,protein,binds
str,f64,str,str,str,str,str,f64
"""count""",295246830.0,"""295246830""","""295246830""","""295246830""","""295246830""","""295246830""",295246830.0
"""null_count""",0.0,"""0""","""0""","""0""","""0""","""0""",0.0
"""mean""",147620000.0,,,,,,0.005385
"""std""",85230000.0,,,,,,
"""min""",0.0,,,,,,0.0
"""25%""",73811707.0,,,,,,
"""50%""",147623415.0,,,,,,
"""75%""",221435122.0,,,,,,
"""max""",295246829.0,,,,,,1.0


In [6]:
df.select(pl.col('*').n_unique())

id,bb1,bb2,bb3,molecule,protein,binds
u32,u32,u32,u32,u32,u32,u32
295246830,271,693,872,98415610,3,2


In [7]:
train_bb1s = df.select(pl.col('bb1').unique()).to_series().to_list()
train_bb2s = df.select(pl.col('bb2').unique()).to_series().to_list()
train_bb3s = df.select(pl.col('bb3').unique()).to_series().to_list()

print(len(train_bb1s), len(train_bb2s), len(train_bb3s))

271 693 872


In [8]:
len(train_bb1s) + len(train_bb2s) + len(train_bb3s)

1836

In [9]:
len(set(train_bb1s + train_bb2s + train_bb3s))

1145

In [10]:
train_bbs = list(set(train_bb1s + train_bb2s + train_bb3s))
len(train_bbs)

1145

In [11]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/raw/test.csv').head(10).collect()
test_df

id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
i64,str,str,str,str,str
295246830,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""BRD4"""
295246831,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""HSA"""
295246832,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""sEH"""
295246833,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""BRD4"""
295246834,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""HSA"""
295246835,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""sEH"""
295246836,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC1(C)CCCC1(O)…","""C#CCCC[C@H](Nc…","""BRD4"""
295246837,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC1(C)CCCC1(O)…","""C#CCCC[C@H](Nc…","""HSA"""
295246838,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC1(C)CCCC1(O)…","""C#CCCC[C@H](Nc…","""sEH"""
295246839,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""COC(=O)c1cc(Cl…","""C#CCCC[C@H](Nc…","""BRD4"""


In [12]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/raw/test.csv',
            new_columns = ['id', 'bb1', 'bb2', 'bb3', 'molecule', 'protein'],
           dtypes = {
               'bb1': pl.Categorical,
               'bb2': pl.Categorical,
               'bb3': pl.Categorical,
               'molecule': pl.Categorical,
               'protein': pl.Categorical,
           }).cast({'protein': pl.Enum(['BRD4', 'HSA', 'sEH'])}).collect()
test_df

id,bb1,bb2,bb3,molecule,protein
i64,cat,cat,cat,cat,enum
295246830,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""BRD4"""
295246831,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""HSA"""
295246832,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""sEH"""
295246833,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""BRD4"""
295246834,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""HSA"""
…,…,…,…,…,…
296921721,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""COC1CCC(CCN)CC…","""COC1CCC(CCNc2n…","""HSA"""
296921722,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""COC1CCC(CCN)CC…","""COC1CCC(CCNc2n…","""sEH"""
296921723,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""NCc1cccs1""","""[N-]=[N+]=NCCC…","""BRD4"""
296921724,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""NCc1cccs1""","""[N-]=[N+]=NCCC…","""HSA"""


In [13]:
test_bb1s = test_df.select(pl.col('bb1').unique()).to_series().to_list()
test_bb2s = test_df.select(pl.col('bb2').unique()).to_series().to_list()
test_bb3s = test_df.select(pl.col('bb3').unique()).to_series().to_list()

print(len(test_bb1s), len(test_bb2s), len(test_bb3s))

341 1140 1389


In [14]:
len(test_bb1s) + len(test_bb2s) + len(test_bb3s)

2870

In [15]:
len(set(test_bb1s + test_bb2s + test_bb3s))

2110

In [16]:
test_bbs = list(set(test_bb1s + test_bb2s + test_bb3s))
len(test_bbs)

2110

In [17]:
len(train_bbs) + len(test_bbs)

3255

In [18]:
len(set(train_bbs + test_bbs))

2110

In [19]:
all_bbs = list(set(train_bbs + test_bbs))
len(all_bbs)

2110

In [20]:
all_bbs.index(df[0, 'bb1']), all_bbs.index(df[0, 'bb2']), all_bbs.index(df[0, 'bb3'])

(1640, 1653, 765)

In [21]:
print(type((df[2::3, 'protein'] == 'sEH')))
print(dir((df[2::3, 'protein'] == 'sEH')))

<class 'polars.series.series.Series'>
['__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_ufunc__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__iter__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_arithmetic', '_comp', '_from_buffer', '_from_buffers', '_from_pyseries', '_get_buffer_i

In [22]:
assert (df[0::3, 'protein'] == 'BRD4').all()
assert (df[1::3, 'protein'] == 'HSA').all()
assert (df[2::3, 'protein'] == 'sEH').all()
assert ((df[0::3, 'molecule'] == df[1::3, 'molecule']) & (df[0::3, 'molecule'] == df[2::3, 'molecule'])).all()

In [23]:
df['binds'].sum()

1589906

In [24]:
BB2IDX = {all_bbs[i]: i for i in range(len(all_bbs))}

In [None]:
# new_train_df = df.group_by('molecule').agg(
#     pl.col('bb1', 'bb2', 'bb3').first().map_elements(all_bbs.index),
#     pl.col('binds').map_elements(lambda x: x[0], return_dtype = pl.Boolean).alias('BRD4'),
#     pl.col('binds').map_elements(lambda x: x[1], return_dtype = pl.Boolean).alias('HSA'),
#     pl.col('binds').map_elements(lambda x: x[2], return_dtype = pl.Boolean).alias('sEH')
# )
# new_train_df

In [27]:
new_train_df = df[0::3].select(
    pl.col('bb1', 'bb2', 'bb3').map_elements(lambda x: BB2IDX[x])
)
new_train_df



bb1,bb2,bb3
i64,i64,i64
1640,1653,765
1640,1653,205
1640,1653,1653
1640,1653,146
1640,1653,439
…,…,…
141,1415,1792
141,1415,1178
141,1415,1699
141,1415,1308


In [40]:
new_train_df = new_train_df.with_columns(
    pl.col('bb1', 'bb2', 'bb3').cast(pl.UInt16),
    molecule = df[0::3, 'molecule'],
    BRD4 = df[0::3, 'binds'],
    HSA = df[1::3, 'binds'],
    eSH = df[2::3, 'binds'],
)
new_train_df

bb1,bb2,bb3,BRD4,HSA,eSH,molecule
u16,u16,u16,bool,bool,bool,cat
1640,1653,765,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,205,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,1653,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,146,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,439,false,false,false,"""C#CCOc1ccc(CNc…"
…,…,…,…,…,…,…
141,1415,1792,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1178,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1699,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1308,false,false,false,"""[N-]=[N+]=NCCC…"


In [41]:
new_train_df.estimated_size('mb')

8876.121075630188

In [42]:
df.estimated_size('mb')

15821.555647850037

In [57]:
new_train_df

bb1,bb2,bb3,BRD4,HSA,eSH,molecule
u16,u16,u16,bool,bool,bool,cat
1640,1653,765,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,205,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,1653,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,146,false,false,false,"""C#CCOc1ccc(CNc…"
1640,1653,439,false,false,false,"""C#CCOc1ccc(CNc…"
…,…,…,…,…,…,…
141,1415,1792,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1178,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1699,false,false,false,"""[N-]=[N+]=NCCC…"
141,1415,1308,false,false,false,"""[N-]=[N+]=NCCC…"


In [62]:
new_train_df[:, ['BRD4', 'HSA', 'eSH']].sum()

BRD4,HSA,eSH
u32,u32,u32
456964,408410,724532


In [64]:
new_train_df[:, ['BRD4', 'HSA', 'eSH']].mean()

BRD4,HSA,eSH
f64,f64,f64
0.004643,0.00415,0.007362


In [35]:
(0.004643 + 0.00415 + 0.007362) / 3

0.005385

In [65]:
456964 + 408410 + 724532

1589906

In [70]:
new_train_df = new_train_df.cast({
    'BRD4': pl.UInt8,
    'HSA': pl.UInt8,
    'eSH': pl.UInt8	
})

In [72]:
new_train_df[:100].write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2_preview.csv')
new_train_df.write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/train_v2.csv')

In [68]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/building_blocks.json', 'w') as f:
    json.dump(
        {
            'train_bbs': train_bbs,
            'train_bb1s': train_bb1s,
            'train_bb2s': train_bb2s,
            'train_bb3s': train_bb3s,
            'test_bb1s': test_bb1s,
            'test_bb2s': test_bb2s,
            'test_bb3s': test_bb3s,
            'test_bbs': test_bbs,
            'all_bbs': all_bbs
        },
        f
    )

In [73]:
len(all_bbs)

2110

In [38]:
test_df

id,bb1,bb2,bb3,molecule,protein
i64,cat,cat,cat,cat,enum
295246830,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""BRD4"""
295246831,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""HSA"""
295246832,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""C=Cc1ccc(N)cc1…","""C#CCCC[C@H](Nc…","""sEH"""
295246833,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""BRD4"""
295246834,"""C#CCCC[C@H](NC…","""C=Cc1ccc(N)cc1…","""CC(O)Cn1cnc2c(…","""C#CCCC[C@H](Nc…","""HSA"""
…,…,…,…,…,…
296921721,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""COC1CCC(CCN)CC…","""COC1CCC(CCNc2n…","""HSA"""
296921722,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""COC1CCC(CCN)CC…","""COC1CCC(CCNc2n…","""sEH"""
296921723,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""NCc1cccs1""","""[N-]=[N+]=NCCC…","""BRD4"""
296921724,"""[N-]=[N+]=NCCC…","""Nc1noc2ccc(F)c…","""NCc1cccs1""","""[N-]=[N+]=NCCC…","""HSA"""


In [39]:
PROTEIN2IDX = {
    'BRD4': 0,
    'HSA': 1,
    'sEH': 2
}

In [45]:
new_test_df = test_df.select(
    pl.col('id', 'molecule'),
    pl.col('bb1', 'bb2', 'bb3').map_elements(lambda x: BB2IDX[x]).cast(pl.UInt16),
    pl.col('protein')
)
new_test_df



id,molecule,bb1,bb2,bb3,protein
i64,cat,u16,u16,u16,enum
295246830,"""C#CCCC[C@H](Nc…",1989,409,409,"""BRD4"""
295246831,"""C#CCCC[C@H](Nc…",1989,409,409,"""HSA"""
295246832,"""C#CCCC[C@H](Nc…",1989,409,409,"""sEH"""
295246833,"""C#CCCC[C@H](Nc…",1989,409,1012,"""BRD4"""
295246834,"""C#CCCC[C@H](Nc…",1989,409,1012,"""HSA"""
…,…,…,…,…,…
296921721,"""COC1CCC(CCNc2n…",141,1415,1556,"""HSA"""
296921722,"""COC1CCC(CCNc2n…",141,1415,1556,"""sEH"""
296921723,"""[N-]=[N+]=NCCC…",141,1415,236,"""BRD4"""
296921724,"""[N-]=[N+]=NCCC…",141,1415,236,"""HSA"""


In [46]:
test_df.estimated_size('mb'), new_test_df.estimated_size('mb')

(114.57435321807861, 104.89342212677002)

In [47]:
os.makedirs('/home/dangnh36/datasets/competitions/leash_belka/processed/', exist_ok=True)

In [55]:
new_test_df.write_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v2.csv')

In [56]:
new_test_df.select(pl.col('*').n_unique())

id,molecule,bb1,bb2,bb3,protein
u32,u32,u32,u32,u32,u32
1674896,878022,341,1140,1389,3


In [34]:
98_415_610 + 878_022

99293632

In [2]:
with open('/home/dangnh36/datasets/competitions/leash_belka/processed/building_blocks.json', 'r') as f:
    bb_meta = json.load(f)
bb_meta.keys()

dict_keys(['train_bbs', 'train_bb1s', 'train_bb2s', 'train_bb3s', 'test_bb1s', 'test_bb2s', 'test_bb3s', 'test_bbs', 'all_bbs'])

In [3]:
len(bb_meta['train_bbs'])

1145

In [4]:
len(set(bb_meta['train_bbs']))

1145

In [6]:
271 + 693 + 872

1836

In [7]:
len(bb_meta['test_bbs']), len(set(bb_meta['test_bbs']))

(2110, 2110)

In [8]:
341 + 1140 + 1389

2870

In [9]:
1145 / 1836

0.6236383442265795

In [10]:
2110 / 2870

0.735191637630662

In [12]:
len(set(bb_meta['train_bb1s']).intersection(set(bb_meta['test_bb1s'])))

271

In [15]:
len(set(bb_meta['train_bb2s']).intersection(set(bb_meta['test_bb2s'])))

693

In [16]:
len(set(bb_meta['train_bb3s']).intersection(set(bb_meta['test_bb3s'])))

871

In [17]:
len(set(bb_meta['train_bbs']).intersection(set(bb_meta['test_bbs'])))

1145

In [18]:
271 + 693 + 871

1835

In [19]:
2110 - 1145

965

In [24]:
len(set(bb_meta['test_bbs']).difference(set(bb_meta['train_bbs'])))

965

In [26]:
98_415_610 / 5_711_873

17.230006689574505

In [27]:
1674896 / 368671

4.543064141199063

In [28]:
98_415_610 + 1_674_896

100090506

In [32]:
len(set(bb_meta['test_bbs']).union(set(bb_meta['train_bbs'])))

2110

In [33]:
295_246_830 + 1_674_896

296921726

In [None]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v2.csv').collect()
test_df.head(5)

test_df_v2 = test_df.with_columns(
    [
        pl.when(pl.col('protein') == protein).then(pl.col('id')).otherwise(0).alias(f'id_{protein}')
        for protein in ['BRD4', 'HSA', 'sEH']]
).groupby('molecule').agg(pl.col('*').exclude('id', 'protein', 'id_BRD4', 'id_HSA', 'id_sEH').first(),
                         pl.col('id_BRD4', 'id_HSA', 'id_sEH').sum()).sort(pl.max_horizontal('id_BRD4', 'id_HSA', 'id_sEH').alias('min_id'))
test_df_v2