# feature 생성

### 라이브러리 호출

In [1]:
import pandas as pd
import numpy as np
import gc

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen
from rdkit.Chem import QED
from sklearn.preprocessing import OneHotEncoder
import concurrent.futures
import warnings

# Silence warnings for the whole session: action='ignore' with no category or
# module filter applies to every warning raised after this call.
warnings.filterwarnings(action='ignore')

## 멀티 쓰레드 사용하여 feature 생성
- molecule
- reactivity
- molecular weight
- Steric strain
- LogP
- TPSA
- NHBD
- NHBA
- planarity
- PSA
- QED
- atomic_num
- ecfp
- BRD4
- HSA
- sEH

#### binds가 1인 것 중 [:1500000]

In [2]:
# Load the source table (presumably the rows with binds == 1, going by the
# file name — confirm) and keep its first 1,500,000 rows for this pass.
df = pd.read_csv('./binds_1_df.csv')
df1 = df.iloc[:1500000]

In [3]:
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan bit-vector fingerprint of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule to fingerprint, or ``None``.
        radius: Radius forwarded to ``GetMorganFingerprintAsBitVect``.
        bits: Forwarded as ``nBits`` (length of the bit vector).

    Returns:
        ``None`` when ``molecule`` is ``None``; otherwise the bit vector
        converted element by element into a Python list.
    """
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(bit_vector)
    return None

[1, 0, ..]

def generate_atomic_num(chunk):
    """Return the atom count of every molecule in ``chunk['molecule']``.

    Args:
        chunk: DataFrame with a ``'molecule'`` column whose entries expose
            ``GetNumAtoms()`` (or are ``None``).

    Returns:
        A list, in row order, holding ``GetNumAtoms()`` for each molecule and
        ``None`` for rows whose molecule is ``None`` (the same convention
        ``generate_ecfp`` uses for missing molecules).
    """
    n = chunk.shape[0]
    print(n)  # progress output: number of rows handled by this call

    # Walk the column once; converting it with tolist() inside the loop would
    # rebuild the whole list for every row (quadratic in the chunk size).
    return [
        None if molecule is None else molecule.GetNumAtoms()
        for molecule in chunk['molecule']
    ]

# Output column name -> callable that produces the column.
# 'molecule' must stay first: it parses the SMILES string, and every other
# entry is applied to the parsed molecule. 'atomic_num' takes the whole chunk
# instead of a single molecule and is special-cased in process_chunk.
make_features_dict = {'molecule': Chem.MolFromSmiles,
                      'ecfp': generate_ecfp,
                      'reactivity': rdMolDescriptors.CalcChi0n,
                      'molecular weight': Descriptors.MolWt,
                      # Filled with the rotatable-bond count.
                      'Steric strain': rdMolDescriptors.CalcNumRotatableBonds,
                      'LogP': Crippen.MolLogP,
                      'TPSA': rdMolDescriptors.CalcTPSA,
                      'NHBD': rdMolDescriptors.CalcNumHBD,
                      # H-bond acceptor count; this entry used to point at
                      # CalcNumRotatableBonds, duplicating 'Steric strain'.
                      'NHBA': rdMolDescriptors.CalcNumHBA,
                      'planarity': rdMolDescriptors.CalcNumAromaticRings,
                      # NOTE(review): same calculator as 'TPSA', so the two
                      # columns are identical — confirm this is intended.
                      'PSA': rdMolDescriptors.CalcTPSA,
                      'QED': QED.qed,
                      'atomic_num': generate_atomic_num
                     }

# Feature names in insertion order (a live view of the dict's keys).
features = make_features_dict.keys()

n = 150000  # rows per group; each group is written to its own CSV
# Count the groups from df1, the frame that is actually sliced below. Counting
# from the full ``df`` produces trailing empty groups (and empty output files)
# whenever df holds at least n more rows than df1.
m = df1.shape[0] // n

# Split df1 into m consecutive groups of n rows (1,500,000 rows -> 10 groups
# of 150,000). Rows past m * n, i.e. a trailing partial group, are left out.
split_df_li = [df1.iloc[n * i:n * (i + 1)] for i in range(m)]

def process_chunk(chunk):
    """Return a copy of ``chunk`` with one column added per feature.

    Features are taken from ``make_features_dict`` in insertion order.

    Args:
        chunk: DataFrame with a ``'molecule_smiles'`` column.

    Returns:
        A new DataFrame holding the rows of ``chunk`` plus one column for
        every key of ``make_features_dict``. The input frame is not modified.
    """
    # Callers pass ``iloc`` slices of a larger frame. Adding columns to such a
    # slice is what SettingWithCopyWarning is about, so work on a private copy.
    chunk = chunk.copy()

    for feature, make_feature in make_features_dict.items():
        if feature == 'molecule':
            # Built from the SMILES text; all other features read this column.
            chunk[feature] = chunk['molecule_smiles'].apply(make_feature)
        elif feature == 'atomic_num':
            # This generator consumes the whole frame, not a single molecule.
            chunk[feature] = make_feature(chunk)
        else:
            chunk[feature] = chunk['molecule'].apply(make_feature)

    return chunk

max_workers = 6  # number of worker threads
CHUNK_SIZE = n // max_workers  # rows handed to each submitted task

cnt = 0  # index of the group being processed; also used in the file name

# NOTE(review): the workers are threads; whether the per-molecule work really
# runs in parallel depends on the called library releasing the GIL — confirm.
for split_df in split_df_li:
    print(cnt, 'START', end=' ')

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_chunk, split_df.iloc[start:start + CHUNK_SIZE])
            for start in range(0, n, CHUNK_SIZE)
        ]
        # Read the results in submission order so the output rows keep the
        # order of split_df; as_completed() would hand them back in whatever
        # order the threads happen to finish, which varies from run to run.
        thread_result_li = [future.result() for future in futures]

    # Concatenate every result (not just the first max_workers of them), so
    # nothing is dropped when n is not an exact multiple of max_workers.
    full_df_tmp = pd.concat(thread_result_li, axis=0, ignore_index=True)
    # Expand each fingerprint list into one column per bit.
    ecfp_df_tmp = pd.DataFrame(full_df_tmp['ecfp'].to_list())
    # The molecule objects and the raw fingerprint lists are not written out.
    full_df_tmp.drop(['molecule', 'ecfp'], axis=1, inplace=True)
    full_df = pd.concat([full_df_tmp, ecfp_df_tmp], axis=1)
    full_df.to_csv(f'./binds_1_v1_r2/binds_1_v1_r2_{cnt}.csv', index=False)

    # Drop the large intermediates before the next group is processed.
    del thread_result_li, futures, full_df_tmp, ecfp_df_tmp, full_df
    gc.collect()

    cnt += 1
    print('END')


0 START 25000
25000
25000
25000
25000
25000
END
1 START 25000
25000
25000
25000
25000
25000
END
2 START 25000
25000
25000
25000
25000
25000
END
3 START 25000
25000
25000
25000
25000
25000
END
4 START 25000
25000
25000
25000
25000
25000
END
5 START 25000
25000
25000
25000
25000
25000
END
6 START 25000
25000
25000
25000
25000
25000
END
7 START 25000
25000
25000
25000
25000
25000
END
8 START 25000
25000
25000
25000
25000
25000
END
9 START 25000
25000
25000
25000
25000
25000
END


In [4]:
# Drop the reference to the first slice and run a garbage-collection pass
# before the next section starts.
del df1
gc.collect()

0

#### binds가 1인 것 중 [89906:]

In [5]:
# Rows from position 89906 to the end of df.
# NOTE(review): positions 89906-1499999 also fall inside df.iloc[:1500000],
# which this notebook processes earlier — confirm the overlap is intended.
df2 = df.iloc[89906:]

In [2]:
# Re-read the source table and keep the rows from position 89906 to the end.
# NOTE(review): positions 89906-1499999 also fall inside the [:1500000] slice
# of the same file processed earlier in this notebook — confirm the overlap
# is intended.
df2 = pd.read_csv('./binds_1_df.csv').iloc[89906:]

# ECFP generator
def generate_ecfp(molecule, radius=2, bits=1024):
    """Convert ``molecule`` into its Morgan bit-vector fingerprint as a list.

    Args:
        molecule: RDKit molecule to fingerprint, or ``None``.
        radius: Radius forwarded to ``GetMorganFingerprintAsBitVect``.
        bits: Forwarded as ``nBits`` (length of the bit vector).

    Returns:
        ``None`` for a ``None`` molecule, otherwise ``list(...)`` of the bit
        vector returned by RDKit.
    """
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(bit_vector)
    return None

# atomic_num generator
def generate_atomic_num(chunk):
    """Return the atom count of every molecule in ``chunk['molecule']``.

    Args:
        chunk: DataFrame with a ``'molecule'`` column whose entries expose
            ``GetNumAtoms()`` (or are ``None``).

    Returns:
        A list, in row order, holding ``GetNumAtoms()`` for each molecule and
        ``None`` for rows whose molecule is ``None`` (the same convention
        ``generate_ecfp`` uses for missing molecules).
    """
    print('start')
    n = chunk.shape[0]
    print(n)  # progress output: number of rows handled by this call

    # Walk the column once; converting it with tolist() inside the loop would
    # rebuild the whole list for every row (quadratic in the chunk size).
    return [
        None if molecule is None else molecule.GetNumAtoms()
        for molecule in chunk['molecule']
    ]

# Output column name -> callable that produces the column.
# 'molecule' must stay first: it parses the SMILES string, and every other
# entry is applied to the parsed molecule. 'atomic_num' takes the whole chunk
# instead of a single molecule and is special-cased in process_chunk.
make_features_dict = {'molecule': Chem.MolFromSmiles,
                      'ecfp': generate_ecfp,
                      'reactivity': rdMolDescriptors.CalcChi0n,
                      'molecular weight': Descriptors.MolWt,
                      # Filled with the rotatable-bond count.
                      'Steric strain': rdMolDescriptors.CalcNumRotatableBonds,
                      'LogP': Crippen.MolLogP,
                      'TPSA': rdMolDescriptors.CalcTPSA,
                      'NHBD': rdMolDescriptors.CalcNumHBD,
                      # H-bond acceptor count; this entry used to point at
                      # CalcNumRotatableBonds, duplicating 'Steric strain'.
                      'NHBA': rdMolDescriptors.CalcNumHBA,
                      'planarity': rdMolDescriptors.CalcNumAromaticRings,
                      # NOTE(review): same calculator as 'TPSA', so the two
                      # columns are identical — confirm this is intended.
                      'PSA': rdMolDescriptors.CalcTPSA,
                      'QED': QED.qed,
                      'atomic_num': generate_atomic_num
                     }

features = make_features_dict.keys()  # feature names, in insertion order

n = 150000  # rows per group; each group is saved as its own file
m = df2.shape[0] // n  # number of complete groups

# Cut df2 into m consecutive groups of n rows each; rows beyond m * n
# (a trailing partial group) are not included.
split_df_li = [df2.iloc[lower:lower + n] for lower in range(0, m * n, n)]

# Function executed by each worker thread
def process_chunk(chunk):
    """Return a copy of ``chunk`` with one column added per feature.

    Features are taken from ``make_features_dict`` in insertion order.

    Args:
        chunk: DataFrame with a ``'molecule_smiles'`` column.

    Returns:
        A new DataFrame holding the rows of ``chunk`` plus one column for
        every key of ``make_features_dict``. The input frame is not modified.
    """
    # Callers pass ``iloc`` slices of a larger frame. Adding columns to such a
    # slice is what SettingWithCopyWarning is about, so work on a private copy.
    chunk = chunk.copy()

    for feature, make_feature in make_features_dict.items():
        if feature == 'molecule':
            # Built from the SMILES text; all other features read this column.
            chunk[feature] = chunk['molecule_smiles'].apply(make_feature)
        elif feature == 'atomic_num':
            # This generator consumes the whole frame, not a single molecule.
            chunk[feature] = make_feature(chunk)
        else:
            chunk[feature] = chunk['molecule'].apply(make_feature)

    return chunk

max_workers = 6  # number of worker threads
CHUNK_SIZE = n // max_workers  # rows per submitted task -> 150000 // 6 = 25000

cnt = 0  # index of the group being processed; also used in the file name

# NOTE(review): the workers are threads; whether the per-molecule work really
# runs in parallel depends on the called library releasing the GIL — confirm.
for split_df in split_df_li:
    print(cnt, 'START', end=' ')

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One task per CHUNK_SIZE-row slice of the current group.
        futures = [
            executor.submit(process_chunk, split_df.iloc[start:start + CHUNK_SIZE])
            for start in range(0, n, CHUNK_SIZE)
        ]
        # Read the results in submission order so the output rows keep the
        # order of split_df; as_completed() would hand them back in whatever
        # order the threads happen to finish, which varies from run to run.
        thread_result_li = [future.result() for future in futures]

    # Merge every result (not just the first max_workers of them), so nothing
    # is dropped when n is not an exact multiple of max_workers.
    full_df_tmp = pd.concat(thread_result_li, axis=0, ignore_index=True)
    # Expand each fingerprint list into one column per bit.
    ecfp_df_tmp = pd.DataFrame(full_df_tmp['ecfp'].to_list())
    # The molecule objects and the raw fingerprint lists are not written out.
    full_df_tmp.drop(['molecule', 'ecfp'], axis=1, inplace=True)
    full_df = pd.concat([full_df_tmp, ecfp_df_tmp], axis=1)
    full_df.to_csv(f'./binds_1_v2_r2/binds_1_v2_r2_{cnt}.csv', index=False)

    # Drop the large intermediates before the next group is processed.
    del thread_result_li, futures, full_df_tmp, ecfp_df_tmp, full_df, split_df
    gc.collect()

    cnt += 1
    print('END')

del df2
del split_df_li
gc.collect()

0 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
1 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
2 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
3 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
4 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
5 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
6 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
7 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
8 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
9 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END


0

### binds가 0인 데이터
- 10 ~ 20개 사용 예정

In [3]:
# ECFP generator
def generate_ecfp(molecule, radius=2, bits=1024):
    """Build the Morgan bit-vector fingerprint of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule to fingerprint, or ``None``.
        radius: Radius forwarded to ``GetMorganFingerprintAsBitVect``.
        bits: Forwarded as ``nBits`` (length of the bit vector).

    Returns:
        ``None`` for a ``None`` molecule, otherwise ``list(...)`` of the bit
        vector returned by RDKit.
    """
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(bit_vector)
    return None

# atomic_num generator
def generate_atomic_num(chunk):
    """Return the atom count of every molecule in ``chunk['molecule']``.

    Args:
        chunk: DataFrame with a ``'molecule'`` column whose entries expose
            ``GetNumAtoms()`` (or are ``None``).

    Returns:
        A list, in row order, holding ``GetNumAtoms()`` for each molecule and
        ``None`` for rows whose molecule is ``None`` (the same convention
        ``generate_ecfp`` uses for missing molecules).
    """
    print('start')
    n = chunk.shape[0]
    print(n)  # progress output: number of rows handled by this call

    # Walk the column once; converting it with tolist() inside the loop would
    # rebuild the whole list for every row (quadratic in the chunk size).
    return [
        None if molecule is None else molecule.GetNumAtoms()
        for molecule in chunk['molecule']
    ]

# Output column name -> callable that produces the column.
# 'molecule' must stay first: it parses the SMILES string, and every other
# entry is applied to the parsed molecule. 'atomic_num' takes the whole chunk
# instead of a single molecule and is special-cased in process_chunk.
make_features_dict = {'molecule': Chem.MolFromSmiles,
                      'ecfp': generate_ecfp,
                      'reactivity': rdMolDescriptors.CalcChi0n,
                      'molecular weight': Descriptors.MolWt,
                      # Filled with the rotatable-bond count.
                      'Steric strain': rdMolDescriptors.CalcNumRotatableBonds,
                      'LogP': Crippen.MolLogP,
                      'TPSA': rdMolDescriptors.CalcTPSA,
                      'NHBD': rdMolDescriptors.CalcNumHBD,
                      # H-bond acceptor count; this entry used to point at
                      # CalcNumRotatableBonds, duplicating 'Steric strain'.
                      'NHBA': rdMolDescriptors.CalcNumHBA,
                      'planarity': rdMolDescriptors.CalcNumAromaticRings,
                      # NOTE(review): same calculator as 'TPSA', so the two
                      # columns are identical — confirm this is intended.
                      'PSA': rdMolDescriptors.CalcTPSA,
                      'QED': QED.qed,
                      'atomic_num': generate_atomic_num
                     }

features = make_features_dict.keys()  # feature names, in insertion order

# Function executed by each worker thread
def process_chunk(chunk):
    """Return a copy of ``chunk`` with one column added per feature.

    Features are taken from ``make_features_dict`` in insertion order.

    Args:
        chunk: DataFrame with a ``'molecule_smiles'`` column.

    Returns:
        A new DataFrame holding the rows of ``chunk`` plus one column for
        every key of ``make_features_dict``. The input frame is not modified.
    """
    # Callers pass ``iloc`` slices of a larger frame. Adding columns to such a
    # slice is what SettingWithCopyWarning is about, so work on a private copy.
    chunk = chunk.copy()

    for feature, make_feature in make_features_dict.items():
        if feature == 'molecule':
            # Built from the SMILES text; all other features read this column.
            chunk[feature] = chunk['molecule_smiles'].apply(make_feature)
        elif feature == 'atomic_num':
            # This generator consumes the whole frame, not a single molecule.
            chunk[feature] = make_feature(chunk)
        else:
            chunk[feature] = chunk['molecule'].apply(make_feature)

    return chunk

# Sizing of the work: every input file is cut into ``m`` groups of ``n`` rows,
# and each group is saved as its own output file.
n = 150_000
m = 10

# Each group is further cut into CHUNK_SIZE-row pieces, one per worker thread
# (150000 // 6 = 25000).
max_workers = 6
CHUNK_SIZE = n // max_workers

In [4]:
# Featurize the files ./binds_0/df_2.csv ... df_5.csv, one file at a time.
# NOTE(review): the workers are threads; whether the per-molecule work really
# runs in parallel depends on the called library releasing the GIL — confirm.
for j in range(2, 6):
    df = pd.read_csv(f'./binds_0/df_{j}.csv')

    # Cut the file into m consecutive groups of n rows (10 x 150,000). A file
    # with fewer than m * n rows yields short or empty trailing groups.
    split_df_li = [df.iloc[n * i:n * (i + 1)] for i in range(m)]

    cnt = 0  # index of the group being processed; also used in the file name

    for split_df in split_df_li:
        print(cnt, 'START', end=' ')

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # One task per CHUNK_SIZE-row slice of the current group.
            futures = [
                executor.submit(process_chunk, split_df.iloc[start:start + CHUNK_SIZE])
                for start in range(0, n, CHUNK_SIZE)
            ]
            # Read the results in submission order so the output rows keep the
            # order of split_df; as_completed() would hand them back in
            # whatever order the threads finish, which varies between runs.
            thread_result_li = [future.result() for future in futures]

        # Merge every result (not just the first max_workers of them), so
        # nothing is dropped when n is not an exact multiple of max_workers.
        full_df_tmp = pd.concat(thread_result_li, axis=0, ignore_index=True)
        # Expand each fingerprint list into one column per bit.
        ecfp_df_tmp = pd.DataFrame(full_df_tmp['ecfp'].to_list())
        # The molecule objects and raw fingerprint lists are not written out.
        full_df_tmp.drop(['molecule', 'ecfp'], axis=1, inplace=True)
        full_df = pd.concat([full_df_tmp, ecfp_df_tmp], axis=1)
        full_df.to_csv(f'./binds_0_{j}_r2/{cnt}.csv', index=False)

        # Drop the large intermediates before the next group is processed.
        del split_df, thread_result_li, futures, full_df_tmp, ecfp_df_tmp, full_df
        gc.collect()

        cnt += 1
        print('END')

    # Release the whole input file before the next one is read.
    del df
    del split_df_li
    gc.collect()

0 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
1 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
2 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
3 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
4 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
5 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
6 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
7 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
8 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
9 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
0 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25000
END
1 START start
25000
start
25000
start
25000
start
25000
start
25000
start
25