In [1]:
# Load the dotenv IPython extension and run its magic so that variables such as
# PROJECT_PATH (presumably defined in a local .env file) are visible to os.getenv.
%load_ext dotenv
%dotenv

import os

# Change the working directory to PROJECT_PATH when it is set, otherwise stay in
# the current directory ("."). The relative "data/..." paths used by later cells
# are resolved against this directory.
%cd {os.getenv("PROJECT_PATH") or "."}

# NOTE(review): autoreload mode 1 only reloads modules registered with %aimport,
# and no %aimport appears in the visible cells — confirm this is the intended mode.
%load_ext autoreload
%autoreload 1

from IPython.display import display

In [2]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys
from pathlib import Path
from absl import logging
from tqdm.notebook import tqdm, trange
from timeit import default_timer as timer
import pickle
from collections import defaultdict

# Plot styling: use seaborn's "notebook" context for every figure below.
sns.set_context("notebook")

# Show absl log messages at INFO level.
logging.set_verbosity(logging.INFO)

In [3]:
from pandarallel import pandarallel

# Enable the `parallel_apply` accessor used further down: one worker per CPU
# reported by the OS, with a progress bar.
pandarallel.initialize(progress_bar=True, verbose=2, nb_workers=os.cpu_count())

In [4]:
def show_df(df: pd.DataFrame):
    """Preview a frame: rich-display its first rows, then print its shape.

    Args:
        df: The DataFrame to preview.
    """
    preview = df.head()
    display(preview)
    print(df.shape)

In [5]:
# ZINC250k: the first CSV column is used as the index; every row is tagged with
# its origin so the sources can be told apart after concatenation.
df_zinc250k = (
    pd.read_csv("data/raw/zinc250k.csv", index_col=0)
    .assign(source="zinc250k")
)

show_df(df_zinc250k)

In [6]:
# MOSES: normalise the "SMILES" column name to the lowercase "smiles" and tag
# every row with its origin.
df_moses = (
    pd.read_csv("data/raw/moses.csv")
    .rename(columns={"SMILES": "smiles"})
    .assign(source="moses")
)

show_df(df_moses)

In [7]:
# ChEMBL chemical representations: tab-separated, first column used as the
# index. "canonical_smiles" is renamed to "smiles" to match the other sources.
df_chembl = (
    pd.read_csv("data/raw/chembl_33_chemreps.txt", sep="\t", index_col=0)
    .rename(columns={"canonical_smiles": "smiles"})
    .assign(source="chembl")
)

show_df(df_chembl)

In [8]:
# df_chembl.smiles.to_csv("data/processed/chembl.smi", index=False, header=False)

In [9]:
# Stack the three sources and keep only the SMILES string and its origin.
# Rows with missing values are dropped; duplicates are removed per
# (smiles, source) pair, so a SMILES found in two sources is kept once per source.
df_smiles = (
    pd.concat([df_moses, df_chembl, df_zinc250k], axis=0, ignore_index=True)
    .loc[:, ['smiles', 'source']]
    .dropna()
    .drop_duplicates()
    .sort_values(by="smiles")
    .reset_index(drop=True)
)

show_df(df_smiles)

In [16]:
# Keep only SMILES strings that contain no "." character ("." is the SMILES
# separator between disconnected components, so these are single-component
# entries). The dot is matched literally with regex=False; the previous "\."
# pattern in a non-raw string is an invalid escape sequence that Python warns about.
df_smiles_single = df_smiles[~df_smiles['smiles'].str.contains(".", regex=False)]

show_df(df_smiles_single)

In [19]:
import selfies as sf

def func(x: pd.Series):
    """Attach the SELFIES symbol count of a row's SMILES as ``x['length']``.

    Args:
        x: A row holding at least a ``'smiles'`` entry.

    Returns:
        The same row. ``'length'`` is set when encoding succeeds; when anything
        in the encode/split step raises, the row is returned without it.
    """
    try:
        encoded = sf.encoder(x['smiles'])
        # Count the symbols yielded by the splitter.
        x['length'] = sum(1 for _ in sf.split_selfies(encoded))
    except Exception:
        # Deliberate best-effort: a row that cannot be encoded simply carries no
        # 'length', which lets the caller filter it out afterwards.
        pass
    return x

# _df = df_chembl[~df_chembl['smiles'].str.contains("\.")].parallel_apply(func, axis=1).dropna()
# Compute the SELFIES length row by row in parallel. Rows for which `func` could
# not set a length presumably end up with a missing value there and are removed
# by dropna().
_df_len = (
    df_smiles_single
    .parallel_apply(func, axis=1)
    .dropna()
)

show_df(_df_len)

In [20]:
# Summary statistics of the SELFIES length for rows that came from MOSES.
(
    _df_len
    .query('source == "moses"')['length']
    .describe()
)

In [21]:
# Summary statistics of the SELFIES length for rows that came from ZINC250k.
(
    _df_len
    .query('source == "zinc250k"')['length']
    .describe()
)

In [23]:
# Histogram of lengths in the inclusive range [13, 72], using as many bins as
# there are integer values in that range.
(
    _df_len
    .query('13 <= length <= 72')['length']
    .hist(bins=72 - 13 + 1)
)

In [25]:
# Keep rows whose SELFIES length lies in the inclusive range [13, 72] and strip
# the helper columns so only the SMILES column remains.
# The earlier de-duplication was per (smiles, source) pair, so a SMILES present
# in several sources still occurs once per source. After 'source' is dropped
# those rows are identical, so they are removed here and each SMILES is kept once.
df_processed = (
    _df_len
    .query('13 <= length <= 72')
    .drop(columns=['length', 'source'])
    .drop_duplicates()
)

show_df(df_processed)

In [26]:
# Persist the processed frame as plain text: no header row and no index column.
df_processed.to_csv(
    "data/processed/zmc.smi",
    header=False,
    index=False,
)