In [1]:
import pandas as pd 
import json 
import matplotlib
from matplotlib import pyplot as plt 


In [2]:
from numpy import *
from matplotlib.pyplot import *

# Math-text font configuration: select the 'custom' fontset and point its
# italic and bold faces at DejaVu Sans variants. plt.rcParams is the same
# settings object that the star import exposes as `rcParams`.
plt.rcParams.update({
    'mathtext.fontset': 'custom',
    'mathtext.it': 'DejaVu Sans:italic',
    'mathtext.bf': 'DejaVu Sans:italic:bold',
})


In [3]:
from src.predict import calculate_raw_descriptors
from src.screening import *

In [None]:
# Read the semicolon-separated CSV at raw_path and expose its "Smiles" column
# under the name "h_smiles", which the cells below use.
raw_path = "csv/ChemBL_4to99.csv"
raw_df = pd.read_csv(raw_path, sep=";").rename(columns={"Smiles": "h_smiles"})

In [None]:
# Pre-filter the raw table before descriptors are computed.
# NOTE(review): filter_metal, add_ch3_smiles and filter_empty_ch3_smiles are not
# defined in this notebook; presumably they come from `from src.screening import *`.
# Judging by their names they drop metal-containing entries, add a "ch3_smiles"
# column, and remove rows where that column is empty -- confirm in src/screening.
skimmed_df = filter_metal(raw_df)
df = add_ch3_smiles(skimmed_df)
# The second return value (`failed`) is not used again in the visible cells.
df, failed = filter_empty_ch3_smiles(df)

In [None]:
# Sort every row of `df` into one of three buckets according to what
# calculate_raw_descriptors returns for its (h_smiles, ch3_smiles) pair:
#   final_df    -- the descriptor column for ch3_smiles has no missing values
#   missingh_df -- the descriptor column has at least one missing value
#   failed_df   -- computing or reading the descriptors raised an exception
#
# Rows are collected in plain lists and turned into frames once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
final_rows = []
missingh_rows = []
failed_rows = []
for idx, row in df.iterrows():
    try:
        tmp_df = calculate_raw_descriptors(row['h_smiles'],
                                           row['ch3_smiles'],
                                           ind_descriptors="csv/descriptors-ind.csv")
        # .to_numpy().all() avoids depending on which `all` (builtin or the
        # numpy one pulled in by the star imports) is in scope.
        is_complete = bool(tmp_df[row['ch3_smiles']].notna().to_numpy().all())
    except Exception:
        # Deliberately best-effort: a molecule that cannot be processed is set
        # aside instead of aborting the screening. `Exception` (rather than a
        # bare except) still lets KeyboardInterrupt stop a long run.
        failed_rows.append(row)
        continue
    if is_complete:
        final_rows.append(row)
    else:
        missingh_rows.append(row)

# Each collected row is a Series named by its original index label, so the
# resulting frames keep the index labels and columns of `df`.
final_df = pd.DataFrame(final_rows)
missingh_df = pd.DataFrame(missingh_rows)
failed_df = pd.DataFrame(failed_rows)

In [None]:
# Persist the rows that produced complete descriptors (index labels are written too).
final_df.to_csv("csv/filtered.csv")

In [None]:
# Load the screening output as a plain Python object.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
result_path = "screening_results/ChemBL_4to99_1top1bot.json"
with open(result_path, encoding="utf-8") as r:
    raw_result = json.load(r)

In [None]:
# The "elapsed-time" entry is removed from raw_result before the remaining
# entries are tabulated. The membership guard keeps this cell re-runnable:
# an unconditional pop raises KeyError the second time the cell is executed,
# because the key is already gone.
if "elapsed-time" in raw_result:
    time = raw_result.pop("elapsed-time")
# Transpose so that each top-level key of raw_result becomes one row.
result_df = pd.DataFrame.from_dict(raw_result).T
# NOTE(review): this rebinds raw_df (previously the CSV table) to the very
# same object as result_df -- it is an alias, not a copy, so later in-place
# changes to result_df are visible through raw_df as well.
raw_df = result_df

In [None]:
# Count the entries of the COF column that are at or below 0.129.
lowCOF = len([cof for cof in result_df["COF"] if cof <= 0.129])
print(lowCOF)

In [None]:
# Count the entries of the F0 column that are at or below 0.8966.
lowF0 = len([f0 for f0 in result_df["F0"] if f0 <= 0.8966])
print(lowF0)

# Filtering

In [None]:
# Put the (top, bottom) pair of every row into sorted order so that the same
# two entries always land in the same columns regardless of which one was
# originally "top"; the duplicate removal further down relies on this.
#
# Whole columns are assigned at once. The previous
# `result_df.loc[idx]["top_smiles"] = ...` form is chained assignment: it
# writes into a temporary row object and is not guaranteed to reach
# result_df (it never does with pandas Copy-on-Write).
ordered_pairs = [
    sorted([top, bot])
    for top, bot in zip(result_df["top_smiles"], result_df["bot_smiles"])
]
# dtype=object keeps each cell as the original (list-like) entry.
result_df["top_smiles"] = pd.Series(
    [pair[0] for pair in ordered_pairs], index=result_df.index, dtype=object
)
result_df["bot_smiles"] = pd.Series(
    [pair[1] for pair in ordered_pairs], index=result_df.index, dtype=object
)
        


In [None]:
# Unpack the first entry of each top/bottom SMILES cell into flat string
# columns: element [0][0] goes to the "*_h_smiles" column and element [0][1]
# to the "*_ch3_smiles" column.
#
# Columns are built in one assignment each. The previous
# `result_df.loc[idx][col] = value` form is chained assignment, which writes
# into a temporary row and can leave the new columns as empty strings.
result_df["top_h_smiles"] = [entry[0][0] for entry in result_df["top_smiles"]]
result_df["top_ch3_smiles"] = [entry[0][1] for entry in result_df["top_smiles"]]
result_df["bot_h_smiles"] = [entry[0][0] for entry in result_df["bot_smiles"]]
result_df["bot_ch3_smiles"] = [entry[0][1] for entry in result_df["bot_smiles"]]

In [None]:
# Rows that agree on all four SMILES columns and on both COF and F0 count as
# duplicates; only the first of each group is kept (result_df is modified in
# place). Afterwards keep just the rows whose status is "pass".
dropby = [
    'top_h_smiles', 'top_ch3_smiles',
    'bot_h_smiles', 'bot_ch3_smiles',
    'COF', 'F0',
]
result_df.drop_duplicates(subset=dropby, inplace=True)
passed = result_df["status"] == "pass"
filtered_results = result_df.loc[passed]

# Distribution Plots

In [None]:
# COF versus F0 for every passing system. Both axes must come from
# filtered_results: the name `filtered` used here before is not defined
# anywhere in the notebook and fails on a fresh kernel.
plt.figure(figsize=(8, 6))
# Very low alpha so that dense regions show up as darker areas.
plt.scatter(filtered_results['COF'], filtered_results['F0'], alpha=0.01)
# Axis labels match the ones used by the histogram cells below.
plt.xlabel('COF')
plt.ylabel(r'$\mathbf{F_0}$, nN')
plt.show()

In [None]:
# Histogram of COF over the passing systems, saved as a PDF.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(filtered_results['COF'], bins=50)
ax.set_xlabel('COF')
ax.set_ylabel('N', style='italic')
# Only the 0.085-0.2 window of the x axis is shown.
ax.set_xlim(0.085, 0.2)
fig.savefig('./screening_results/plots/COF_dist.pdf', dpi=500)

In [None]:
# Histogram of F0 over the passing systems, saved as a PDF.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(filtered_results['F0'], bins=100)
ax.set_xlabel(r'$\mathbf{F_0}$, nN')
ax.set_ylabel('N', style='italic')
# Only the 0-8 window of the x axis is shown.
ax.set_xlim(0, 8)
fig.savefig('./screening_results/plots/F0_dist.pdf', dpi=500)

# Ranking and Determining Optimal Systems


In [None]:
# Two rankings of the passing systems, ascending by COF and by F0. The nested
# SMILES / fraction columns are dropped; the flat *_h_smiles / *_ch3_smiles
# columns remain.
nested_columns = ["top_smiles", "top_fracs", "bot_smiles", "bot_fracs"]
COF_ranked = filtered_results.sort_values(by="COF").drop(columns=nested_columns)
F0_ranked = filtered_results.sort_values(by="F0").drop(columns=nested_columns)

In [None]:
# Systems that appear among the first n rows of BOTH rankings: an inner merge
# on every identifying column keeps only rows present in both heads.
on = [
    "top_h_smiles", "top_ch3_smiles",
    "bot_h_smiles", "bot_ch3_smiles",
    "COF", "F0", "status",
]
n = 2000
optimal = COF_ranked.head(n).merge(F0_ranked.head(n), how="inner", on=on)
# Keep exactly the merge-key columns, in that order.
optimal = optimal[on]

In [None]:
# Save the systems found in both top-n rankings (index labels are written too).
optimal.to_csv("screening_results/plots/optimal.csv")

In [None]:
# Distinct h_smiles strings that occur on either side of an optimal system.
smiles_set_top = {*optimal['top_h_smiles']}
smiles_set_bot = {*optimal['bot_h_smiles']}
smiles_set = smiles_set_top.union(smiles_set_bot)

In [None]:
# Human-readable names for the h_smiles strings, used to label the optimal
# systems. Two entries were corrected because the name did not match the
# SMILES:
#   'C=C(F)F' has a C=C double bond -> 1,1-Difluoroethene (not "...ethane")
#   'CBr'     has a single carbon   -> Bromomethane (not "Bromoethane")
# NOTE(review): 'C#N' is labelled 'Cyanide'; with the implicit hydrogen this
# is hydrogen cyanide -- left unchanged, confirm the intended label.
smiles_dict = {'C': 'Methane',
               'C#C': 'Acetylene',
               'C#CC': 'Propyne',
               'C#N': 'Cyanide',
               'C#[N+][O-]': 'Fulminic Acid',
               'C1CC1': 'Cyclopropane',
               'C=C': 'Ethylene',
               'C=C(Cl)Cl': '1,1-Dichloroethene',
               'C=C(F)F': '1,1-Difluoroethene',
               'C=CC': 'Propene',
               'C=CC#N': 'Acrylonitrile',
               'C=CCl': 'Vinylchloride',
               'CBr': 'Bromomethane',
               'CC': 'Ethane',
               'CC#CC': 'But-2-yne',
               'CC#N': 'Acetonitrile',
               'CC(F)F': 'Difluoroethane',
               'CCC': 'Propane',
               'N#CCC#N': 'Malononitrile',
               'N#CCCC#N': '1,2-Dicyanoethane',
               'NC(=O)C(F)F': '2,2-Difluoroacetamide',
               'O=CNNC=O': '1,2-Diformylhydrazine'}

In [None]:
# Attach a readable name for the top and bottom h_smiles of every optimal
# system. Indexing smiles_dict directly (rather than a lenient lookup) keeps
# the KeyError for any SMILES that has no name yet.
#
# Each column is assigned in one step. The previous
# `optimal['top name'][idx] = ...` form is chained assignment, which triggers
# SettingWithCopyWarning and leaves the column empty under Copy-on-Write.
optimal['top name'] = [smiles_dict[smiles] for smiles in optimal['top_h_smiles']]
optimal['bot name'] = [smiles_dict[smiles] for smiles in optimal['bot_h_smiles']]

In [None]:
# Reduced table holding only the names and the two metrics. The explicit
# copy makes it an independent frame, so modifying it in the next cell
# neither warns about, nor depends on, its relationship to `optimal`.
skimmed_optimal = optimal[['top name', 'bot name', 'COF', 'F0']].copy()

In [None]:
# Round both metrics to four decimals for the exported table.
# assign() returns a new frame with the replaced columns. The previous
# `skimmed_optimal['COF'][idx] = ...` form is chained assignment, which
# triggers SettingWithCopyWarning and has no effect under Copy-on-Write.
skimmed_optimal = skimmed_optimal.assign(
    COF=[round(value, 4) for value in skimmed_optimal['COF']],
    F0=[round(value, 4) for value in skimmed_optimal['F0']],
)

In [None]:
# Save the reduced table of optimal systems (index labels are written too).
skimmed_optimal.to_csv("screening_results/plots/skimmed_optimal.csv")