# Tox21 comparison to other datasets
Here all datasets which are compared with the Tox21 dataset will be evaluated. 

In [None]:
import warnings
warnings.filterwarnings('ignore', category = RuntimeWarning)

import re

import pandas as pd
import numpy as np

In [None]:
# Root folder of the thesis data. All input tables below are resolved relative to it,
# so only this one line has to change when the notebook is run on another machine.
DATA_ROOT = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data'
CLEANED_DATA_DIR = f'{DATA_ROOT}/Cleaned data'

# Cleaned Tox21 and spectral/chemical data (tab-separated, despite the .csv extension)
apci = pd.read_csv(f'{CLEANED_DATA_DIR}/APCI_chemicals_STD.csv', sep='\t')
tox21 = pd.read_csv(f'{CLEANED_DATA_DIR}/Tox21_chemicals_STD.csv', sep='\t')
iris = pd.read_csv(f'{CLEANED_DATA_DIR}/Iris_chemicals_STD.csv', sep='\t')
isabel = pd.read_csv(f'{CLEANED_DATA_DIR}/Isabelles_chemicals_STD_updated.csv', sep='\t')
klara = pd.read_csv(f'{CLEANED_DATA_DIR}/KLARA_chemicals_STD.csv', sep='\t')

# Cleaned SIRIUS training data
sirius = pd.read_csv(f'{DATA_ROOT}/SIRIUS training set/sirius_without_dup.tsv', sep='\t')

In [None]:
tox21[tox21.InChIKey14.isna()]

In [None]:
isabel

In [None]:
# Reduce Tox21 to identifier, structure and the two endpoints of interest, then drop
# rows without an InChIKey14 and rows where both endpoint labels are missing.
tox21_ahr_mmp = (
    tox21.loc[:, ['InChIKey14', 'SMILES', 'nr.ahr', 'sr.mmp']]
    .dropna(subset=['InChIKey14'], how='all')
    .reset_index(drop=True)
    .dropna(subset=['nr.ahr', 'sr.mmp'], how='all')
    .reset_index(drop=True)
)

tox21_ahr_mmp.shape

In [None]:
# Keep only the APCI entries whose SeparationMethod is 'GC' (this removes the LC-APCI spectra)
apci_gc = apci[apci.SeparationMethod == 'GC'].reset_index(drop=True)

In [None]:
# Left-join one marker column per spectral source (GC-APCI library, Iris data, Isabelle data)
# onto the Tox21 table, remove fully identical rows and give the marker columns uniform names.
tox21_ahr_mmp = (
    tox21_ahr_mmp
    .merge(apci_gc[['InChIKey14', 'DataBank']], on='InChIKey14', how='left')
    .merge(iris[['InChIKey14', 'source']], on='InChIKey14', how='left')
    .merge(isabel[['InChIKey14', 'Compound']], on='InChIKey14', how='left')
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
    .rename(columns={'DataBank': 'ms_library', 'source': 'iris_data', 'Compound': 'isabel_data'})
)

# Compounds with a spectrum in at least one of the three sources
tox21_w_spectra = (
    tox21_ahr_mmp
    .dropna(subset=['ms_library', 'iris_data', 'isabel_data'], how='all')
    .reset_index(drop=True)
)


In [None]:
tox21_ahr_mmp.shape

In [None]:
# Actives/inactives for nr.ahr and sr.mmp endpoints with available spectra (SIRIUS training data included)
# endpoints_list is also read by cells further down in the notebook.
endpoints_list = ['nr.ahr', 'sr.mmp']

print('Tox21 compounds with spectra (SIRIUS training data included):')
# One tally per endpoint column: number of rows for each label value
for endpoint in endpoints_list: 
    print(tox21_w_spectra.value_counts(endpoint))

In [None]:
# Mark compounds that occur in the SIRIUS training data (non-null 'sirius_data' = present)
tox21_ahr_mmp = (
    tox21_ahr_mmp
    .merge(sirius[['ionmode', 'InChIKey14']], on='InChIKey14', how='left')
    .rename(columns={'ionmode': 'sirius_data'})
)

# Compounds found in at least one spectral library/dataset
tox21_w_spectra_sirius = (
    tox21_ahr_mmp
    .dropna(subset=['ms_library', 'iris_data', 'isabel_data'], how='all')
    .reset_index(drop=True)
)

# ... of which the ones that are NOT part of the SIRIUS training data
tox21_w_spectra_no_sirius = tox21_w_spectra_sirius.loc[tox21_w_spectra_sirius['sirius_data'].isna()]

In [None]:
tox21_ahr_mmp.shape

In [None]:
# Actives/inactives for nr.ahr and sr.mmp endpoints with available spectra (SIRIUS training data removed)
# Same two endpoints as in the earlier counting cell; re-declared so this cell can be run on its own.
endpoints_list = ['nr.ahr', 'sr.mmp']

print('Tox21 compounds with spectra (SIRIUS training data removed):')
# One tally per endpoint column: number of rows for each label value
for endpoint in endpoints_list: 
    print(tox21_w_spectra_no_sirius.value_counts(endpoint))

When filtering out any compounds available in SIRIUS training data (needed to use SIRIUS fingerprints) and any compounds without an available spectrum, we get the values above for actives (1) and inactives (0). 

### Add KLARA data (previously cleaned by Gordian)
The cleaning procedure used to obtain the InChIKeys can be found in '2024-12-16_Cleaning_data.ipynb'.

In [None]:
# Attach the department ('Avdelning') from the previously cleaned KLARA table as 'old_klara_MMK'
tox21_ahr_mmp = (
    tox21_ahr_mmp
    .merge(klara[['InChIKey14', 'Avdelning']], on='InChIKey14', how='left')
    .rename(columns={'Avdelning': 'old_klara_MMK'})
)

In [None]:
tox21_ahr_mmp[tox21_ahr_mmp.InChIKey14.duplicated(keep=False)]

In [None]:
tox21_ahr_mmp = tox21_ahr_mmp.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

In [None]:
tox21_ahr_mmp.shape

### Adding newly cleaned KLARA data (KLARA ACES)

In [None]:
# Load the cleaned KLARA ACES table from a local pickle.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Code/Data cleaning/2025-02-13_klara_aces_cleaned.pkl', 'rb') as f:
    klara_aces = pd.read_pickle(f)

klara_aces.head()

In [None]:
# Constant marker column: after a left merge, a non-null 'ACES' value means "present in KLARA ACES".
# The marker is added before de-duplicating so that the de-duplicated copy carries it too.
klara_aces['ACES'] = 'found in klara aces'

# One row per InChIKey14 (first occurrence kept)
klara_aces_no_dupl = klara_aces.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

In [None]:
# Left-join the ACES marker. klara_aces_no_dupl holds one row per InChIKey14,
# so this merge is not expected to add rows to tox21_ahr_mmp.
tox21_ahr_mmp = pd.merge(tox21_ahr_mmp, klara_aces_no_dupl[['InChIKey14', 'ACES']], on='InChIKey14', how='left')

# Prints the (truncated) text representation of the whole table
print(tox21_ahr_mmp)

### Adding newly cleaned KLARA Kemikum data
Cleaned using code in '2024-12-16_Cleaning_data.ipynb'

In [None]:
# Load the cleaned KLARA Kemikum table from a local pickle.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Code/Data cleaning/2025-03-06_klara_kemikum_UPDATED_cleaned.pkl', 'rb') as f:
    klara_kemikum = pd.read_pickle(f)

In [None]:
klara_kemikum.head()

In [None]:
# Constant marker column: after a left merge, a non-null 'kemikum' value means
# "present in KLARA Kemikum". It must be added BEFORE the de-duplicated copy is taken:
# drop_duplicates returns a new frame, so a column added to klara_kemikum afterwards
# would be missing from klara_kemikum_no_dupl and selecting
# klara_kemikum_no_dupl[['InChIKey14', 'kemikum']] would raise a KeyError.
klara_kemikum['kemikum'] = 'found in klara kemikum'

# One row per InChIKey14 (first occurrence kept)
klara_kemikum_no_dupl = klara_kemikum.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

In [None]:
# Order the Kemikum table by InChIKey14, then build a copy without rows that lack a SMILES string
klara_kemikum = klara_kemikum.sort_values(by='InChIKey14')
klara_kemikum_nona = klara_kemikum.dropna(subset=['SMILES'], how='all').reset_index(drop=True)

In [None]:
klara_kemikum_nona[klara_kemikum_nona.SMILES.str.contains(r'\.')] #Should not have any entries

In [None]:
klara_kemikum.section.unique()

In [None]:
# Left-join the Kemikum marker. klara_kemikum_no_dupl holds one row per InChIKey14,
# so this merge is not expected to add rows to tox21_ahr_mmp.
# NOTE(review): this requires a 'kemikum' column in klara_kemikum_no_dupl — confirm the marker
# column is added to klara_kemikum before the de-duplicated copy is created.
tox21_ahr_mmp = pd.merge(tox21_ahr_mmp, klara_kemikum_no_dupl[['InChIKey14', 'kemikum']], on='InChIKey14', how='left')

tox21_ahr_mmp.head()

In [None]:
tox21_ahr_mmp.shape

### Add SusDat data from NORMAN for GC-amenability prediction
Previously cleaned (find workflow in 2024-12-16_Cleaning_data.ipynb)

In [None]:
# NORMAN SusDat dataset, loaded from a local pickle.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
with open('/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Code/Data cleaning/2025-02-12_susdat_std.pkl', 'rb') as f:
    susdat = pd.read_pickle(f)

In [None]:
susdat.head()

In [None]:
# Drop SusDat rows that have no SMILES string
susdat_nona = susdat.dropna(subset=['SMILES'], how='all').reset_index(drop=True)

# Sanity check: shape of the subset whose SMILES contains a '.' character
susdat_nona[susdat_nona.SMILES.str.contains(r'\.')].shape

In [None]:
# Goal: one row per InChIKey14, keeping the row with the highest gc_probability.

# Step 1: order rows by InChIKey14 (ascending) and, within each key, by gc_probability (descending)
susdat_sorted = susdat_nona.sort_values(by=['InChIKey14', 'gc_probability'], ascending=[True, False])

# Step 2: keep only the first row of each InChIKey14, i.e. the first one in the order produced above
susdat_filtered = susdat_sorted.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

In [None]:
tox21_ahr_mmp = pd.merge(tox21_ahr_mmp, susdat_filtered[['InChIKey14', 'gc_probability']], on='InChIKey14', how='left')

In [None]:
tox21_ahr_mmp.shape

In [None]:
# with open('2025-05-12_tox21_ahr_mmp_available_compounds_all_sources_UPDATED.pkl', 'wb') as f:
#     tox21_ahr_mmp.to_pickle(f)

In [None]:
import pickle
import pandas as pd

# Reload a previously saved version of the merged table. This REPLACES the tox21_ahr_mmp
# built by the cells above.
# NOTE(review): the cells below read the columns 'section_aces' and 'section_kemikum', which the
# cells above never create (they add 'ACES' and 'kemikum'); those columns presumably come from
# this pickle — confirm. The file name is dated 2025-03-06, whereas the commented-out save in
# the previous cell is dated 2025-05-12.
# The path is relative, so it is resolved against the notebook's working directory.
with open('2025-03-06_tox21_ahr_mmp_available_compounds_all_sources_UPDATED.pkl', 'rb') as f:
    tox21_ahr_mmp = pd.read_pickle(f)

In [None]:
tox21_ahr_mmp

In [None]:
tox21_ahr_mmp.drop_duplicates(subset=['InChIKey14'], keep='first', inplace=True)

In [None]:
#remove duplicate inchikeys
#tox21_ahr_mmp_klara_kemikum = tox21_ahr_mmp.dropna(subset=['section_aces'], how='all').reset_index(drop=True)
#tox21_ahr_mmp_klara_kemikum_no_dupl = tox21_ahr_mmp_klara_kemikum.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

### Visualizations on compounds available in KLARA ACES and Kemikum

#### KLARA ACES visualizations

Removing any compounds which were not available in KLARA ACES, as well as any duplicate InChIKey14 entries.

In [None]:
# Rows with a non-null 'section_aces' value, i.e. compounds listed in KLARA ACES
tox21_aces = tox21_ahr_mmp.dropna(subset=['section_aces'], how='all').reset_index(drop=True)
# One row per InChIKey14 (first occurrence kept)
tox21_aces_no_dupl = tox21_aces.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

Visualization of the data. 

In [None]:
import matplotlib.pyplot as plt

# Histogram (20 bins) of gc_probability for all unique compounds listed in KLARA ACES
plt.hist(tox21_aces_no_dupl['gc_probability'], bins=20)
plt.xlabel('Probability of GC-amenability')
plt.ylabel('Number of compounds')
plt.title('Probability of GC-amenability for compounds in KLARA ACES \n SIRIUS training data included')

In [None]:
# ACES compounds that have no entry in any spectral source and are not listed in Kemikum:
# a row is kept only when all four marker columns are null.
tox21_aces_unique_sirius = tox21_aces_no_dupl[
    tox21_aces_no_dupl[['ms_library', 'iris_data', 'isabel_data', 'section_kemikum']]
    .isna()
    .all(axis=1)
]

In [None]:
tox21_aces_unique_no_sirius = tox21_aces_unique_sirius[tox21_aces_unique_sirius.sirius_data.isna()]

In [None]:
# Per-endpoint subsets: keep only the rows that have a label for the respective endpoint
tox21_aces_ahr_no_sirius = tox21_aces_unique_no_sirius.dropna(subset='nr.ahr', how='all').reset_index(drop=True)
tox21_aces_mmp_no_sirius = tox21_aces_unique_no_sirius.dropna(subset='sr.mmp', how='all').reset_index(drop=True)

In [None]:
# Histogram (20 bins) of gc_probability for the ACES-only compounds without a SIRIUS training entry
plt.hist(tox21_aces_unique_no_sirius['gc_probability'], bins=20)
plt.xlabel('Probability of GC-amenability')
plt.ylabel('Number of compounds')
plt.title('Probability of GC-amenability for compounds in KLARA ACES \n SIRIUS training data removed')

In [None]:
# ACES-only compounds with gc_probability strictly above 0.5 (rows with a missing probability drop out of the comparison)
tox21_aces_unique_sirius_GC50 = tox21_aces_unique_sirius[tox21_aces_unique_sirius['gc_probability'] > 0.5]
# ... and, of those, the ones without an entry in the SIRIUS training data
tox21_aces_unique_no_sirius_GC50 = tox21_aces_unique_sirius_GC50[tox21_aces_unique_sirius_GC50.sirius_data.isna()]

In [None]:
# Label counts per endpoint for the ACES-only compounds with gc_probability > 0.5,
# first with and then without the compounds found in the SIRIUS training data.
# endpoints_list is defined in an earlier cell.
print('Prob. of GC-amenability > 0.5')
print('----------')

print(f'Tox21 compounds without spectra and found in KLARA ACES (SIRIUS training data included):')
for endpoint in endpoints_list: 
    print(tox21_aces_unique_sirius_GC50.value_counts(endpoint))

print(f'Tox21 compounds without spectra and found in KLARA ACES (SIRIUS training data removed):')
for endpoint in endpoints_list: 
    print(tox21_aces_unique_no_sirius_GC50.value_counts(endpoint))

#### KLARA Kemikum

Removing any compounds not included in KLARA kemikum, as well as any duplicate compounds. 

In [None]:
# Rows with a non-null 'section_kemikum' value, i.e. compounds listed in KLARA Kemikum
tox21_kemikum = tox21_ahr_mmp.dropna(subset=['section_kemikum'], how='all').reset_index(drop=True)
# One row per InChIKey14 (first occurrence kept)
tox21_kemikum_no_dupl = tox21_kemikum.drop_duplicates(subset=['InChIKey14'], keep='first').reset_index(drop=True)

Visualizations.

In [None]:
import matplotlib.pyplot as plt

# Histogram (20 bins) of gc_probability for all unique compounds listed in KLARA Kemikum
plt.hist(tox21_kemikum_no_dupl['gc_probability'], bins=20)
plt.xlabel('Probability of GC-amenability')
plt.ylabel('Number of compounds')
plt.title('Probability of GC-amenability for compounds in KLARA Kemikum \n SIRIUS training data included')

In [None]:
# Kemikum compounds without an entry in any of the three spectral sources:
# a row is kept only when all three marker columns are null.
tox21_kemikum_unique_sirius = tox21_kemikum_no_dupl[
    tox21_kemikum_no_dupl[['ms_library', 'iris_data', 'isabel_data']].isna().all(axis=1)
]

# ... of which the ones that are not part of the SIRIUS training data
tox21_kemikum_unique_no_sirius = tox21_kemikum_unique_sirius.loc[
    tox21_kemikum_unique_sirius['sirius_data'].isna()
]

# Per-endpoint subsets: only rows that carry a label for the respective endpoint
tox21_kemikum_ahr_no_sirius = (
    tox21_kemikum_unique_no_sirius.dropna(subset='nr.ahr', how='all').reset_index(drop=True)
)
tox21_kemikum_mmp_no_sirius = (
    tox21_kemikum_unique_no_sirius.dropna(subset='sr.mmp', how='all').reset_index(drop=True)
)

In [None]:
# Histogram (20 bins) of gc_probability for the Kemikum compounds without spectra and
# without a SIRIUS training entry. The title names KLARA Kemikum, since that is the
# data being plotted (it previously read 'KLARA ACES', copied from the ACES plot).
plt.hist(tox21_kemikum_unique_no_sirius['gc_probability'], bins=20)
plt.xlabel('Probability of GC-amenability')
plt.ylabel('Number of compounds')
plt.title('Probability of GC-amenability for compounds in KLARA Kemikum \n SIRIUS training data removed')

In [None]:
# Kemikum compounds without spectra and with gc_probability strictly above 0.5 ...
tox21_kemikum_unique_sirius_GC50 = tox21_kemikum_unique_sirius[tox21_kemikum_unique_sirius['gc_probability'] > 0.5]
# ... and, of those, the ones without an entry in the SIRIUS training data
tox21_kemikum_unique_no_sirius_GC50 = tox21_kemikum_unique_sirius_GC50[tox21_kemikum_unique_sirius_GC50.sirius_data.isna()]

print('Prob. of GC-amenability > 0.5')
print('----------')

# Label counts per endpoint; endpoints_list is defined in an earlier cell
print(f'Tox21 compounds without spectra and found in KLARA Kemikum (SIRIUS training data included):')
for endpoint in endpoints_list: 
    print(tox21_kemikum_unique_sirius_GC50.value_counts(endpoint))

print(f'Tox21 compounds without spectra and found in KLARA Kemikum (SIRIUS training data removed):')
for endpoint in endpoints_list: 
    print(tox21_kemikum_unique_no_sirius_GC50.value_counts(endpoint))

### Compiling list of chemicals to analyse
I will compile lists for all actives and inactives with a GC-amenability probability over 0.5, as well as remove any compounds which are already found in any spectral library or dataset. 

In [None]:
# Candidates for measurement: listed in KLARA ACES and/or Kemikum, and without an entry
# in any of the three spectral sources.
tox21_experimental_list = (
    tox21_ahr_mmp
    .dropna(subset=['section_aces', 'section_kemikum'], how='all')
    .reset_index(drop=True)
    .loc[lambda frame: frame[['ms_library', 'iris_data', 'isabel_data']].isna().all(axis=1)]
)

# Restrict to gc_probability > 0.5 and drop the columns that are no longer informative
tox21_experimental_list_GC50 = (
    tox21_experimental_list
    .loc[tox21_experimental_list['gc_probability'] > 0.5]
    .drop(columns=['old_klara_MMK', 'ms_library', 'iris_data', 'isabel_data'])
    .reset_index(drop=True)
)

# Exclude compounds that occur in the SIRIUS training data
tox21_experimental_list_GC50_no_sirius = tox21_experimental_list_GC50.loc[
    tox21_experimental_list_GC50['sirius_data'].isna()
]

In [None]:
# Order by gc_probability (highest first), then keep one row per unique combination of
# InChIKey14, Kemikum section and ACES section. A compound stocked in several sections
# therefore still appears once per section.
tox21_experimental_list_sorted = tox21_experimental_list_GC50_no_sirius.sort_values(by='gc_probability', ascending=False)
tox21_experimental_list_filtered = tox21_experimental_list_sorted.drop_duplicates(subset=['InChIKey14', 'section_kemikum', 'section_aces'],keep='first').reset_index(drop=True)

In [None]:
tox21_experimental_list_filtered.section_kemikum.value_counts()

In [None]:
# Names of the groups in analytical chemistry, matched exactly against 'section_kemikum'
list_of_names = ['Group CÖ_UN', 'Kurslab_AK', 'Group Ioannis Sadiktsis', 'Group Jan Holmbäck','Masslab', 'Group Anneli Kruve', 'Group Nicole Pamme', 'Group Leopold Ilag']

# Boolean mask over tox21_experimental_list_filtered: True where the row's Kemikum section is one of the groups above
filter_of_compounds_available = tox21_experimental_list_filtered.section_kemikum.isin(list_of_names)

In [None]:
# Analytical department: rows whose Kemikum section is one of the analytical chemistry groups
tox21_experimental_list_analytical_department = tox21_experimental_list_filtered[filter_of_compounds_available].reset_index(drop=True)

# Everything else: rows of compounds (by InChIKey14) that do not occur in the analytical department at all ...
tox21_experimental_list_other_departments = tox21_experimental_list_filtered[~tox21_experimental_list_filtered.InChIKey14.isin(tox21_experimental_list_analytical_department['InChIKey14'])]
# ... of which the rows that have a Kemikum section (non-analytical Kemikum groups)
tox21_experimental_list_other_departments_kemikum = tox21_experimental_list_other_departments.dropna(subset=['section_kemikum'], how='all').reset_index(drop=True)

# ... and the rows without any Kemikum section, i.e. available through ACES only
tox21_experimental_list_aces = tox21_experimental_list_other_departments[tox21_experimental_list_other_departments.section_kemikum.isna()].reset_index(drop=True)

Determine how many unique compounds are found in each subcategory section.

In [None]:
def active_inactive_count(df, endpoints_list):
    """Print, for each endpoint, how many unique compounds carry each label value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'InChIKey14' column and one column per endpoint.
        It is not modified; counting is done on a de-duplicated copy.
    endpoints_list : iterable of str
        Names of the endpoint columns to tally.

    Returns
    -------
    None
        The counts are written to stdout only.
    """
    # Count every compound once, regardless of how many rows it occupies
    unique_compounds = df.drop_duplicates(subset='InChIKey14')
    for endpoint in endpoints_list:
        header = f'Active/inactive count for {endpoint}'
        label_counts = unique_compounds.value_counts(endpoint)
        print(header)
        print(label_counts)

print('Analytical department')
active_inactive_count(tox21_experimental_list_analytical_department, endpoints_list)
print('------')

print('Other departments')
active_inactive_count(tox21_experimental_list_other_departments_kemikum, endpoints_list)
print('------')

print('ACES')
active_inactive_count(tox21_experimental_list_aces, endpoints_list)
print('------')

Combine with information from KLARA datasets

In [None]:
def _attach_klara_details(candidates, klara_table, kept_section_col, dropped_section_col):
    """Add KLARA 'cas' and 'name' to a candidate list and tidy it up.

    Left-joins `klara_table` on InChIKey14, keeps one row per
    (InChIKey14, kept_section_col), drops `dropped_section_col` together with
    'gc_probability' and 'sirius_data', and orders the rows by nr.ahr (descending),
    sr.mmp (descending) and InChIKey14 (ascending).
    """
    return (
        candidates
        .merge(klara_table[['cas', 'name', 'InChIKey14']], on='InChIKey14', how='left')
        .drop_duplicates(subset=['InChIKey14', kept_section_col])
        .reset_index(drop=True)
        .drop(columns=[dropped_section_col, 'gc_probability', 'sirius_data'])
        .sort_values(by=['nr.ahr', 'sr.mmp', 'InChIKey14'], ascending=[False, False, True])
        .reset_index(drop=True)
    )


# Kemikum-based lists take their details from the Kemikum table and keep the Kemikum section
tox21_experimental_list_analytical_department = _attach_klara_details(
    tox21_experimental_list_analytical_department, klara_kemikum_nona,
    kept_section_col='section_kemikum', dropped_section_col='section_aces',
)

tox21_experimental_list_other_departments_kemikum = _attach_klara_details(
    tox21_experimental_list_other_departments_kemikum, klara_kemikum_nona,
    kept_section_col='section_kemikum', dropped_section_col='section_aces',
)

# The ACES-only list takes its details from the ACES table and keeps the ACES section
tox21_experimental_list_aces = _attach_klara_details(
    tox21_experimental_list_aces, klara_aces,
    kept_section_col='section_aces', dropped_section_col='section_kemikum',
)

In [None]:
# tox21_experimental_list_analytical_department.to_excel('2025-02-14_available_chemicals_KLARA_Kemikum_Analytical_Department.xlsx')
# tox21_experimental_list_other_departments_kemikum.to_excel('2025-02-14_available_chemicals_KLARA_Kemikum_Other_Departments.xlsx')
# tox21_experimental_list_aces.to_excel('2025-02-14_available_chemicals_KLARA_ACES.xlsx')

### Compiling lists of actives to analyse and ask about

Which compounds are already available in the group to analyse? Which are available in the corridor to ask about? 

In [None]:
def separate_groups(df, group_name):
    """Split off the rows belonging to one Kemikum group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'section_kemikum' and 'InChIKey14'.
    group_name : str
        Literal text to look for inside 'section_kemikum' (substring match,
        not a regular expression).

    Returns
    -------
    (remaining, group) : tuple of pandas.DataFrame
        `group` holds the rows whose 'section_kemikum' contains `group_name`.
        `remaining` holds the rows of `df` whose InChIKey14 does not occur in
        `group`. A compound assigned to this group is removed from `remaining`
        entirely, including rows where it is listed under another section.
    """
    # na=False: rows without a section never match (a NaN in the mask would make
    # boolean indexing raise). regex=False: characters such as '(' or '.' in a
    # group name are matched literally.
    in_group = df['section_kemikum'].str.contains(group_name, na=False, regex=False)
    group = df[in_group]
    df = df[~df['InChIKey14'].isin(group['InChIKey14'])]
    return df, group

# Analytical-department compounds that are active (label == 1) in at least one of the two endpoints
tox21_ad_actives = tox21_experimental_list_analytical_department[(tox21_experimental_list_analytical_department['nr.ahr'] == 1)|
                                                                 (tox21_experimental_list_analytical_department['sr.mmp'] == 1)]

# Peel off one group at a time. Each call removes the group's compounds (by InChIKey14) from
# tox21_ad_actives, so a compound held by several groups ends up with the group listed first here.
# NOTE(review): 'Group Nicole Pamme' is part of list_of_names but is not separated here, so any
# of its compounds remain in tox21_ad_actives — confirm that this is intended.
tox21_ad_actives, tox21_kruve_group = separate_groups(tox21_ad_actives, 'Kruve')
tox21_ad_actives, tox21_masslab_group = separate_groups(tox21_ad_actives, 'Masslab')
tox21_ad_actives, tox21_kurslab_group = separate_groups(tox21_ad_actives, 'Kurslab')
tox21_ad_actives, tox21_group_cö_un = separate_groups(tox21_ad_actives, 'Group CÖ_UN')
tox21_ad_actives, tox21_group_ioannis = separate_groups(tox21_ad_actives, 'Ioannis')
tox21_ad_actives, tox21_group_ilag = separate_groups(tox21_ad_actives, 'Ilag')
tox21_ad_actives, tox21_group_jan_holmback = separate_groups(tox21_ad_actives, 'Group Jan Holmbäck')

In [None]:
# Stack the per-group tables back into one table of analytical-chemistry actives.
# The original row indices of the parts are kept (no ignore_index / reset_index here).
tox21_ac_actives = pd.concat([tox21_kruve_group, 
                              tox21_masslab_group, 
                              tox21_kurslab_group, 
                              tox21_group_cö_un, 
                              tox21_group_ioannis, 
                              tox21_group_ilag, 
                              tox21_group_jan_holmback])

#### Kemikum KLARA list specifications

### Available compounds in Kemikum (excl. AC)

In [None]:
# Kemikum groups (outside analytical chemistry) from which chemicals may be borrowed
groups_allowed_to_borrow_chemicals = ['Group JoS', 'Group KS']

# Actives (label == 1 in at least one endpoint) held by one of those groups, reduced to one
# row per compound (the alphabetically first section wins) and finally ordered by section.
tox21_kemikum_available_actives = (
    tox21_experimental_list_other_departments_kemikum
    .loc[lambda frame: (frame['nr.ahr'] == 1) | (frame['sr.mmp'] == 1)]
    .loc[lambda frame: frame['section_kemikum'].isin(groups_allowed_to_borrow_chemicals)]
    .sort_values(by=['InChIKey14', 'section_kemikum'], ascending=[True, True])
    .reset_index(drop=True)
    .drop_duplicates(subset='InChIKey14', keep='first')
    .reset_index(drop=True)
    .sort_values(by='section_kemikum')
    .reset_index(drop=True)
)

### Determine molecular formula for each compound

In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools, Descriptors, rdMolDescriptors, Crippen, Fragments

In [None]:
def calc_molecular_formula_and_mol_weight(df):
    """Add RDKit-derived descriptor columns to `df` and return it.

    `df` must have a 'SMILES' column. The frame is modified in place: an 'ROMol'
    column is added by PandasTools, followed by 'molecular_weight' (value of
    CalcExactMolWt), 'molecular_formula', 'logP' (Crippen), 'amines' (sum of the
    fr_NH2, fr_NH1, fr_NH0 and fr_Ar_NH fragment counts), 'hydroxyls' (sum of the
    fr_Al_OH and fr_Ar_OH fragment counts), 'HBA' and 'HBD' (Lipinski counts).

    Returns the same DataFrame object that was passed in.
    """
    PandasTools.AddMoleculeColumnToFrame(df, smilesCol='SMILES')
    molecules = df['ROMol']

    # Whole-molecule properties
    df['molecular_weight'] = molecules.apply(rdMolDescriptors.CalcExactMolWt)
    df['molecular_formula'] = molecules.apply(rdMolDescriptors.CalcMolFormula)
    df['logP'] = molecules.apply(Crippen.MolLogP)

    # Amine count = NH2 + NH1 + NH0 + aromatic NH fragment counts
    nh2_count = molecules.apply(Fragments.fr_NH2)
    nh1_count = molecules.apply(Fragments.fr_NH1)
    nh0_count = molecules.apply(Fragments.fr_NH0)
    aromatic_nh_count = molecules.apply(Fragments.fr_Ar_NH)
    df['amines'] = nh2_count + nh1_count + nh0_count + aromatic_nh_count

    # Hydroxyl count = aliphatic OH + aromatic OH fragment counts
    aliphatic_oh_count = molecules.apply(Fragments.fr_Al_OH)
    aromatic_oh_count = molecules.apply(Fragments.fr_Ar_OH)
    df['hydroxyls'] = aliphatic_oh_count + aromatic_oh_count

    # Lipinski hydrogen-bond acceptor / donor counts
    df['HBA'] = molecules.apply(rdMolDescriptors.CalcNumLipinskiHBA)
    df['HBD'] = molecules.apply(rdMolDescriptors.CalcNumLipinskiHBD)
    return df

In [None]:
available_actives = pd.concat([tox21_ac_actives, tox21_kemikum_available_actives]).reset_index(drop=True)

available_actives = calc_molecular_formula_and_mol_weight(available_actives)

In [None]:
# Hand-entered boiling points, matched to the rows of available_actives purely BY POSITION.
# Each list below holds 59 values, so available_actives must have exactly 59 rows in the same
# order as when the values were compiled; np.nan marks a compound without a value.
# NOTE(review): the source of these values is not recorded in this notebook, and the column
# names suggest the units are degrees Celsius and mmHg — confirm both.
available_actives['boiling_pointC'] = [142.78, np.nan, 129, 280,
                                       np.nan, 
                                       288, np.nan, np.nan, np.nan, 83, 
                                       np.nan, 251, np.nan, np.nan, np.nan, 243, np.nan, np.nan, np.nan, np.nan, 302, np.nan, np.nan, np.nan,
                                       np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 410, np.nan, np.nan, np.nan, np.nan, 304, 
                                       np.nan, np.nan, np.nan, 
                                       184,
                                       np.nan, 256, 263, 265, 306, 
                                       285, 267, np.nan, 200, np.nan, 204, np.nan, 246, 245, np.nan, 211, 255, 182, np.nan  ]
# Hand-entered vapour pressures, positional in the same way as the boiling points above
available_actives['vapour_pressure_mmHg'] = [0.0000035, 0.000112, 0.0000172, 0.000098, 
                                             np.nan,
                                             0.000274, 0.0028, np.nan,  np.nan, 0.0052, 
                                             np.nan, 0.01, np.nan, 0.00000001, np.nan, 0.03, 0.4, 0.00206, 0.00308, 0.00000047, 0.00067, 0.00000016, 0.00000194, 0.00000079,
                                             np.nan, 0.0000265, 0.00000954, 0.000138, np.nan, np.nan, 0.00000175, 0.000138, np.nan, 0.00000005, 0.0000183, 0.00048, 
                                             np.nan, np.nan, 0.0317,
                                             0.67,
                                             np.nan, 0.01, 0.00791, np.nan, 0.000881, 
                                             0.0018, 0.0000778, 0.0000051, 0.28, np.nan, 95.4, 0.0000119, 0.008, 0.005, np.nan, 0.17, 0.000553, 1.1, np.nan] 

In [None]:
available_actives = available_actives[available_actives['name']!= '2,4-toluendiisocyanat (isomerblandning)']

In [None]:
from rdkit.Chem import Draw

available_actives['structure'] = available_actives.ROMol.apply(Chem.Draw.MolToImage)

In [None]:
available_actives

In [None]:
available_actives.molecular_weight.describe()

In [None]:
available_actives_w_same_mol_formula = available_actives[available_actives.duplicated(subset='molecular_formula', keep=False)].reset_index(drop=True).sort_values(by='molecular_formula')

available_actives_w_same_mol_formula

The mixes that will be made

In [None]:
def make_mixes_df(df, mix_list):
    """Split `df` by compound name into a mix and the remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'name' column. It is not modified.
    mix_list : list of str
        Names (exact matches against 'name') of the compounds that go into the mix.

    Returns
    -------
    (new_df, mix_df) : tuple of pandas.DataFrame
        `new_df` holds the rows whose name is not in `mix_list`, `mix_df` the rows
        whose name is. Both are re-indexed from 0.
    """
    belongs_to_mix = df['name'].isin(mix_list)
    remaining_rows = df.loc[~belongs_to_mix].reset_index(drop=True)
    mix_rows = df.loc[belongs_to_mix].reset_index(drop=True)
    return remaining_rows, mix_rows

In [None]:
compounds_to_remove_list = ['p-Toluidin', # Too crystallized in its packaging; could not be transferred
                            'Aminoguanidine bicarbonate' # Could not be dissolved in anything other than water
                            ] 

mix1_list = ['4-Klorfenylisocyanat', '2-Kloracetofenon', 'alpha-Tetralone (volym)', 'trans-Stilben', 'Triphenylborane', 'Inden (massa)', 'Ftaldialdehyd', 'p-Kloranil']

mix2_list = ['N-Phenyl-o-phenylenediamine', '3-(Dimetylamino)-fenol', 'N,N-Dimetyl-p-fenylendiamin', '2-Nitrophenylacetonitril', '1,3-Fenylendiamin', 'Benzhydrazide', '2,4,6-Triklorfenol', 'N,N-Dimetyl-p-toluidin (massa)',
             '2,3-Diaminotoluen', '1-Naftol', 'Tiourea', 'Myristyltrimetylammoniumbromid', 'Hexadecyltrimetylammoniumbromid', 'N,N-Dietyl-1,4-fenylendiammoniumsulfat']

In [None]:
# Drop the compounds that could not be handled in the lab
available_actives = available_actives[~available_actives['name'].isin(compounds_to_remove_list)]

# Pull out mix 1, then mix 2 from what is left; available_actives_new ends up holding
# the compounds that belong to neither mix.
available_actives_new, mix1 = make_mixes_df(available_actives, mix1_list)
available_actives_new, mix2 = make_mixes_df(available_actives_new, mix2_list)

In [None]:
mix1 = mix1.sort_values(by='molecular_weight', ascending=True)

mix1

In [None]:
mix1

In [None]:
mix1.molecular_weight.sort_values()

In [None]:
mix2 = mix2.sort_values(by='molecular_weight', ascending=True)

mix2

In [None]:
mix2.vapour_pressure_mmHg

In [None]:
mix2.iloc[10]

##### KLARA information added to each group

Analytical chemistry department

In [None]:
klara_kemikum_nona.rename(columns={'section':'section_kemikum'}, inplace=True)

def add_klara_info(df):
    """Look up the KLARA Kemikum inventory details for the compounds in `df`.

    `df` must contain 'InChIKey14' and 'section_kemikum'. Every
    (InChIKey14, section_kemikum) pair of `df` is matched against the
    module-level `klara_kemikum_nona` table (which must already have its section
    column named 'section_kemikum'); pairs without a match are kept with empty
    inventory fields. Fully identical rows are collapsed.

    Returns a new DataFrame with the columns name, cas, building, floor, room,
    storage, section_kemikum, comment, klara-id and InChIKey14.
    """
    inventory_columns = ['name', 'cas', 'building', 'floor', 'room', 'storage',
                         'section_kemikum', 'comment', 'klara-id', 'InChIKey14']
    match_keys = ['InChIKey14', 'section_kemikum']

    # Right join: the rows of `df` decide which pairs appear in the result
    matched = klara_kemikum_nona[inventory_columns].merge(df[match_keys], on=match_keys, how='right')
    return matched.drop_duplicates(keep='first').reset_index(drop=True)

tox21_kurslab_klara = add_klara_info(tox21_kurslab_group)
tox21_un_klara = add_klara_info(tox21_group_cö_un)
tox21_ilag_klara = add_klara_info(tox21_group_ilag)

tox21_kruve_klara = add_klara_info(tox21_kruve_group)
tox21_kruve_klara

In [None]:
tox21_un_klara

In [None]:
# Group tables to export, one Excel file per group
list_of_chemicals_group_tosave = [tox21_kurslab_klara, tox21_un_klara, tox21_ilag_klara]

for group in list_of_chemicals_group_tosave:
    if group.empty:
        # The file name is taken from the first row's section; an empty table has no
        # first row (.iloc[0] would raise IndexError) and nothing to export.
        print('Skipping an empty group table (no compounds to export)')
        continue
    group.to_excel(f'2025-02-18_available_chemicals_{group["section_kemikum"].iloc[0]}.xlsx', index=False)

Organic chemistry department

In [None]:
# Borrowable actives held by Group KS, with their KLARA inventory details
group_ks_klara = tox21_kemikum_available_actives[tox21_kemikum_available_actives['section_kemikum'].str.contains('Group KS')]
group_ks_klara = add_klara_info(group_ks_klara)

# Drop the location/identifier columns not needed in the export, then collapse identical rows
group_ks_klara_no_dupl = group_ks_klara.drop(columns=['section_kemikum', 'building', 'floor', 'InChIKey14'])
group_ks_klara_no_dupl = group_ks_klara_no_dupl.drop_duplicates(subset=group_ks_klara_no_dupl.columns, keep='first').reset_index(drop=True)

# Not the last expression of the cell, so this line does not display anything
group_ks_klara_no_dupl

group_ks_klara_no_dupl.to_excel('2025-02-19_available_chemicals_Group_KS.xlsx', index=False)

In [None]:
# Borrowable actives held by Group KS, with their KLARA inventory details (full table, for display).
# Only group_ks_klara is assigned here; group_JoS_klara is built from the 'Group JoS'
# rows in its own cell and must not be bound to Group KS data.
group_ks_klara = tox21_kemikum_available_actives[tox21_kemikum_available_actives['section_kemikum'].str.contains('Group KS')]
group_ks_klara = add_klara_info(group_ks_klara)

group_ks_klara

In [None]:
# Borrowable actives held by Group JoS, with their KLARA inventory details
group_JoS_klara = tox21_kemikum_available_actives[tox21_kemikum_available_actives['section_kemikum'].str.contains('Group JoS')]
group_JoS_klara = add_klara_info(group_JoS_klara)

# Not the last expression of the cell, so this line does not display anything
group_JoS_klara

group_JoS_klara.to_excel('2025-02-21_available_chemicals_group_JoS.xlsx', index=False)

In [None]:
group_JoS_klara