In [6]:
import pandas as pd
import numpy as np
import math
import os
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdmolops import FindPotentialStereo
#from IPython.display import display

# Made by Henric Pietro Vicente Gil please credit me in your work :) 
# henricgil@discente.ufg.br

In [7]:
"""
# This cell defines the location of the files, 

# path_to_files may be a folder where multiple files are;

# file_list is a list of all the files in the folder provided in path_to_files;

# s_file represents the file(s) to be read, can be either a list or a string

# Uncomment the part that you will use and comment out the part you won't
"""

# Points either to a single file (e.g. ./myfolder/example.xlsx) or to a folder
# (e.g. C:\\my\\folder\\where\\my\\files\\are).
path_to_files = "D:\\Projetos academicos\\Labmol\\Code\\Data\\My_data\\Pre Clinical Data\\Bundled_Data\\results"
# Name given to the result when a single file is processed.
result_name = "Resultado_exemplo"

# Results are written to ./data; exist_ok makes re-running this cell harmless.
os.makedirs("./data", exist_ok=True)

#### To get from a directory

# Keep regular files only (a sub-folder cannot be read as a table) and sort them
# so the processing order does not depend on the file system.
file_list = [
    entry
    for entry in sorted(os.listdir(path_to_files))
    if os.path.isfile(os.path.join(path_to_files, entry))
]
# os.path.join picks the separator of the running platform instead of a fixed "\\".
s_file = [os.path.join(path_to_files, file) for file in file_list]

#### To get from a path

#s_file=path_to_files

In [8]:
"""
# Defines the variables used throughout the code

# duplicate_identifier_column is the column that identifies the duplicates in your data

# values_col defines the column where your dose/lc50/ic50... values are

# max_z_score is the z value used to separate the outliers from non outliers

# convert_to_p despite the name, this bool conditions the result into a -log(result) or pResult

# convert_measure bool that conditions if you need to convert measurements, also eliminates impossible values
"""
# Column whose values identify duplicated records: it is the grouping column given
# to idOutliers and the subset used when duplicates are dropped.
duplicate_identifier_column = "Chemical Structure"
# Column that holds the measured values (dose, LC50, IC50, ...).
values_col = "Dose"
# Largest absolute z-score a value may have and still count as a non-outlier.
max_z_score = 3
convert_to_p = True # when True the final value is stored as p = -log10(measure)
# When True the values column is first parsed by convertTo_mg into [amount, unit] pairs.
convert_measure = True

Standardize thy molecules

In [9]:
# Column that holds the SMILES strings handled by the standardization functions of this cell.
name_smiles="Chemical Structure"

def remove_invalid(df):
    """Drop, in place, every row whose SMILES cannot be turned into a molecule.

    A row is rejected when ``Chem.MolFromSmiles`` either raises (for instance
    because the cell is not a string) or returns ``None``, which is how RDKit
    reports a SMILES it cannot parse.

    Parameters:
        df: DataFrame with a ``name_smiles`` column. It is modified in place.

    Returns:
        The same DataFrame, with the rejected rows removed and a fresh 0..n-1 index.
    """
    rejected = []
    for row_label, smiles in df[name_smiles].items():
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception:
            # Non-string cells (e.g. NaN) make the parser raise instead of returning None.
            mol = None
        if mol is None:
            rejected.append(row_label)

    # Collect first, drop once: the frame is not reshaped while it is being walked.
    df.drop(index=rejected, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def remove_metals(df):
    """Drop, in place, every row whose molecule contains an atom outside the allowed set.

    The allowed elements are H, Li, Na, K, Be, Mg, Ca, B, C, Si, N, P, O, S, F, Cl,
    Br and I (atomic numbers listed in the SMARTS below); a molecule holding any
    other atom is removed. Rows whose SMILES cannot be parsed are removed as well.

    Parameters:
        df: DataFrame with a ``name_smiles`` column. It is modified in place.

    Returns:
        The same DataFrame, with the rejected rows removed and a fresh 0..n-1 index.
    """
    # Matches one atom that is NOT in the allowed list.
    badAtoms = Chem.MolFromSmarts('[!$([#1,#3,#11,#19,#4,#12,#20,#5,#6,#14,#7,#15,#8,#16,#9,#17,#35,#53])]')

    rejected = []
    for row_label, smiles in df[name_smiles].items():
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles answers None for an unparsable SMILES; test for it explicitly
        # rather than relying on a catch-all except around the attribute access.
        if mol is None or mol.HasSubstructMatch(badAtoms):
            rejected.append(row_label)

    df.drop(index=rejected, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def normalize_groups(df):
    """Replace every SMILES by the result of ``rdMolStandardize.Normalize``.

    Each SMILES is parsed (with sanitization), normalized and written back as a
    kekulized SMILES string.

    Parameters:
        df: DataFrame with a ``name_smiles`` column of parsable SMILES. The column
            is overwritten in place.

    Returns:
        The same DataFrame.
    """
    normalized = []
    for smi in df[name_smiles]:
        mol = Chem.MolFromSmiles(smi, sanitize=True)
        mol = rdMolStandardize.Normalize(mol)
        normalized.append(Chem.MolToSmiles(mol, kekuleSmiles=True))

    # A plain list is assigned by position. Wrapping it in pd.Series would align on
    # index labels and leave NaN in any frame whose index is not exactly 0..n-1.
    df[name_smiles] = normalized
    return df

def neutralize(df):
    """Replace every SMILES by the output of ``rdMolStandardize.Uncharger().uncharge``.

    Each SMILES is parsed (with sanitization), passed through one shared Uncharger
    and written back as a kekulized SMILES string.

    Parameters:
        df: DataFrame with a ``name_smiles`` column of parsable SMILES. The column
            is overwritten in place.

    Returns:
        The same DataFrame.
    """
    uncharger = rdMolStandardize.Uncharger()

    neutral = []
    for smi in df[name_smiles]:
        mol = Chem.MolFromSmiles(smi, sanitize=True)
        mol = uncharger.uncharge(mol)
        neutral.append(Chem.MolToSmiles(mol, kekuleSmiles=True))

    # A plain list is assigned by position. Wrapping it in pd.Series would align on
    # index labels and leave NaN in any frame whose index is not exactly 0..n-1.
    df[name_smiles] = neutral
    return df
    
def no_mixture(df):
    """Replace every SMILES by the output of ``rdMolStandardize.FragmentParent``.

    Each SMILES is parsed (with sanitization), reduced to its fragment parent and
    written back as a kekulized SMILES string.

    Parameters:
        df: DataFrame with a ``name_smiles`` column of parsable SMILES. The column
            is overwritten in place.

    Returns:
        The same DataFrame.
    """
    parents = []
    for smi in df[name_smiles]:
        mol = Chem.MolFromSmiles(smi, sanitize=True)
        mol = rdMolStandardize.FragmentParent(mol)
        parents.append(Chem.MolToSmiles(mol, kekuleSmiles=True))

    # A plain list is assigned by position. Wrapping it in pd.Series would align on
    # index labels and leave NaN in any frame whose index is not exactly 0..n-1.
    df[name_smiles] = parents
    return df

def canonical_tautomer(df):
    """Replace every SMILES by the output of ``TautomerEnumerator().Canonicalize``.

    Each SMILES is parsed (with sanitization), canonicalized by one shared
    ``rdMolStandardize.TautomerEnumerator`` and written back as a kekulized SMILES
    string.

    Parameters:
        df: DataFrame with a ``name_smiles`` column of parsable SMILES. The column
            is overwritten in place.

    Returns:
        The same DataFrame.
    """
    te = rdMolStandardize.TautomerEnumerator()

    tautomers = []
    for smi in df[name_smiles]:
        mol = Chem.MolFromSmiles(smi, sanitize=True)
        mol = te.Canonicalize(mol)
        tautomers.append(Chem.MolToSmiles(mol, kekuleSmiles=True))

    # A plain list is assigned by position. Wrapping it in pd.Series would align on
    # index labels and leave NaN in any frame whose index is not exactly 0..n-1.
    df[name_smiles] = tautomers
    return df

def no_stereoisomer_info(df):
    """Rewrite every SMILES without stereochemistry.

    Deleting the "@" characters alone is not enough to make stereo variants of one
    compound compare equal: it leaves bracket atoms such as ``[CH]`` behind and
    does not touch the ``/`` and ``\\`` double-bond marks. The molecule is therefore
    parsed, stripped with ``Chem.RemoveStereochemistry`` and written back as a
    kekulized SMILES, the same output form the other standardization steps use.

    Parameters:
        df: DataFrame with a ``name_smiles`` column. The column is overwritten in place.

    Returns:
        The same DataFrame.
    """
    flattened = []
    for smi in df[name_smiles]:
        text = str(smi)
        mol = Chem.MolFromSmiles(text)
        if mol is None:
            # Unparsable entry: keep the best-effort textual removal of chirality marks.
            flattened.append(text.replace("@", ""))
            continue
        Chem.RemoveStereochemistry(mol)  # modifies mol in place
        flattened.append(Chem.MolToSmiles(mol, kekuleSmiles=True))

    # A plain list is assigned by position. Wrapping it in pd.Series would align on
    # index labels and leave NaN in any frame whose index is not exactly 0..n-1.
    df[name_smiles] = flattened
    return df
# Full standardization followed by removal of the stereochemistry information
# (the SMILES column itself is overwritten)
def standardize_with_and_out_stereo(df) -> pd.DataFrame:
    """Run every standardization step, ending with ``no_stereoisomer_info``.

    The steps are applied in this order, each one receiving the frame returned by
    the previous one: remove_invalid, remove_metals, normalize_groups, neutralize,
    no_mixture, canonical_tautomer, no_stereoisomer_info.
    """
    pipeline = (
        remove_invalid,
        remove_metals,
        normalize_groups,
        neutralize,
        no_mixture,
        canonical_tautomer,
        no_stereoisomer_info,
    )
    for step in pipeline:
        df = step(df)
    return df

def standardize_with_stereo(df) -> pd.DataFrame:
    """Run the standardization steps that keep the stereochemistry untouched.

    The steps are applied in this order, each one receiving the frame returned by
    the previous one: remove_invalid, remove_metals, normalize_groups, neutralize,
    no_mixture, canonical_tautomer.
    """
    pipeline = (
        remove_invalid,
        remove_metals,
        normalize_groups,
        neutralize,
        no_mixture,
        canonical_tautomer,
    )
    for step in pipeline:
        df = step(df)
    return df

##### DO NOT RUN UNLESS YOUR DATA CONTAINS NULL CHARS ########


######### DO NOT RUN UNLESS YOUR DATA CONTAINS NULL CHARS ################

# Removes NUL bytes from the input files, rewriting them in place.
# s_file may be one path (string) or a list of paths; a bare string must not be
# iterated character by character.
for file in ([s_file] if isinstance(s_file, str) else s_file):
    # Excel workbooks are binary containers in which NUL bytes are legitimate;
    # stripping them would destroy the file.
    if str(file).lower().endswith((".xlsx", ".xls")):
        continue

    # Binary mode keeps the result independent of the platform's default text
    # encoding and newline translation.
    with open(file, "rb") as f:
        raw = f.read()

    # Leave clean files untouched instead of rewriting them.
    if b"\x00" not in raw:
        continue

    with open(file, "wb") as f:
        f.write(raw.replace(b"\x00", b""))

##### DONT RUN UNLESS YOUR FILES HAVE SOME COMPATIBILITY ISSUES #######


###### DONT RUN UNLESS YOUR FILES HAVE SOME COMPATIBILITY ISSUES #######

# Re-saves every CSV input as an .xlsx workbook inside ./temp_dataset and then
# repoints path_to_files / file_list / s_file at the converted copies.
# Nothing is changed when s_file is a single path.
if not isinstance(s_file, str):
    temp_dir = "./temp_dataset"
    os.makedirs(temp_dir, exist_ok=True)  # to_excel does not create missing folders

    for file in s_file:
        base_name, extension = os.path.splitext(os.path.basename(str(file)))
        if extension.lower() != ".csv":
            # read_csv cannot load anything else; report it instead of crashing.
            print(f"Skipped (not a CSV): {file}")
            continue

        df = pd.read_csv(file, delimiter=",")
        # index=False: otherwise the row numbers come back as an extra
        # "Unnamed: 0" column when the workbook is read again.
        df.to_excel(os.path.join(temp_dir, f"{base_name}.xlsx"), index=False)

    path_to_files = temp_dir
    file_list = os.listdir(path_to_files)
    s_file = [os.path.join(path_to_files, file) for file in file_list]

### Definitely run

In [10]:
def main(files):
    
    def isInt(i):
        """Tell whether ``int(i)`` succeeds; a ValueError means it does not."""
        try:
            int(i)
        except ValueError:
            return False
        else:
            return True

    def isFloat(f):
        """Tell whether ``float(f)`` succeeds; a ValueError means it does not."""
        try:
            float(f)
        except ValueError:
            return False
        else:
            return True

    def convertTo_mg(df, column=values_col):
        """
        Parse the measurements stored in ``df[column]`` and express each amount in milligrams.

        String entries:
          * "<" and ">" are ignored;
          * the first number (decimals allowed) is the amount; when it is directly
            followed by further numbers joined by ``,`` ``-`` ``\\`` or ``/``
            (for example "10-20"), the amount is the mean of all of them;
          * the text after the amount, stripped of spaces, dashes and dots, is the unit;
          * the mass unit is read from the part of the unit before the first "/" and
            converted: kg x 1 000 000, g x 1000, cg x 10, ug/mcg/micro-g / 1000,
            mg unchanged. Any other unit leaves the amount unchanged;
          * a string without any digit yields the amount 0.

        Numeric entries are taken as milligrams already. Entries of any other type
        become ``[nan, ""]`` so that the result keeps one item per row.

        Returns:
            A pandas Series with the index of ``df`` whose items are ``[amount, unit]`` lists.
        """
        import re  # only this helper needs it

        micro = "\u00B5"     # micro sign
        greek_mu = "\u03BC"  # Greek small mu, frequently typed in place of the micro sign

        number = r"(?:\d+(?:\.\d*)?|\.\d+)"
        # One number, optionally followed by more numbers joined by , - \ or /
        amount_re = re.compile(number + r"(?:\s*[,\-\\/]\s*" + number + r")*")
        number_re = re.compile(number)

        def _parse(dose):
            """Split one dose string into [amount, unit text]."""
            cleaned = dose.replace("<", "").replace(">", "")
            found = amount_re.search(cleaned)
            if found is None:
                return [0, cleaned.strip(" -.")]
            figures = [float(token) for token in number_re.findall(found.group(0))]
            amount = figures[0] if len(figures) == 1 else float(np.mean(figures))
            return [amount, cleaned[found.end():].strip(" -.")]

        def _to_mg(amount, unit):
            """Scale ``amount`` according to the mass unit that leads ``unit``."""
            # Only the numerator matters: in "ug/kg" the mass unit is ug, not kg.
            numerator = unit.split("/")[0].strip().lower()
            if numerator.startswith("kg"):
                return amount * 1_000_000
            if numerator.startswith("mg"):
                return amount
            if numerator.startswith((micro + "g", greek_mu + "g", "ug", "mcg")):
                return amount / 1000
            if numerator.startswith("cg"):
                return amount * 10
            if numerator.startswith("g"):
                return amount * 1000
            return amount

        measure = []
        for dose in df.loc[:, column]:
            if isinstance(dose, str):
                amount, unit = _parse(dose)
                measure.append([_to_mg(amount, unit), unit])
            elif isinstance(dose, (int, float, np.integer, np.floating)) and not isinstance(dose, bool):
                measure.append([dose, "mg"])
            else:
                # Keep the row count intact; skipping the entry would shift every later value.
                measure.append([np.nan, ""])

        # Reuse the frame's index so that assigning the result back aligns row by row.
        return pd.Series(measure, index=df.index, dtype=object)
    
    # Hand-written population standard deviation; np.std() gives the same result and is preferred
    def stdCalculation(numList, mean):
        """Return the population standard deviation of ``numList`` around ``mean``.

        The squared deviations are divided by the number of values, which is what
        ``np.std`` does by default. An empty ``numList`` yields 0.0.
        """
        n = len(numList)
        if n == 0:
            return 0.0
        soma = sum((x - mean) ** 2 for x in numList)
        return math.sqrt(soma / n)

    # Computes the z-score of each number of a sequence from a given mean and standard deviation
    def z_scorer(nums: list, mean: float or int, std: float or int):
        """Return ``[(x - mean) / std for each x in nums]`` as a list, in the order of ``nums``."""
        return [(value - mean) / std for value in nums]

    # Collapses the repeated measurements of each compound into one outlier-free value
    def idOutliers(df: pd.DataFrame, name_col: str = "Drug", value_col: str = "Dose", max_z: float = 1.8, convert_to_p: bool=True):
        """
        Give every row of a group of duplicates the same representative value.

        Rows are grouped by the text found in ``name_col`` ("" and "-" share the
        group "Empty"; rows whose name is not a string are left untouched).
        For a group with several numeric values, missing values are ignored, the
        values whose absolute z-score exceeds ``max_z`` are discarded and the mean
        of the remaining ones is used. Otherwise the first value of the group is used.
        When ``convert_to_p`` is true, a positive result is stored as ``-log10(result)``;
        a non-positive result is stored unchanged.

        Parameters:
            df: input table; rows that are entirely empty are dropped first.
            name_col: column that identifies the duplicates.
            value_col: column with the values; when the notebook-level flag
                ``convert_measure`` is true each cell must be an ``[amount, unit]`` pair.
            max_z: largest absolute z-score that is still accepted.
            convert_to_p: store ``-log10(value)`` instead of the value.

        Returns:
            A new DataFrame; the caller's frame is not modified.
        """
        # Copy: the assignments below must never write into a view of the caller's frame.
        df = df.dropna(axis=0, how="all").copy()

        if convert_measure:
            values = [num for num, _unit in df.loc[:, value_col]]
        else:
            values = list(df.loc[:, value_col])

        # Group label of every row, kept as a Series so it can also select the rows later.
        labels = df[name_col].map(lambda label: "Empty" if label in ("", "-") else label)

        # Names and values are walked together, so skipping a nameless row can never
        # shift the values of the rows that follow it.
        groups = {}
        for label, value in zip(labels, values):
            if isinstance(label, str):
                groups.setdefault(label, []).append(value)

        for label, measurements in groups.items():
            numeric = all(
                isinstance(v, (int, float, np.integer, np.floating)) and not isinstance(v, bool)
                for v in measurements
            )

            if numeric and len(measurements) > 1:
                observed = [float(v) for v in measurements if not math.isnan(v)]
                if len(observed) > 1:
                    mean = np.mean(observed)
                    std = np.std(observed)
                    if std > 0:
                        scores = z_scorer(observed, mean, std)
                        kept = [v for v, score in zip(observed, scores) if abs(score) <= max_z]
                    else:
                        # All values identical: nothing can be an outlier.
                        kept = observed
                    # Fall back to the plain mean if max_z rejected every value.
                    result = float(np.mean(kept)) if kept else float(mean)
                elif observed:
                    result = observed[0]
                else:
                    result = float("nan")
            else:
                result = measurements[0]

            # log10 is only defined for positive numbers; anything else is stored as is.
            if convert_to_p and result > 0:
                result = -math.log10(result)

            df.loc[labels == label, value_col] = result

        return df
    
    # Runs the processing stages, in the required order, on one table and saves the result
    def organize(df,name):
        """Process ``df`` and write it to ``./data/<name>.csv``.

        Stages: optional unit conversion of the values column (when ``convert_measure``
        is set), SMILES standardization without stereochemistry, merging of the
        duplicated measurements, removal of the duplicated rows (first one kept).
        ``name`` is printed before the work starts and again once the file is written.
        """
        print(name)

        if convert_measure:
            df[values_col] = convertTo_mg(df, values_col)

        standardized = standardize_with_and_out_stereo(df)
        merged = idOutliers(standardized, duplicate_identifier_column, values_col, max_z_score, convert_to_p)
        unique_rows = merged.drop_duplicates(keep="first", subset=duplicate_identifier_column)

        unique_rows.to_csv(f"./data/{name}.csv")
        print(f"Generated: {name}")

    # Works with a single path (string) as well as with a list of paths
    def read_through_files(files):
        """
        Load every table found in ``files`` and hand it to ``organize``.

        ``files`` is either one path (a string) or an iterable of paths.
        A path ending in ".csv" is read with ``pd.read_csv``; anything else is opened
        as an Excel workbook. In a workbook, sheets named "." are skipped; a workbook
        with several sheets produces one result per sheet, named after the sheet.

        Result names: ``result_name`` for a single path; for a list, the path with
        ``path_to_files``, any leading path separator and the file extension removed.
        """

        def _strip_extensions(text):
            """Remove the ".xlsx" / ".csv" markers from a name."""
            return text.replace(".xlsx", "").replace(".csv", "")

        def _process(path, name):
            """Read one file and pass each table it contains to organize."""
            if str(path).lower().endswith(".csv"):
                organize(pd.read_csv(path, delimiter=","), name)
                return

            # The context manager closes the workbook handle, and the sheets are read
            # from the already opened workbook instead of re-opening the file each time.
            with pd.ExcelFile(path) as workbook:
                sheets = workbook.sheet_names
                several_sheets = len(sheets) > 1
                for sheet in sheets:
                    if sheet == ".":
                        continue
                    df = pd.read_excel(workbook, sheet_name=sheet)
                    organize(df, sheet if several_sheets else name)

        if isinstance(files, str):
            _process(files, _strip_extensions(result_name))
        else:
            for file in files:
                # Removing the folder leaves the separator behind; drop it so the
                # name is a clean file name.
                relative = str(file).replace(path_to_files, "").lstrip("\\/")
                _process(file, _strip_extensions(relative))

    read_through_files(files=files)

### Run to execute
###### Equivalent to `if __name__ == "__main__":`

In [11]:
# Execute everything: run the pipeline on the path(s) stored in s_file
main(s_file)
    

\Hepatic necrosis
Generated: \Hepatic necrosis
\Hepatic Steatosis
Generated: \Hepatic Steatosis
\Hepatitis
Generated: \Hepatitis
\Hepatomegaly
Generated: \Hepatomegaly
\Liver disorder
Generated: \Liver disorder
\Liver injury
Generated: \Liver injury
