In [None]:
import sys
import glob
import numpy as np
import os
from rdkit import Chem
import pandas as pd
from rdkit.Chem.PandasTools import LoadSDF
import matplotlib.pyplot as plt
import random
import networkx as nx
from networkx import Graph
from itertools import combinations
from rapidfuzz import process
from rapidfuzz.distance import JaroWinkler

In [None]:
# Load the reaction spreadsheet; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_excel("10001_11911.xlsx" , engine="openpyxl")

In [None]:
# Reagent name that rows of the "Reagent" column are compared against (exact match).
reagent = "3-chloro-benzenecarboperoxoic acid"
# The full "Reagent" column of the loaded sheet.
reagentList = df['Reagent']

In [None]:
# Columns carried forward into the analysis.
selected_columns = ["Reaction ID", "Reaction", "Fulltext of reaction" , "Reactant","Yield (numerical)" , "Reagent" , "Catalyst" , "References"]

# Keep rows for the chosen reagent that have both a reaction string and a
# numerical yield, restricted to the selected columns and renumbered from 0.
refinedDF = (
    df.loc[df["Reagent"] == reagent]
    .reset_index(drop=True)
    .loc[lambda frame: frame["Reaction"].notna(), selected_columns]
    .loc[lambda frame: frame["Yield (numerical)"].notna()]
    .reset_index(drop=True)
)

print(refinedDF.shape)

In [None]:
# Display the filtered frame as the cell output.
refinedDF

In [None]:
# Partition a dataset into per-key sub-frames.
def groupBy(df, groupVar):
    """Split ``df`` into one DataFrame per distinct value of column ``groupVar``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.
    groupVar : str
        Name of the column whose values define the groups.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct non-missing key, ordered by ascending key.
        Each frame keeps the original row order and has a fresh 0..n-1 index.
        An empty input gives an empty list.

    Notes
    -----
    Rows whose key is missing (NaN/None) are left out. Comparing against NaN
    never matches, so a missing key could only ever produce an empty frame.
    """
    # A single groupby pass replaces one full boolean scan of df per key.
    return [
        group.reset_index(drop=True)
        for _, group in df.groupby(groupVar, sort=True)
    ]



In [None]:
# One sub-frame per distinct "Reaction ID" in the filtered data.
partitionedDF = groupBy(refinedDF, "Reaction ID")

In [None]:
# Number of per-reaction sub-frames produced by the partitioning.
print(len(partitionedDF))


In [None]:
# Build masterDF: one summary row per reaction in partitionedDF.
# A reaction is summarised by its best-yielding entry; the reported yield is the
# mean over that entry and every other entry whose catalyst string is similar.
columnsMAST = ["ID" , "SMILES" , "Yield" ,"Catalyst" ,  "Source"]
CATALYST_SIMILARITY_CUTOFF = 0.75  # minimum Jaro-Winkler score for pooling two catalyst strings
MAX_VALID_YIELD = 100.00           # a larger parsed yield disqualifies the whole reaction


def _parseYields(rawYields):
    """Return ``{row position: yield as float}`` for the entries that parse as numbers.

    Entries that cannot be converted are reported on stdout and skipped.
    """
    parsed = {}
    for position, raw in enumerate(rawYields):
        try:
            parsed[position] = float(raw)
        except (TypeError, ValueError):
            # Report the offending raw value (not a stale/unbound converted one).
            print("ValueError" , position , raw)
    return parsed


def _summarizeReaction(smallDF):
    """Return ``[id, smiles, yield, catalyst, source]`` for one reaction, or None to skip it.

    ``smallDF`` holds all entries of a single reaction with a 0..n-1 index.
    The reaction is skipped when any of its "Reaction" strings contains "."
    (presumably a multi-component reaction - TODO confirm), when no yield
    parses as a number, or when the best yield is not <= MAX_VALID_YIELD.
    """
    reactions = list(smallDF["Reaction"])
    if any("." in reaction for reaction in reactions):
        return None

    # Missing catalysts are compared as the literal string "None".
    catalysts = np.where(pd.isna(smallDF["Catalyst"]), "None", smallDF["Catalyst"])

    yieldByRow = _parseYields(list(smallDF["Yield (numerical)"]))
    if not yieldByRow:
        return None

    bestYield = max(yieldByRow.values())
    # Written as "not <=" so that a NaN maximum is rejected as well.
    if not bestYield <= MAX_VALID_YIELD:
        return None

    # First entry (in row order) that reaches the best yield.
    targetInd = next(row for row, value in yieldByRow.items() if value == bestYield)
    finalCat = catalysts[targetInd]

    # Pool yields by row position, so entries whose yield failed to parse can
    # neither shift the lookup nor contribute to the mean.
    pooledYields = [bestYield]
    for row, value in yieldByRow.items():
        if row == targetInd:
            continue
        if JaroWinkler.similarity(catalysts[row], finalCat) >= CATALYST_SIMILARITY_CUTOFF:
            pooledYields.append(value)

    # Text before the first ">" of the reaction string is stored as the SMILES.
    smiles = str(reactions[targetInd].split(">")[0])
    return [
        smallDF["Reaction ID"].iloc[targetInd],
        smiles,
        np.mean(pooledYields),
        finalCat,
        smallDF["References"].iloc[targetInd],
    ]


# Collect rows first and build the frame once instead of growing it row by row.
masterRows = []
for smallDF in partitionedDF:
    summaryRow = _summarizeReaction(smallDF)
    if summaryRow is not None:
        masterRows.append(summaryRow)

masterDF = pd.DataFrame(masterRows, columns=columnsMAST)
                

In [None]:
# (rows, columns) of the summary table.
masterDF.shape

In [None]:
# Display the summary table as the cell output.
masterDF

In [None]:
# Write the summary table to mCPBA3.xlsx (relative to the working directory),
# omitting the DataFrame index column.
masterDF.to_excel("mCPBA3.xlsx", index=False, engine="openpyxl")

In [None]:
# Prompt until the user supplies at least two comma-separated, non-empty labels.
while True:
    raw_input = input("Enter the string representations for partition (Ex: Mn,Manganese,Jacobsen): ")
    partitionList = [piece.strip() for piece in raw_input.split(",")]
    print(partitionList)
    # Valid input: two or more parts, none of them blank after stripping.
    if len(partitionList) >= 2 and all(part != '' for part in partitionList):
        break
    print("Invalid input. Please enter comma-separated values without empty parts (e.g., Mn,Manganese,Jacobsen).")

In [None]:
# Load the second export together with the list of column names to keep.
# NOTE(review): both paths are absolute and machine-specific; consider making
# them relative to a configurable data directory.
REAXYS_EXPORT_PATH = "/home/danny/Downloads/UTF-8Reaxys_Exp_20250212_185249 copy.xlsx"
COLUMNS_FILE_PATH = "/home/danny/Code/stahl_ML_DataScience/reaxysProcessing/Example/columns.dat"

df = pd.read_excel(REAXYS_EXPORT_PATH , engine="openpyxl")

# Fail with a clear message here rather than with a NameError on `headers` below.
if not os.path.isfile(COLUMNS_FILE_PATH):
    raise FileNotFoundError(f"Column list file not found: {COLUMNS_FILE_PATH}")

# The first line of the file is a comma-separated list of column names.
with open(COLUMNS_FILE_PATH, "r") as file:
    headers = [col.strip() for col in file.readline().split(",")]
print(headers)

# This cell indexes headers[1] and headers[2]; a shorter list cannot work.
if len(headers) < 3:
    raise ValueError(
        f"Expected at least 3 column names in {COLUMNS_FILE_PATH}, got {len(headers)}: {headers}"
    )

# Keep only rows where both the second and third listed columns are present.
refinedDF = df.dropna(subset=[headers[1], headers[2]]).copy()

In [None]:
# Retain only the columns whose names appear in `headers`, preserving their order.
refinedDF = refinedDF.loc[:, refinedDF.columns.isin(headers)]
refinedDF

In [None]:
# Search terms matched (case-insensitively, as substrings) against the reagent text.
# NOTE(review): the next cell assigns the same list again, so this cell is redundant.
reagentList = ["NaOCl" , "bleach" , "sodium hypochlorite"]


In [None]:
# Select the rows whose reagent text mentions any of the search terms.
reagentList = ["NaOCl" , "bleach" , "sodium hypochlorite"]

# Lower-cased string form of the reagent column (headers[3]); missing values
# become the text "nan", exactly as str() would render them.
reagentText = refinedDF[headers[3]].astype(str).str.lower()

# OR together one literal (non-regex) substring test per search term. This
# replaces a per-row loop that re-concatenated the result frame on every hit.
matchMask = np.zeros(len(refinedDF), dtype=bool)
for term in reagentList:
    matchMask |= reagentText.str.contains(term.lower(), regex=False).to_numpy(dtype=bool)

# Same columns as refinedDF, matching rows only, renumbered from 0.
substrateDF = refinedDF[matchMask].reset_index(drop=True)

In [None]:
# Display the reagent-filtered rows as the cell output.
substrateDF

In [None]:
# Split the reagent-filtered rows into one sub-frame per "Reaction ID".
# NOTE(review): assumes "Reaction ID" is among the columns kept via `headers` - confirm.
reactionDF = groupBy(substrateDF , "Reaction ID")

In [None]:
# Scratch check: print the shortest (by character count) of three example labels.
print(min(["Mn" , "Manganese" , "Jacobsen"] , key = len))