# Notes (Jupyter notebook)
Only for testing purposes

In [None]:
import pandas as pd

# Load the cleaned Schneider 50k reaction table (tab-separated).
# Columns of interest:
#   original_rxn: the mapped reaction SMILES (to be transformed for kernels)
#   rxn_class:    the classification class of the reaction (to be learned)
df = pd.read_csv("schneider50k_clean.tsv", sep="\t")

# The "source" column is not needed in this notebook.
df = df.drop(columns=["source"])

# Report how many distinct classes exist and how many reactions each one has.
rxn_classes = df["rxn_class"]
print("Number of different classes:", len(rxn_classes.unique()))
print("Number of reactions per class:")
print(rxn_classes.value_counts())

# Draw 5 distinct classes at random (fixed seed, so the draw is repeatable).
# NOTE: the selection is only kept in memory; nothing is written to a file here.
random_classes = rxn_classes.drop_duplicates().sample(n=5, random_state=42)

In [None]:
# [O:22]=[C:14]([NH:13][CH2:12][CH2:11][NH:10][c:3]1[n:2][s:1][c:5]2[cH:6][cH:7][cH:8][cH:9][c:4]21)[c:15]1[cH:16][c:17](Cl)[cH:18][cH:19][n:20]1.[CH2:27]1[CH2:26][O:25][CH2:24][CH2:23][NH:28]1>>[O:22]=[C:14]([NH:13][CH2:12][CH2:11][NH:10][c:3]1[n:2][s:1][c:5]2[cH:6][cH:7][cH:8][cH:9][c:4]21)[c:15]1[cH:16][c:17]([N:28]2[CH2:27][CH2:26][O:25][CH2:24][CH2:23]2)[cH:18][cH:19][n:20]1

from scripts import plot_and_print_its_graphs
from networkx.algorithms import all_pairs_shortest_path, floyd_warshall
from synkit.IO import rsmi_to_graph

#plot_and_print_its_graphs(df.iloc[46]["original_rxn"])

# Build the two graphs for the reaction stored at row position 46.
educt, product = rsmi_to_graph(df.iloc[46]["original_rxn"])

# Transform educt to undirected graph
educt = educt.to_undirected()

# floyd_warshall() yields pairwise *distances*, not node sequences, so it
# cannot back the "Shortest path" output below. all_pairs_shortest_path()
# returns the actual node lists (hop-count based, edge weights are ignored).
paths = dict(all_pairs_shortest_path(educt))

# Print all shortest paths
for source_node, target_dict in paths.items():
    for target_node, path in target_dict.items():
        print(f"Shortest path from {source_node} to {target_node}: {path}")

# Every path shows up twice (a -> b and its reverse b -> a); needs to be cleaned up



In [None]:
# Number of reactions per rxn_class; kept as a notebook-wide name because
# create_varied_set() reads it.
class_counts = df["rxn_class"].value_counts()
print(class_counts)

In [None]:
# Create varied sets containing 3-5 different classes and each 20-200 reactions
import random
from typing import Optional


def create_varied_set(classes: int, reactions_per_class: Optional[int] = None) -> pd.DataFrame:
    """Sample reactions from a random selection of reaction classes.

    Reads the notebook-level ``df`` (reaction table) and ``class_counts``
    (its per-class counts); both must exist before this is called.

    Args:
        classes: Number of distinct classes to pick at random.
        reactions_per_class: Number of rows to draw from every picked class.
            When ``None`` (or 0), a separate size between 20 and 200 is drawn
            for each class.

    Returns:
        The sampled rows of all picked classes, stacked in pick order, with
        a fresh 0..n-1 index.

    Raises:
        ValueError: If ``classes`` exceeds the number of available classes
            (raised by ``random.sample``).
    """
    selected_classes = random.sample(list(class_counts.index), classes)
    subsets = []

    for cls in selected_classes:
        class_rows = df[df["rxn_class"] == cls]

        # Use a local size so the argument is not overwritten: previously the
        # first random draw was silently reused for every following class.
        if reactions_per_class:
            wanted = reactions_per_class
        else:
            wanted = random.randint(20, 200)

        # sample() raises when asked for more rows than exist, so cap the
        # request at the size of the class.
        wanted = min(wanted, len(class_rows))
        subsets.append(class_rows.sample(n=wanted, random_state=42))

    # Concatenate once at the end instead of growing a frame inside the loop.
    varied_set = pd.concat(subsets) if subsets else pd.DataFrame()
    return varied_set.reset_index(drop=True)

# Demo: four random classes, with the per-class sample size left to the function.
varied_sample = create_varied_set(4, None)
print(varied_sample)

In [None]:
from scripts import plot_and_print_its_graphs
from phi_transformation import phi_vertex

# Plot the first five reactions. A reaction that fails is reported and
# skipped so that the remaining ones are still shown.
for i in range(5):
    try:
        reaction = df.iloc[i]["clean_rxn"]
        plot_and_print_its_graphs(reaction)
    except Exception as exc:  # best effort, but never hide the failure
        print(f"Skipping reaction {i}: {type(exc).__name__}: {exc}")

In [None]:
from scripts import plot_and_print_its_graphs
from phi_transformation import phi_vertex
from synkit.IO import rsmi_to_graph


# Print phi_vertex() of both graphs for the first 50 reactions. A reaction
# that fails is reported and skipped instead of being dropped silently.
for i in range(50):
    try:
        reaction = df.iloc[i]["clean_rxn"]
        educt_graph, product_graph = rsmi_to_graph(reaction)
        print(f"Educt graph: {phi_vertex(educt_graph)}")
        print(f"Product graph: {phi_vertex(product_graph)}")
    except Exception as exc:  # best effort, but never hide the failure
        print(f"Skipping reaction {i}: {type(exc).__name__}: {exc}")

In [None]:
from phi_transformation import phi_shortest_path

# Run phi_shortest_path() on one hard-coded mapped reaction SMILES and show
# whatever it returns.
example_rxn = "[CH3:17][S:14](=[O:15])(=[O:16])[N:11]1[CH2:10][CH2:9][N:8](Cc2ccccc2)[CH2:13][CH2:12]1>>[CH3:17][S:14](=[O:15])(=[O:16])[N:11]1[CH2:10][CH2:9][NH:8][CH2:13][CH2:12]1"
t = phi_shortest_path(example_rxn)

print(t)

In [None]:
from networkx.algorithms import all_pairs_shortest_path
from synkit.IO import rsmi_to_graph

# Only the first graph returned for the reaction is used; the second one is
# discarded.
graph, _ = rsmi_to_graph("[CH3:17][S:14](=[O:15])(=[O:16])[N:11]1[CH2:10][CH2:9][N:8](Cc2ccccc2)[CH2:13][CH2:12]1>>[CH3:17][S:14](=[O:15])(=[O:16])[N:11]1[CH2:10][CH2:9][NH:8][CH2:13][CH2:12]1")

# Store mapping of node to label. Built exactly once: wrapping the
# comprehension in a loop over the nodes rebuilt the same dict for every
# node and left the name undefined for an empty graph.
node_to_label = {node: attrs["element"] for node, attrs in graph.nodes(data=True)}

print(node_to_label)

paths = dict(all_pairs_shortest_path(graph))

# For each path, print the node sequence and its element-label string
for source, target_dict in paths.items():
    for target, path in target_dict.items():
        # Print path
        print(f"Path from {source} to {target}: {' -> '.join(map(str, path))}")
        # Convert the path to label concatenation using node_to_label
        label_path = "".join(node_to_label[n] for n in path)
        print(f"Label path: {label_path}")

In [None]:
# Write every entry of df's clean_rxn column to its own line in reactions.txt.
# (df is the frame already held in memory; the TSV is not re-read here.)
with open("reactions.txt", "w", encoding="utf-8") as f:
    # Iterate the column directly: indexing df.iloc[i] per row builds a full
    # row object for each of the reactions just to read one field.
    f.writelines(reaction + "\n" for reaction in df["clean_rxn"])

In [None]:
# Read files from data/*

import glob
import pandas as pd
import random

import re

# Read schneider50k
with open("schneider50k_clean.tsv", "r") as f:
    schneider50k = pd.read_csv(f, sep="\t")

with open("data/pre-computed-feature_sets_part_1.xlsx", "rb") as f:
    part1 = pd.read_excel(f)
    # Show the first part (pandas abbreviates long frames when printing)
    print(part1)


def _natural_key(path):
    """Sort key that orders embedded numbers numerically (part_2 < part_10)."""
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", path)]


# Collect the feature workbooks. Two pitfalls are avoided here:
#  * the combined file written at the end of this cell also matches the glob,
#    so a re-run would feed the previous output back in -> skip it
#  * glob returns files in arbitrary order, but rxn_class is attached by row
#    position below -> use a stable, numeric-aware order
#    NOTE(review): assumes the numeric part order equals the row order of
#    schneider50k_clean.tsv - confirm.
feature_files = sorted(
    (name for name in glob.glob("data/*.xlsx") if not name.endswith("combined_data.xlsx")),
    key=_natural_key,
)
if not feature_files:
    raise FileNotFoundError("No feature files found matching data/*.xlsx")

frames = []
for filename in feature_files:
    with open(filename, "rb") as f:
        frames.append(pd.read_excel(f))

# Stack all parts. Reading each file into `data` and then concatenating
# `data` with itself kept only the last file, duplicated.
data = pd.concat(frames, ignore_index=True)

# The column-wise concat below aligns on the index; a length mismatch would
# silently pad with NaN rows instead of failing.
if len(data) != len(schneider50k):
    raise ValueError(
        f"Feature rows ({len(data)}) do not match schneider50k rows ({len(schneider50k)})"
    )

# Append schneider50k[rxn_class] to data
data = pd.concat([data, schneider50k[['rxn_class']]], axis=1)

# Validate all columns exist

required_columns = [
    "educt_phi_vertex_dict",
    "product_phi_vertex_dict",
    "symmetric_difference_vertex_dict",
    "educt_phi_edge",
    "product_phi_edge",
    "symmetric_difference_edge",
    "educt_phi_shortest_path",
    "product_phi_shortest_path",
    "symmetric_difference_shortest_path",
    "rxn_class"
]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"Data is missing required columns: {missing_columns}")

# Validate 49999 < rows < 50001 (i.e. exactly 50000) before anything is
# written, so a broken result never ends up on disk.
if not (len(data) > 49999 and len(data) < 50001):
    raise ValueError(f"Data has {len(data)} rows, expected ~50000")
print("Data validation passed: correct number of rows.")

with open("data/combined_data.xlsx", "wb") as f:
    data.to_excel(f, index=False)


In [None]:
# Read data/combined_data.xlsx and print the columns
import pandas as pd
# Note: this rebinds the notebook-wide `df` to the combined workbook.
df = pd.read_excel("data/combined_data.xlsx")
combined_columns = df.columns.tolist()
print("Columns in combined_data.xlsx:", combined_columns)

# DRF Nodes = Symm. Diff of Nodes
# DRF Edges = Symm. Diff of Edges
# DRF Shortest Paths = Symm. Diff of Shortest Paths

# For ITS there exists only one ... (TODO: this note was left unfinished)
# ITS Nodes = 