# CGRdb tutorial
The tutorial expects that you have completed the Docker compose and the Postgres DB is running!

In [None]:
import sys
sys.modules['__main__'].__file__ = None  # ad-hoc for disabling the Pony interactive mode.

from chythonDB.database import Reaction, Molecule, Substance, MoleculeStructure, db
from chython import RDFRead, ReactionContainer, SDFRead
from pony.orm import db_session

In [None]:
# If database is not empty and you want to clear the whole DB, uncomment
#from chythonDB.database import db
#db.bind(provider='postgres', user='postgres', host='localhost', password="example", database='test',
#        port=5432)
#db.generate_mapping(check_tables=False, create_tables=True)
#db.drop_all_tables(with_all_data=True)
#db.commit()
#db.disconnect()
#db.unbind()
# then run the connection cell again and proceed with the cell below

In [None]:
# Connect to the database.
# Change the database name, password, port, host and user according to the
# settings in the docker-compose.yaml file.
connection_settings = {
    'provider': 'postgres',
    'user': 'postgres',
    'password': 'example',
    'host': 'localhost',
    'port': 5432,
    'database': 'test',
}
db.bind(**connection_settings)
db.generate_mapping(create_tables=True)

# ER Diagram

![ERDiagram.png](attachment:ERDiagram.png)

In [None]:
# Define the DB parameters for fingerprinting and the LSH algorithm.
# Linear fragments are used for the fingerprints.
# Parameters such as the length of the fragment, the length of the fingerprint vector,
# the number of active bits and the number of bit pairs per fragment can be modified.
from pony.orm import db_session
from chythonDB.database.config import Config

fingerprint_settings = {
    "min_radius": 1,
    "max_radius": 4,
    "length": 2048,
    "number_active_bits": 2,
    "number_bit_pairs": 4,
}

with db_session():
    db.execute("create extension if not exists intarray;")
    # All settings are stored in a config table.
    Config(key="fingerprint", value=fingerprint_settings)
    Config(key="lsh_num_permute", value=64)
    Config(key="lsh_threshold", value=0.7)

# Data storing

Load example reaction and molecule

In [None]:
# read one SMILES from a zip file and build a molecule object with chython's `smiles` parser
import zipfile
from chython import smiles

# test dataset with molecules within the library; the `with` statement closes both the
# archive and the member stream as soon as the first line has been read
with zipfile.ZipFile("../dataset/Chembl28.smi.zip") as file, file.open("Chembl28.smi") as smi_file:
    line = next(smi_file).decode().split()

# a second field starting with "|" is appended to the SMILES before parsing
# NOTE(review): the two fields are joined without a separator — confirm that
# chython's parser accepts the extension block without a preceding space
if len(line) >= 3 and line[1].startswith("|"):
    mol = smiles("".join([line[0],line[1]]))
else:
    mol = smiles(line[0])

In [None]:
mol.clean2d() # generate coordinates for graph, to make it visualizable
mol

In [None]:
# read one SMILES from a zip file and build a reaction object with chython's `smiles` parser
import zipfile
from chython import smiles

# test dataset with reactions within the library; the `with` statement closes both the
# archive and the member stream as soon as the first line has been read
with zipfile.ZipFile("../dataset/USPTO.smi.zip") as file, file.open("USPTO.smi") as smi_file:
    line = next(smi_file).decode().split()

# a second field starting with "|" is appended to the SMILES before parsing
# NOTE(review): the two fields are joined without a separator — confirm that
# chython's parser accepts the extension block without a preceding space
if len(line) >= 3 and line[1].startswith("|"):
    reaction = smiles("".join([line[0],line[1]]))
else:
    reaction = smiles(line[0])

In [None]:
reaction.clean2d() # generate coordinates for graph, to make it visualizable
reaction

## Direct storage of molecules
Only new molecules can be stored in the DB. Attempting to store a molecule that already exists raises an exception.

In [None]:
# store the molecule in the DB
with db_session():
    m = Molecule(mol)
    idx = m.id  # id of the new record
m  # molecule record object

# A MoleculeStructure object was automatically created for this molecule
# and became the canonic_structure of this Molecule. Later the user can introduce additional
# structures representing the same Molecule (for example, a tautomer).

In [None]:
# Relation to the canonic structure
with db_session():
    m = Molecule.get(mol) # get DB object by structure
    can_structure = m.canonic_structure
can_structure

In [None]:
# lookup for the whole list of structures that correspond to this molecule
with db_session():
    m = Molecule.get(mol) # get DB object by structure
    structures = list(m.structures)
structures

In [None]:
# take the stored structure object (a chython molecule container) from the DB
with db_session():
    m = Molecule.get(mol) 
    structure = m.canonic_structure.structure
structure

In [None]:
# alternative way of accessing the structures
with db_session():
    m = Molecule.get(mol) 
    structure = list(m.structures)[0].structure 
structure
# structure is a chython molecule container and provides all methods of that class

In [None]:
# The fingerprint of the MoleculeStructure can be accessed, as well as its SMILES
# (for users who do not need the graph-based representation of the molecule)
with db_session():
    m = Molecule.get(mol) 
    fp = m.canonic_structure.fingerprint
fp

In [None]:
with db_session():
    m = Molecule.get(mol) 
    sm = m.canonic_structure.smiles
sm

In [None]:
# Also reverse relation can be seen
with db_session():
    m = Molecule.get(mol) 
    moldb = m.canonic_structure.molecule
moldb

## Storing of Substances as a safer way for storage of chemical data
A Substance instance does not have to be unique, as it represents a real jar with a chemical product, with a manufacturer, expiration date, barcode, etc. As an example, we will give numbers as a name for the Substance. It is done this way mainly because a Substance can include one or several molecules, and the user has a possibility to store them as a Substance entity instead.

In [None]:
# NOTE: "C1CCCCC1" is a six-membered saturated carbon ring, i.e. cyclohexane;
# the variable is nevertheless called `hexane` in the cells that follow.
hexane = smiles("C1CCCCC1")
hexane.clean2d()
hexane

In [None]:
y = mol.union(hexane, remap=True) # let's add solvent to our molecule and change atom mapping to avoid collisions

In [None]:
y.clean2d()
y

In [None]:
mol, hexane = y.split() # separate molecules back with their proper mappings

In [None]:
# Substance takes a list of tuples (molecule instance, its fraction).
# If there is no information about the ratio, None can be used.
# For example, we add some cyclohexane solvent and store the solution of the previous molecule.
subs = [(mol, 0.2),(hexane,0.8)]
with db_session():
    s = Substance(subs, name="AA000001")
    idx = s.id
s

In [None]:
s.name # show the name of the substance

In [None]:
# accessing the structure of substance
with db_session():
    s = Substance[idx]
    s_struct = s.structure
s_struct.clean2d()
s_struct

In [None]:
with db_session():
    s = Substance[idx]
    component1, component2 = s.components # components are either Molecules or NonOrganics

In [None]:
# accessing the molar_fraction property
print(f"""
molar_fraction of 1st mol, {component1.molar_fraction}
molar_fraction of 2nd mol, {component2.molar_fraction}
      """)

In [None]:
# get individual structures of components
with db_session():
    s = Substance[idx]
    component1, component2 = s.components
    structure1 = component1.structure
    structure2 = component2.structure

In [None]:
structure1

In [None]:
structure2

## Reaction storage
The Reaction instance is considered as an individual experiment, so duplicates are possible

In [None]:
reaction

In [None]:
# store reaction without CGR structure 
with db_session:
    r = Reaction(reaction)
    idx = r.id
r  # reaction record object

In [None]:
# check the structure of reaction
with db_session:
    r = Reaction[idx]
    r_structure = r.structure
r_structure

In [None]:
# check the substances of the reaction
with db_session:
    # Look the record up by the id saved when the reaction was stored instead of a
    # hard-coded 1, which is only correct when the DB was empty beforehand.
    r = Reaction[idx]
    substances = list(r.substances)
substances

In [None]:
# check the structure of any substance from the reaction
with db_session:
    # Look the record up by the id saved when the reaction was stored instead of a
    # hard-coded 1, which is only correct when the DB was empty beforehand.
    r = Reaction[idx]
    substances = list(r.substances)
    str1 = substances[0].structure
str1

In [None]:
# store reaction with CGR structure
with db_session:
    r = Reaction(reaction, keep_cgr=True)
r  # reaction record object

In [None]:
# check the structure of CGR
with db_session:
    r = Reaction[2]
    str2 = r.CGR.structure
str2.clean2d()
str2

# Getting data from DB

The cartridge is based on Pony ORM (https://ponyorm.com).

The main concepts of interacting with the DB through Pony ORM are preserved.

# Molecules Searching
All searches are based on Molecules and their MoleculeStructure entities, the following search types are supported:
* exact match - return a molecule that corresponds exactly to a provided structure
* substructure search - return python generator with Molecules that are superstructures of the query structure
* similarity search - return python generator with Molecules that are similar to query structure


## Please upload data from Chembl28 to proceed
You can upload part of the dataset, to save time and resources, but take into account that it also affects the results of the searches.

### Substructure search of molecules

In [None]:
from chythonDB.tests import mol_queries
from chython import smiles

In [None]:
query = smiles(list(mol_queries.values())[0])

In [None]:
query.clean2d()
query

In [None]:
res = Molecule.substructres(query,  ordered=True, request_only=False, tanimoto_limit = None)
# NOTE(review): the method is spelled `substructres` here while the comments below say
# "substructures" — confirm the spelling against the chythonDB API.
# This is a difficult query for the DB; please use a more precise query for faster responses.
# Yields a generator object CursorHolder.
# By default the substructures method is used with the "ordered=True" option, so it will return all
# substructures ordered by similarity. If the user does not want that, the "ordered=False" option
# must be used. It will increase the speed of the first response.
# The "request_only" option is for internal usage or debugging, to see the actual SQL request.
# The last option, "tanimoto_limit=None", indicates that the Tanimoto limit is taken from the config table;
# this can be overridden. The user can set a higher Tanimoto limit to reduce the number of retrieved structures.
# We do not recommend setting the limit below the default value. This could lead to the erroneous perception
# that the request will return ALL other molecules from the DB, while in fact it will give only the rest of
# the molecules from the buckets of the LSH algorithm. So in order to go for lower similarities, please
# reconfigure the DB and re-index it with a new similarity limit.

In [None]:
a = next(res) # result is a Molecule obj, a MoleculeStructure obj of the query, and the Tanimoto score

In [None]:
a[1].structure.clean2d() # taking structure through MoleculeStructure obj
a[1].structure

In [None]:
a[0].canonic_structure.structure.clean2d() # taking structure through canonic_structure(main) of the Molecule obj
a[0].canonic_structure.structure

In [None]:
# Tanimoto
a[2]

### Similarity search of molecules
Searching of molecules by similarity (Tanimoto coefficient). By default, molecules are considered similar if their Tanimoto index is greater than `0.7`.

In [None]:
res = Molecule.similars(query)  # options and return is the same as substructure method

In [None]:
a = next(res)
a

In [None]:
a[1].structure.clean2d() # taking structure through MoleculeStructure obj
a[1].structure

In [None]:
a[0].canonic_structure.structure.clean2d() # taking structure through canonic_structure(main) of Molecule obj
a[0].canonic_structure.structure

In [None]:
# Tanimoto
a[2]

## Searching reactions by one component
* search molecules by similarity and return reactions with these molecules
* search molecules by substructure and return reactions with these molecules

In [None]:
from chythonDB.tests import mol_queries
from chython import smiles

In [None]:
res = Molecule.substructures_in_reactions(query) 
# search molecules by substructure of a component and return reactions with them

In [None]:
a = next(res)

In [None]:
a

In [None]:
# NOTE(review): the original comment called a[0] a MoleculeStructure obj, but the parallel
# similars_in_reactions cell labels a[0] a Reaction obj — confirm the tuple layout.
a[0].structure.clean2d() # taking structure through the first element of the result
a[0].structure

In [None]:
a[1].canonic_structure.structure.clean2d()
a[1].canonic_structure.structure

In [None]:
res = Molecule.similars_in_reactions(query) 
# search molecules by similarity and return reactions with those molecules

In [None]:
a = next(res)

In [None]:
a

In [None]:
a[0].structure.clean2d() # taking structure through Reaction obj
a[0].structure

In [None]:
a[1].canonic_structure.structure.clean2d()
a[1].canonic_structure.structure

In [None]:
# Tanimoto
a[2]

## Searching Reactions by all components
It is possible to perform the following types of searches:
* search for reactions by all components according to their role in reaction:
    * by substructure of components
        * with reaction center control (requires atom-to-atom mapping for the query reaction and reactions stored in DB)
        * without reaction center control (default)
    * by similarity of components
        * with reaction center control (requires atom-to-atom mapping for the query reaction and reactions stored in DB)
        * without reaction center control (default)

In [None]:
from chythonDB.tests import reaction_queries
from chython import smiles

In [None]:
query = smiles(list(reaction_queries.values())[0])

In [None]:
query.clean2d()
query

In [None]:
# return reactions by total similarity of the components (with option ordered=True)
res = Reaction.similars(query,ordered=True, fix_roles=True,
                 mapping=False, request_only=False)
# "fix_roles" is responsible for roles matching in query and result reaction, if False - results can contain cases
# of mismatched roles (e.g., a product molecule within reactants)
# "mapping" option turns on the control of reaction center (only for reactions that were stored with mapping,
# query should also contain mapping)
# "request_only" - for debugging and internal usage of DB, returns actual SQL request

In [None]:
a = next(res) # returns Reaction obj, list of Molecules that matched query and total Tanimoto
a

In [None]:
a[0].structure.clean2d() # taking structure through Reaction obj
a[0].structure

In [None]:
a[1][0].canonic_structure.structure.clean2d() # taking structure through Molecule obj
a[1][0].canonic_structure.structure

In [None]:
# Tanimoto
a[2]

In [None]:
# The same as previous request, but with control of reaction center
res = Reaction.similars(query,ordered=True, fix_roles=True,
                 mapping=True, request_only=False) 

In [None]:
a = next(res) # returns Reaction obj, list of Molecules that matched query and total Tanimoto
a

In [None]:
a[0].structure.clean2d() # taking structure through Reaction obj
a[0].structure

In [None]:
a[1][0].canonic_structure.structure.clean2d() # taking structure through Molecule obj
a[1][0].canonic_structure.structure

## Searching Reactions by Condensed Graph of Reaction (CGR)
* The following search types are available:
    * by substructure of CGR
    * by similarity of CGR

In [None]:
from chythonDB.tests import cgr_queries
from chythonDB.database import CGR

In [None]:
query = smiles(list(cgr_queries.values())[0])

In [None]:
query.clean2d()
query

In [None]:
res = CGR.similars(query) 
# the same options that are available for Molecule.similars
# can also be applied here (ordered, request_only)

In [None]:
a = next(res) # returns CGR that matched query and total Tanimoto
a

In [None]:
a[0].structure.clean2d()
a[0].structure

In [None]:
# Tanimoto
a[1]

In [None]:
res = CGR.substructres(query)
# NOTE(review): the method is spelled `substructres` here while the comments below say
# "substructure" — confirm the spelling against the chythonDB API.
# Yields a generator object CursorHolder.
# By default the substructure method is used with the "ordered=True" option,
# so it will return all substructures ordered by similarity.
# If the user does not want that, the "ordered=False" option should be used. It will increase
# the speed of the first response.
# The "request_only" option is for internal usage or debugging, to see the actual SQL request.

In [None]:
a = next(res) # returns CGR that matched query and total Tanimoto
a

In [None]:
a[0].structure.clean2d()
a[0].structure

In [None]:
# Tanimoto
a[1]