## Get CAS Numbers
Get the CAS numbers associated with each molecule in the database. 

In [1]:
from moldesign.store.mongo import MoleculePropertyDB
from typing import List
from tqdm import tqdm
from time import sleep
import requests

[19:37:33] Enabling RDKit 2019.09.3 jupyter extensions


Configuration

In [2]:
# When True, records that an earlier run marked as 'none' are queried again
retry = False
# Pause, in seconds, inserted between requests to the CAS service
spacing = 0.1

## Find the molecules without a CAS identifier
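By default this selects records whose CAS identifier is unset; with `retry` enabled it also selects records marked `'none'` because an earlier lookup found no match.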

In [3]:
# Database handle built from connection info (port 27855); later cells read and update `mongo.collection`
mongo = MoleculePropertyDB.from_connection_info(port=27855)

In [4]:
# Default selection: records that have no CAS identifier field at all
query = {'identifier.cas': {'$exists': False}}
# With `retry` on, widen the selection to records carrying the 'none' marker
query = {'$or': [{'identifier.cas': 'none'}, query]} if retry else query

In [5]:
# Count the matching records up front; the same total sizes the progress bar of the update loop
count = mongo.collection.count_documents(query)
print(f'Found {count} matching records' )

Found 22682 matching records


In [6]:
# Fetch only the `identifier` field of each matching record (`_id` is returned by default as well).
# NOTE: the update loop iterates this cursor to exhaustion; re-run this cell before re-running the loop.
cursor = mongo.collection.find(query, projection=['identifier'])

## Query CAS with the InChI
Search CAS Common Chemistry by InChI to see whether the molecule already has a registry number.

In [10]:
# NOTE(review): `find_cas_records` is defined in the cell below (execution count 7, this cell is 10),
# so on a fresh kernel this cell only works after that definition has been run.
find_cas_records('InChI=1S/C2H3F3O/c3-2(4,5)1-6/h6H,1H2')

['75-89-8']

In [7]:
def find_cas_records(inchi: str) -> List[str]:
    """Get a list of CAS records for a certain InChI

    Sends the InChI to the search endpoint of CAS Common Chemistry and
    collects the registry number (``rn``) of every result.

    Args:
        inchi: InChI string of the molecule to look up
    Returns:
        List of CAS registry numbers; empty if the search returns no results
    Raises:
        requests.HTTPError: If the service answers with an error status
        requests.Timeout: If the service does not answer in time
    """
    result = requests.get(
        "https://commonchemistry.cas.org/api/search",
        params={"q": inchi},
        timeout=60,  # Without a timeout a stalled connection would block forever
    )
    # Fail loudly on 4xx/5xx rather than trying to parse an error payload
    result.raise_for_status()
    return [x["rn"] for x in result.json()['results']]
# Smoke test: this performs a live request, so it needs network access to pass
assert find_cas_records('InChI=1S/C2H3F3O/c3-2(4,5)1-6/h6H,1H2') == ['75-89-8']

In [12]:
# NOTE(review): `get_cas_data` is defined in the cell below (execution count 8, this cell is 12),
# so on a fresh kernel this cell only works after that definition has been run.
get_cas_data('75-89-8')

{'uri': 'substance/pt/75898',
 'rn': '75-89-8',
 'name': '2,2,2-Trifluoroethanol',
 'image': '<svg width="107.52" viewBox="0 0 3584 2142" text-rendering="auto" stroke-width="1" stroke-opacity="1" stroke-miterlimit="10" stroke-linejoin="miter" stroke-linecap="square" stroke-dashoffset="0" stroke-dasharray="none" stroke="black" shape-rendering="auto" image-rendering="auto" height="64.26" font-weight="normal" font-style="normal" font-size="12" font-family="\'Dialog\'" fill-opacity="1" fill="black" color-rendering="auto" color-interpolation="auto" xmlns="http://www.w3.org/2000/svg"><g><g stroke="white" fill="white"><rect y="0" x="0" width="3584" stroke="none" height="2142"/></g><g transform="translate(32866,32758)" text-rendering="geometricPrecision" stroke-width="44" stroke-linejoin="round" stroke-linecap="round"><line y2="-30992" y1="-31514" x2="-30798" x1="-31705" fill="none"/><line y2="-31408" y1="-30992" x2="-30079" x1="-30798" fill="none"/><line y2="-31098" y1="-31514" x2="-32424" x1

In [8]:
def get_cas_data(rn: str) -> dict:
    """Get the data about a CAS record

    Sends the registry number to the detail endpoint of CAS Common Chemistry
    and returns the decoded JSON body.

    Args:
        rn: CAS number of molecule of interest
    Returns:
        Record held by CAS
    Raises:
        requests.HTTPError: If the service answers with an error status
        requests.Timeout: If the service does not answer in time
    """
    result = requests.get(
        "https://commonchemistry.cas.org/api/detail",
        params={"cas_rn": rn},
        timeout=60,  # Without a timeout a stalled connection would block forever
    )
    # Fail loudly on 4xx/5xx rather than returning an error payload as if it were a record
    result.raise_for_status()
    return result.json()

In [9]:
# Look up a CAS number for every record from the cursor and write the outcome back to the database
matched = 0
cursor_bar = tqdm(cursor, total=count, desc='matched: 0')
for record in cursor_bar:
    # Throttle requests to the CAS service
    sleep(spacing)

    # Find all CAS numbers that match
    cas = find_cas_records(record['identifier']['inchi'])

    my_cas = None
    if len(cas) == 1:
        # Easy case: a single, unambiguous hit
        my_cas = cas[0]
    elif len(cas) > 1:
        # Several hits: take the first one that is not a polymer
        for rn in cas:
            # Detail lookups go to the same service, so space them out as well
            sleep(spacing)
            data = get_cas_data(rn)
            if data['molecularMass'] != "":  # Polymers do not have a molecular mass
                my_cas = rn
                break

    # Store the match, or the 'none' marker so the record is only revisited when `retry` is set
    found = my_cas is not None
    mongo.collection.update_one(
        {'_id': record['_id']},
        {'$set': {'identifier.cas': my_cas if found else 'none'}},
    )
    if found:
        matched += 1
        cursor_bar.set_description(f'matched: {matched}')

matched: 1571: 100%|██████████| 22682/22682 [1:21:31<00:00,  4.64it/s]
