# This notebook was simply used to test some syntax and functions and understand how things worked before applying them to the final model, it is simply scattered notes for personal use in other code. See other notebooks in the repo for proper use of code and documentation

In [59]:
import jcamp
import glob
import os
import numpy as np
import pubchempy as pcp
import pandas as pd



In [2]:
#path that contains the IR files
path = os.path.join(os.getcwd(), 'scrapedFTIR', 'ir','goodUnits')
    
extension = 'jdx'
all_files = glob.glob(os.path.join(path, "*.jdx"))

parsedJcamps = []
for file in all_files:
    parsedJcamps.append(jcamp.jcamp_readfile(file))

In [3]:
test = parsedJcamps[1237]
test

{'title': 'Benzyl methacrylate',
 'jcamp-dx': 4.24,
 'data type': 'INFRARED SPECTRUM',
 'origin': 'Sadtler Research Labs Under US-EPA Contract',
 'owner': 'NIST Standard Reference Data Program\nCollection (C) 2018 copyright by the U.S. Secretary of Commerce\non behalf of the United States of America. All rights reserved.',
 'cas registry no': '2495-37-6',
 'molform': 'C 11 H 12 O 2',
 '$nist source': 'MSDC-IR',
 'state': 'gas',
 'xunits': '1/CM',
 'yunits': 'ABSORBANCE',
 'xfactor': 1.0,
 'yfactor': 0.000138599,
 'deltax': 4.0,
 'firstx': 450.0,
 'lastx': 3966.0,
 'firsty': 0.0,
 'maxx': 3966,
 'minx': 450,
 'maxy': 1.38599,
 'miny': 0,
 'npoints': 880,
 'xydata': '(X++(Y..Y))',
 'end': '',
 'x': array([ 450.,  454.,  458.,  462.,  466.,  470.,  474.,  478.,  482.,
         486.,  490.,  494.,  498.,  502.,  506.,  510.,  514.,  518.,
         522.,  526.,  530.,  534.,  538.,  542.,  546.,  550.,  554.,
         558.,  562.,  566.,  570.,  574.,  578.,  582.,  586.,  590.,
         59

In [4]:
#making sure the x and Y arrays are of the same length
badApples = 0
for i in parsedJcamps:
    if len(i['x']) != len(i['y']):
        badApples += 1

print(badApples)

0


In [5]:
#figure out how many unique set lengths their are
uniqueLengths = set()
for i in range(0,len(parsedJcamps)):
    uniqueLengths.add(len(parsedJcamps[i]['x']))

print(len(uniqueLengths))

7


In [6]:
x = test['x']
y = test['y']
print(min(x))
print(len(x))
print(len(y))
tt = [test['x'],test['y']]
tt

450.0
880
880


[array([ 450.,  454.,  458.,  462.,  466.,  470.,  474.,  478.,  482.,
         486.,  490.,  494.,  498.,  502.,  506.,  510.,  514.,  518.,
         522.,  526.,  530.,  534.,  538.,  542.,  546.,  550.,  554.,
         558.,  562.,  566.,  570.,  574.,  578.,  582.,  586.,  590.,
         594.,  598.,  602.,  606.,  610.,  614.,  618.,  622.,  626.,
         630.,  634.,  638.,  642.,  646.,  650.,  654.,  658.,  662.,
         666.,  670.,  674.,  678.,  682.,  686.,  690.,  694.,  698.,
         702.,  706.,  710.,  714.,  718.,  722.,  726.,  730.,  734.,
         738.,  742.,  746.,  750.,  754.,  758.,  762.,  766.,  770.,
         774.,  778.,  782.,  786.,  790.,  794.,  798.,  802.,  806.,
         810.,  814.,  818.,  822.,  826.,  830.,  834.,  838.,  842.,
         846.,  850.,  854.,  858.,  862.,  866.,  870.,  874.,  878.,
         882.,  886.,  890.,  894.,  898.,  902.,  906.,  910.,  914.,
         918.,  922.,  926.,  930.,  934.,  938.,  942.,  946.,  950.,
      

In [7]:
#figure out what the widest common wavenumber range is
# assuming list of dictionaries is called 'list_of_dicts'
min_val = 0
max_val = 50000

for d in parsedJcamps:
    x = d['x']
    local_min = min(x)
    local_max = max(x)
    
    if local_min > max_val or local_max < min_val:
        # if there is no overlap between the local range and the current range, we can't find a common range
        print("There is no common range for all dictionaries.")
        break
    else:
        # otherwise, update the current minimum and maximum values
        min_val = max(min_val, local_min)
        max_val = min(max_val, local_max)

if min_val <= max_val:
    # if there is a common range, print it
    print("The common range is:", min_val, "-", max_val)


The common range is: 551.688 - 3846.0


In [8]:
# define the new x values you want to resample to
new_x = np.linspace(555, 3846, num=880)

# use numpy's interp function to interpolate the y values based on the new x values
new_y = np.interp(new_x, x, y)

# print the new x and y arrays
print(new_x)
print(new_y)


[ 555.          558.7440273   562.48805461  566.23208191  569.97610922
  573.72013652  577.46416382  581.20819113  584.95221843  588.69624573
  592.44027304  596.18430034  599.92832765  603.67235495  607.41638225
  611.16040956  614.90443686  618.64846416  622.39249147  626.13651877
  629.88054608  633.62457338  637.36860068  641.11262799  644.85665529
  648.60068259  652.3447099   656.0887372   659.83276451  663.57679181
  667.32081911  671.06484642  674.80887372  678.55290102  682.29692833
  686.04095563  689.78498294  693.52901024  697.27303754  701.01706485
  704.76109215  708.50511945  712.24914676  715.99317406  719.73720137
  723.48122867  727.22525597  730.96928328  734.71331058  738.45733788
  742.20136519  745.94539249  749.6894198   753.4334471   757.1774744
  760.92150171  764.66552901  768.40955631  772.15358362  775.89761092
  779.64163823  783.38566553  787.12969283  790.87372014  794.61774744
  798.36177474  802.10580205  805.84982935  809.59385666  813.33788396
  817.0

In [34]:
tc = test['cas registry no']
tc

'2495-37-6'

In [20]:
import pubchempy as pcp

# Search for Aspirin by name
result = pcp.get_compounds(test['title'], 'name')



In [26]:
result[0].canonical_smiles

'CC(=C)C(=O)OCC1=CC=CC=C1'

In [45]:
result = pcp.get_compounds('AOJOEFVRHOZDFN-UHFFFAOYSA-N','inchikey')
result[0].canonical_smiles

'CC(=C)C(=O)OCC1=CC=CC=C1'

In [47]:
result = pcp.get_compounds(tc,'name')
result[0].canonical_smiles

'CC(=C)C(=O)OCC1=CC=CC=C1'

In [55]:
import requests
from bs4 import BeautifulSoup

def get_inchi_key(cas):
    # Construct the URL for the NIST website search page
    url = f"https://webbook.nist.gov/cgi/cbook.cgi?Name={cas}&Units=SI&Mask=20"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the span tag with the class "inchi-text"
    inchi_span = soup.find("span", class_="inchi-text")

    # Extract the InChI key from the span tag
    inchi_key = inchi_span.text.strip()

    # Return the InChI key
    return inchi_key



In [57]:
inchi = get_inchi_key(tc)

In [58]:
inchi

'AOJOEFVRHOZDFN-UHFFFAOYSA-N'

In [61]:
df = pd.read_csv('FTIRdataSet.csv')

In [71]:
mask = df.smiles.isna()

In [82]:
df_missing_smiles = df[mask]

In [73]:
df_missing_smiles.sample(5)

Unnamed: 0,jcamp,x,y,x_,y_,smiles
2800,"{'title': '3-(2-Hydroxyphenyl)propionic acid',...",[ 549.766 551.695 553.624 ......,[0.000817 0.0008007 0.0007786 ... 0.000144 0...,[ 555. 558.7440273 562.48805461 56...,[7.43932504e-04 7.89950620e-04 7.75249153e-04 ...,
3079,"{'title': '3-Amino-2-nitrobenzotrifluoride', '...",[ 549.759 551.688 553.617 ... 3996.842 3998....,[0.014079 0.014109 0.013429 ... 0.000812...,[ 555. 558.7440273 562.48805461 56...,[1.27622348e-02 1.31431929e-02 1.34201246e-02 ...,
2899,"{'title': 'Ketone, 4-methyl-2-pyridyl 2-thieny...",[ 450. 454. 458. 462. 466. 470. 474. 47...,[0. 0.00689924 0.01155101 0.01322355 0...,[ 555. 558.7440273 562.48805461 56...,[0.01186461 0.01300708 0.01591882 0.02057572 0...,
1601,"{'title': ""Benzophenone, 5,5'-dimethyl-2-hydro...",[ 450. 454. 458. 462. 466. 470. 474. 47...,[7.37388600e-03 1.39185800e-03 6.01164200e-03 ...,[ 555. 558.7440273 562.48805461 56...,[1.90418020e-02 1.58450566e-02 1.20254823e-02 ...,
7686,"{'title': 'Spiro[5.7]tridec-1-ene-3-one', 'jca...",[ 450. 454. 458. 462. 466. 470. 474. 47...,[3.33172980e-02 1.80689050e-02 8.28525400e-03 ...,[ 555. 558.7440273 562.48805461 56...,[4.36297950e-03 4.37832145e-03 5.28214273e-03 ...,


In [77]:
moreSmiles = []
for i in df_missing_smiles.jcamp:
    try:
        result = pcp.get_compounds(i['cas registry no'],'name')
        moreSmiles.append(result[0].canonical_smiles)
    except:
        moreSmiles.append('')

In [78]:
moreSmiles.count('')

519

In [79]:
pip install cirpy

Collecting cirpy
  Downloading CIRpy-1.0.2.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: cirpy
  Building wheel for cirpy (setup.py) ... [?25ldone
[?25h  Created wheel for cirpy: filename=CIRpy-1.0.2-py3-none-any.whl size=7263 sha256=27f726252e28ba909815b054e9e64fb6ce09ee10d37ab20c04e6b6e2cbb1b932
  Stored in directory: /Users/ijvaillant/Library/Caches/pip/wheels/50/3a/92/7532bfd6c46528f53a71dc0419e5cc41aea798bfdc126f8da9
Successfully built cirpy
Installing collected packages: cirpy
Successfully installed cirpy-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [80]:
import cirpy

519