# Chemical Data Extractor File

In [None]:
from chemdataextractor import Document
from chemdataextractor.doc import Sentence, Paragraph
from bs4 import BeautifulSoup
import os
import time
import re
from xml.etree import cElementTree as ET
import pickle
import inflect #used to convert words from singular to plural and opposite
p = inflect.engine()

## Filter Functions

In [None]:
#various filter functions
#filter_match takes a chemical string and rejects metadata matches and
#strings that contain no letters at all
#patterns are compiled once instead of on every call; raw strings keep the
#regex escapes (\d, \/) from being read as (invalid) string escapes
#two or three capitals followed by digits, optionally "/" and more capitals/digits
#(presumably patent/document identifiers such as "EP1609024 B1" - verify)
_TITLE_REGEX = re.compile(r'^[A-Z]{2,3}\d+ *\/* *[A-Z]{0,3}\d+.*$')
#a string without a single ASCII letter (this also matches the empty string)
_NO_LETTER_REGEX = re.compile(r'^[^A-Za-z]*$')


def filter_match(string, bad_matches):
    """Decide whether an extracted chemical string should be kept.

    A string is rejected when it contains no ASCII letters or when it
    matches the title/identifier pattern. Rejected strings are added to
    ``bad_matches`` so callers can skip them without re-running the regexes.

    Args:
        string: candidate chemical name.
        bad_matches: set that is mutated in place with every rejected string.

    Returns:
        True if the string passed both filters, False if it was rejected.
    """
    rejected = (
        _NO_LETTER_REGEX.match(string) is not None
        or _TITLE_REGEX.match(string) is not None
    )
    if rejected:
        bad_matches.add(string)
    return not rejected

#builds possible strings for removing possible overlap abbreviations
#patents often have spacing issues and this finds all possible spacings for removal
def create_poss_strings(l):
    """Return every way of joining the words in ``l`` with or without spaces.

    Between each pair of neighbouring words the words are either glued
    together or separated by one space, so ``n`` words give up to
    ``2 ** (n - 1)`` strings.

    Args:
        l: sequence of word strings, in order.

    Returns:
        Set of joined strings; an empty set when ``l`` is empty.
    """
    #an empty input has no first word to start from
    if not l:
        return set()

    variants = {l[0]}
    for word in l[1:]:
        #extend every string built so far with both spacings of the next word
        variants = {
            prefix + separator + word
            for prefix in variants
            for separator in ("", " ")
        }
    return variants

#builds the full set of strings to remove for one abbreviation record
#takes the long form of an abbreviation as a list of words (the string as split
#into words by the chemical data extractor abbreviation detection), handles
#parentheses and singular/plural forms, and uses create_poss_strings to get all
#possible spacings. returns a set of possible matches to remove
def add2non_abrev_matches(full_list):
    """Build all spellings of an abbreviation's long form that should be removed.

    The word list is expanded into every spacing variant (via
    create_poss_strings), once as given and once with the last word switched
    between singular and plural. When the list ends with ")" the first entry
    is split on "(" and variants with the parenthesis put back are added too.

    Args:
        full_list: list of word strings making up the long form.

    Returns:
        Set of candidate strings; an empty set when ``full_list`` is empty.
    """
    if not full_list:
        return set()

    #preprocess to get words
    first_words = None
    if full_list[-1] == ")":
        #drop the closing parenthesis and split the opening one out of the first entry
        first_words = full_list[0].split("(")
        rec_list = first_words + list(full_list[1:-1])
    else:
        rec_list = list(full_list)

    non_abrev_matches = create_poss_strings(rec_list)

    try:
        #get singular and plural versions of chemicals: singular_noun returns
        #False when the word is not a recognised plural, so pluralise it then
        number_variant = list(rec_list)
        singular = p.singular_noun(number_variant[-1])
        number_variant[-1] = p.plural(number_variant[-1]) if singular is False else singular
        non_abrev_matches |= create_poss_strings(number_variant)
    except Exception:
        #best effort: inflect can reject unusual tokens; keep the variants built so far
        pass

    #put back in any parenthesis and return
    if first_words is not None:
        head = first_words[0]
        #iterate over a snapshot because the set grows inside the loop
        for candidate in list(non_abrev_matches):
            words = candidate.split()
            #empty/whitespace-only candidates have no first word to compare
            if words and words[0] == head:
                #replace the space after the first word with "(" and close at the end
                non_abrev_matches.add(
                    candidate[:len(head)] + "(" + candidate[len(head) + 1:] + ")"
                )
    return non_abrev_matches

## Chemical Function

In [None]:
#https://stackabuse.com/reading-and-writing-xml-files-in-python/

#tree = ET.parse("chemical-patents-xml/AU-2014316839-B2.xml")

#takes in a file name and returns all the chemicals in a document
#all chemicals and abbreviations are in lower case
def _english_texts(soup, tag_name):
    """Return the text of every ``tag_name`` element whose lang attribute is "EN"."""
    #.get() skips tags without a lang attribute instead of raising KeyError
    return [
        tag.get_text()
        for tag in soup.find_all(tag_name)
        if tag.get('lang') == "EN"
    ]


def get_chemical_set(filename):
    """Extract the set of chemical names mentioned in one patent XML file.

    The English abstract, description and claims elements are run through
    the chemical data extractor. Detected chemical mentions are filtered by
    part-of-speech tags and by filter_match, lower-cased, and also split
    into alphabetic sub-words. Abbreviations tagged as chemical entities are
    added, and the generated long-form spellings of every abbreviation are
    subtracted from the result.

    Args:
        filename: path to the patent XML file.

    Returns:
        Set of lower-case chemical strings (possibly empty).
    """
    chem_matches = set()
    bad_matches = set()
    non_abrev_matches = set()
    #collected while scanning but not part of the returned value
    non_chem_matches = set()

    #http://www2.hawaii.edu/~takebaya/cent110/xml_parse/xml_parse.html
    #get all information from claims, descriptions and abstract
    #also make sure it is just english
    with open(filename, 'r') as xml_file:
        soup = BeautifulSoup(xml_file, 'xml')

    #each list is filled from its own tag list (claims and abstract used to
    #loop over the empty result lists and therefore never collected anything)
    claims = _english_texts(soup, 'claims')
    description = _english_texts(soup, 'description')
    abstract = _english_texts(soup, 'abstract')

    for section in (abstract, description, claims):
        for text in section:
            #put string into package and get back variable holding various information
            doc = Document(text)
            #long forms of the abbreviations found in this text
            non_abbrev = set()
            for abbrev in doc.abbreviation_definitions:
                #abbrev is indexed as (abbreviation words, long-form words, entity tag)
                if abbrev[2] is not None:
                    chem_matches.add(abbrev[0][0].lower())
                else:
                    non_chem_matches.add(abbrev[0][0])
                non_abrev_matches |= add2non_abrev_matches(abbrev[1])
                non_abbrev.add(" ".join(abbrev[1]))

            #now iterate through chemicals
            for span_chem in doc.cems:
                chemical = span_chem.text
                for sentence in Paragraph(chemical).sentences:
                    tagged = sentence.pos_tagged_tokens
                    tags = {value[1] for value in tagged}
                    #remove based on part of speech tags
                    if tags & {'CC', 'IN', 'TO', 'RB', 'DT'}:
                        continue
                    #a past participle is only accepted as the first word
                    if 'VBN' in tags and tagged[0][1] != 'VBN':
                        continue
                    #NOTE(review): non_abbrev holds plain strings while sentence is a
                    #Sentence object - confirm this membership test can ever be true
                    if sentence in non_abbrev:
                        continue
                    if (
                        chemical not in bad_matches
                        and filter_match(chemical, bad_matches)
                        and chemical[0] != "-"
                    ):
                        if '``' in tags:
                            #rebuild the name without the opening-quote tokens
                            chemical = " ".join(
                                word[0] for word in tagged if word[1] != '``'
                            )

                        chem_matches.add(chemical.lower())
                        #get sub chemical components by splitting by non character words
                        for subw in re.split('[^a-zA-Z]', chemical.lower()):
                            if subw:
                                chem_matches.add(subw)

    #NOTE(review): chem_matches is lower-cased but non_abrev_matches keeps the
    #original casing, so capitalised long forms are not removed - confirm intent
    return chem_matches - non_abrev_matches

#Non-exhaustive list of the various filters
#Lower case all letters
#remove - if its the first character (probably gotten by different point as it was cut off)
#POS tagging remove IN and TO. remove Verbs iff not first word
#Only include abbreviations. try to catch all possible permutations for an abbreviation
#Regex to remove records with no characters, and possible names of patent information





## Main driver code

In [None]:
#takes in a folder and gets all chemicals. will write to specified folder
#where each file corresponds to chemicals from a patent
folder="patent_chemicals"
orig_folder="chemical-patents-xml"
no_chems_extracted=set()
for file_idx,filename in enumerate(os.listdir(orig_folder)):
    #only patent XML files are processed
    if not filename.endswith(".xml"):
        continue

    start=time.time()
    chem_matches=get_chemical_set(os.path.join(orig_folder,filename))
    numtime=time.time()-start

    #dont add file if no chemicals matched
    if not chem_matches:
        no_chems_extracted.add(filename)
        continue

    print(file_idx,numtime)
    print(filename,len(chem_matches))
    #created lazily so the folder only appears once there is something to write
    os.makedirs(folder,exist_ok=True)

    #filename[:-4] drops the ".xml" suffix; the set is stored as a pickle
    with open(f"{folder}/{filename[:-4]}_words.txt",'wb') as f:
        pickle.dump(chem_matches, f)

#will also write to file patents that no chemicals were extracted
#sorted so the report is the same from run to run
with open('./no_chem_extracted.txt','wt') as fd:
    for file in sorted(no_chems_extracted):
        fd.write(file)
        fd.write("\n")
        
    

    

## Helper Debug Functions

In [None]:
#bad_matches and non_abrev_matches are local to get_chemical_set, so they only
#exist at notebook level when defined by hand; report "n/a" instead of failing
def _debug_len(name):
    value=globals().get(name)
    return "n/a" if value is None else len(value)

print("Length Chemical Matches",len(chem_matches),"|","Length Bad Matches",_debug_len("bad_matches"),"|","Length Non Abbrev",_debug_len("non_abrev_matches"))

In [None]:
#list every extracted chemical with a 1-based index
for position,chem in enumerate(chem_matches,start=1):
    print(f"Index {position}:",chem)

In [None]:
#just prints chemicals for a document. helpful to understand how chemical data extractor works
with open("chemical-patents-xml/EP-1609024-B1.xml", 'r') as xml_file:
    soup = BeautifulSoup(xml_file, 'xml')

claims_soup=soup.find_all('claims')
description_soup=soup.find_all('description')
abstract_soup=soup.find_all('abstract')

#keep only the English sections; each list is built from its own tag list
#(claims and abstract used to loop over the empty result lists)
claims=[tag.get_text() for tag in claims_soup if tag.get('lang')=="EN"]
description=[tag.get_text() for tag in description_soup if tag.get('lang')=="EN"]
abstract=[tag.get_text() for tag in abstract_soup if tag.get('lang')=="EN"]

if description:
    doc=Document(description[0])
    for c in doc.cems:
        print(c)
else:
    #avoid an IndexError when the file has no English description
    print("No English description found")