### Initialization

In [None]:
####################################################################
## Load packages
####################################################################

import json
import ast
import pickle
import codecs
import pandas as pd
import spacy
import re as reg
import numpy as np

from collections import OrderedDict
from sympy.parsing.sympy_parser import (parse_expr, standard_transformations, implicit_multiplication_application)
from sympy.parsing.latex import parse_latex  
from sympy import *
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span


####################################################################
## Load NLP
####################################################################

# Load the spaCy English model. spacy.load signals a missing model/shortcut
# link with an OSError, so only that case falls back to the explicit model path;
# anything else (including Ctrl-C) is allowed to propagate.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('E:/Users/nasser_qadri/AppData/Local/conda/conda/envs/multivac/Lib/site-packages/en_core_web_sm/en_core_web_sm-2.0.0')
# Raise the per-document character limit so very long articles can be parsed.
nlp.max_length=1000000000



####################################################################
## Global variables, mostly regexes for LaTeX parsing and text cleaning
####################################################################
# Use this to store LaTeX code, keyed by placeholder token name
latexMap = {}

## Use this for debugging to keep track of the different governors
listGovs = []

# LaTeX identifiers (raw strings so the backslashes reach the regex engine
# without relying on Python's handling of invalid escape sequences)
# Block equations: everything between a pair of $$ delimiters (non-greedy)
latexBlock= reg.compile(r'\$\$.*?\$\$')
# Inline equations: from a backslash up to the next "\)" (non-greedy)
latexInline= reg.compile(r'\\.*?\\\)')


### Regexes for cleaning document text
# Bracketed numeric citations such as [1] or [1, 2, 3]
re_citationsNumeric = reg.compile(r'(\[\d+)(,\s*\d+)*]')
re_url= reg.compile(r'((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)')
# Building blocks for author-year in-text citations
author = r"(?:[A-Z][A-Za-z'`-]+)"
etal = r"(?:et al.?)"
additional = r"(?:,? (?:(?:and |& )?" + author + "|" + etal + "))"
year_num = r"(?:19|20)[0-9][0-9]"
page_num = r"(?:, p.? [0-9]+)?"  # Always optional
year = r"(?:, *"+year_num+page_num+r"| *\("+year_num+page_num+r"\))"
# First version assembled from the building blocks; it is superseded by the
# hand-written pattern assigned to the same name below.
re_intextcite = reg.compile(r"(" + author + additional+"*" + year + ")")
#re_intextcite = reg.compile(r"((?:[A-Z][A-Za-z'`-é-]+)(?:,? (?:(?:and |& )?(?:[A-Z][A-Za-z'`-]+)|(?:et al.?)))*(?:,* *(?:19|20)[0-9][0-9](?:, p.? [0-9]+)?| *\\((?:19|20)[0-9][0-9](?:, p.? [0-9]+)?\\)))")
#re_intextcite = reg.compile(r"((?:[A-Za-z][A-Za-z'`-é-]+)(?:,? (?:(?:and |& )?(?:[A-Za-z][A-Za-z'`-é-]+)|(?:et al.?)))*(?:,* *((?:19|20)[0-9][0-9][a-z]*)(, (\d+))*(?:, p.? [0-9]+)?| *\\((?:19|20)[0-9][0-9][a-z](?:, p.? [0-9]+)?\\)))")
re_intextcite = reg.compile(r"((?:[A-Za-z][A-Za-z'`-éü-]+)(?:,? (?:(?:and |& )?(?:[A-Za-z][A-Za-z'`-éü-]+)|(?:et al.?)))*(?:,* *((?:19|20)[0-9][0-9][a-z]*)(\s*&\s*[0-9]*[a-z]*)*(, (\d+))*(?:, p.? [0-9]+)?| *\\((?:19|20)[0-9][0-9][a-z](\s*&)(?:, p.? [0-9]+)?\\)))")

#re_emptyCite = reg.compile(r"\((\s*;*\s*)+\)")
# Parentheses left holding only semicolons/whitespace
re_emptyCite = reg.compile(r"\(([\s]*[;]+[\s]*)+\)")
re_emptyEg = reg.compile(r'\(e.g.[\s*;\s*]*[,]*\s*\)')
re_clickHere = reg.compile(r'Click here[^.]*\.')
re_cid=reg.compile(r"\(cid:\d+\)")
re_email = reg.compile(r"[\w.-]+@[\w.-]+")
re_emptyParens = reg.compile(r"\(\s*\)")
re_emptySee = reg.compile(r"\(see(\s)*\)")
# Funding acknowledgement, matched to the end of the line
re_sponsors = reg.compile(r'(This work was supported).+')
re_arxivHeader = reg.compile(r"(a r X i v).*?(?=[a-zA-Z]{2,})")
re_vixraHeader = reg.compile(r"^(\s?.?\s)+(v i X r a)")
# Words split by a hyphen followed by whitespace ("inter- esting").
# The trailing class is [A-Za-z]; the former [A-za-z] range also matched
# the punctuation characters that sit between 'Z' and 'a' in ASCII.
re_hyphenatedWords = reg.compile(r'\S(?=\S*[-]\s)([a-zA-Z-]+)(\s)[A-Za-z]+')

### Classes and functions

In [None]:
class EntityMatcher(object):
    '''
    spaCy pipeline component that tags occurrences of the given terms
    (used here for equation placeholders) as entities with a single label.
    '''
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        # Tokenize every term once and register them all under one label
        term_docs = list(map(nlp.make_doc, terms))
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *term_docs)

    def __call__(self, doc):
        # Append one entity span per phrase match, then hand the doc on
        for label_id, first, last in self.matcher(doc):
            equation_span = Span(doc, first, last, label=label_id)
            doc.ents = [*doc.ents, equation_span]
        return doc
    

def load_data(picklePath = None):
    """Load the document texts, either from a pickle or from the JSON datastore.

    Parameters:
        picklePath: optional path to a pickle holding the list of document
            texts. When None, the JSON file '../../data/20181212.json' is read.

    Returns:
        (datastore, l_docs): datastore is the parsed JSON object, or None when
        the texts came from a pickle (no JSON is read in that case); l_docs is
        the list of document texts.
    """
    if picklePath is not None:
        # NOTE(review): pickle.load can execute arbitrary code - only point
        # this at pickle files produced by this project.
        with open(picklePath, "rb") as f:
            l_docs = pickle.load(f)
        # No JSON was read on this path; make that explicit instead of
        # failing with an unbound name at the return statement.
        datastore = None
    else:
        ## Read JSON data into the datastore variable
        with open('../../data/20181212.json', 'r') as f:
            datastore = json.load(f)

        ## These were some bad files - nothing substantive in them, or they were retrieved in bad format.
        ## Tolerate their absence so a datastore without them still loads.
        for e in ['1805.10677v1', '0911.5378v1']:
            datastore.pop(e, None)

        ## Extract texts, skipping entries whose text is empty
        l_docs = [value['text'] for value in datastore.values() if value['text']]

    print('# of documents: ', len(l_docs))

    return datastore, l_docs
    
    
def retrieve_JSON_output(l_docs):
    """Build a JSON-serializable dump of the dependency parse of each document.

    This has been replaced with a text output (see create_parse_files) but is
    kept for reference.

    Parameters:
        l_docs: iterable of parsed documents exposing `.sents`; each sentence
            exposes `.text` and iterates over tokens with `text`, `tag_`,
            `dep_` and `head` attributes.

    Returns:
        A list with one dict per document: {'id': <index>, 'sentences': [...]},
        where each sentence is {'sentence': <text>, 'words': [<token dict>]}.
    """
    dependencyDocuments = []

    for di, doc in enumerate(l_docs):
        # A fresh list per document: sharing one list across documents made
        # every document report the sentences of all documents.
        sentences = []
        for sent in doc.sents:
            words = [
                {
                    'tokenText': token.text,
                    'tokenTag': token.tag_,
                    'tokenDep': token.dep_,
                    'tokenHeadText': token.head.text,
                    'tokenHeadTag': token.head.tag_,
                }
                for token in sent
            ]
            sentences.append({'sentence': sent.text, 'words': words})

        dependencyDocuments.append({'id': di, 'sentences': sentences})

    return dependencyDocuments



def getAdjustmentPosition(tokenPosition, adjustmentDictionary):
    '''Look up the position offset to apply to a token in the DEP files.

    adjustmentDictionary maps a sentence position to the cumulative offset in
    force after it. The offset of the largest recorded position strictly below
    tokenPosition is returned; 0 when there is none or when the dictionary
    holds at most one entry.
    '''
    if len(adjustmentDictionary) <= 1:
        return 0

    for recordedPosition in sorted(adjustmentDictionary, reverse=True):
        if recordedPosition < tokenPosition:
            return adjustmentDictionary[recordedPosition]

    return 0



def create_parse_files(doc, docNum, writeFile = True, pathToFolders=''):
    """ Creates parse files and stores them in the folder passed when writeFile=True and pathToFolders is provided
        The following file types are created
            * dep -- for dependencies
            * input -- for POS tagging
            * morph -- lemmatized words

        Parameters:
            doc -- parsed document; its sentences (doc.sents) are processed one by one
            docNum -- document number, zero-padded to four digits in the output file names
            writeFile -- when True, the three files are written under pathToFolders
            pathToFolders -- folder prefix; the '\\dep\\', '\\input\\' and '\\morph\\'
                             sub-paths (backslash separators) are appended to it

        Returns a dict with keys 'depData', 'posData' and 'morData'; each value is a
        list holding one list of per-sentence strings for this document.
    """
    
    d_documentData = {
        'depData' : [],
        'posData' : [],
        'morData' : []
    }
    
    l_depSentences = [] # for dependencies
    l_posSentences = [] # for POS tagging
    l_morSentences = [] # for morphology/lemmatization 

    for sent in list(doc.sents)[0:]:

        # Per-sentence accumulators. The *_latex lists are never appended to
        # in this function, so they contribute nothing to the output.
        l_depTokens_tuples=[]
        l_depTokens=[]
        l_posTokens=[]
        l_morTokens=[]
        l_depTokens_latex_tuples=[]
        l_depTokens_latex=[]
        l_posTokens_latex=[]
        l_morTokens_latex=[]

        # adjustedPosition is the running offset between a token's position in
        # the sentence and its position in the output; adjustmentDictionary
        # records the offset in force after each position that changed it.
        adjustedPosition = 0
        adjustmentDictionary = {0:0}
        for token in sent:
            
            # No-op: tokens governed by an equation placeholder get no special handling
            if 'LateXEquation' in token.head.text:
                pass
            
            # Single-space tokens are dropped from the output, so every later
            # position shifts down by one
            if  (token.text==' '):
                adjustedPosition= adjustedPosition-1
                adjustmentDictionary[(token.i - sent.start +1)]=adjustedPosition
                pass
            
            
            # Equation placeholder: expand it into the tokens produced by latexParsing
            elif 'LateXEquation' in token.text:
                
                l_depTokens_latex_sub_tuples, l_posTokens_latex_sub, l_morTokens_latex_sub = latexParsing(
                    token, token.i - sent.start  + 1 + adjustedPosition)
                
                # Need to adjust position so that it we add all the new tokens, then subtract 1 for LateXEquation##
                
                adjustedPosition = adjustedPosition + (len(l_posTokens_latex_sub) -1)
                adjustmentDictionary[(token.i - sent.start +1)]=adjustedPosition
                

                ## Go backwards and make sure all the previous ones are good
                new_l_depTokens_tuples = []
                
                
                # Dependencies recorded earlier may point at positions after this
                # equation; those positions get the current cumulative offset added
                for depSet in l_depTokens_tuples:
                    t1, t2, t3 = depSet
                    t2=list(t2)
                    t3=list(t3)
                    
                    #current position is the threshold for change
                    if t2[1]> (token.i - sent.start  + 1):
                        t2[1] = t2[1]+adjustedPosition
                    if t3[1]> (token.i - sent.start  + 1):
                        t3[1] = t3[1]+adjustedPosition
                                
                    adjustedTuple = (t1,tuple(t2),tuple(t3))
                    new_l_depTokens_tuples.append(adjustedTuple)
                    
                l_depTokens_tuples = new_l_depTokens_tuples
                
                # Now add to the master list
                l_depTokens_tuples = l_depTokens_tuples + l_depTokens_latex_sub_tuples
                l_posTokens = l_posTokens + l_posTokens_latex_sub
                l_morTokens = l_morTokens + l_morTokens_latex_sub
                
                    
            else:
                ## For rest of sentence 
                ## For dependency trees
                # Positions are 1-based within the sentence
                childTokenPosition = token.i - sent.start  + 1
                headTokenPosition =  token.head.i - sent.start +1 

                # ROOT and punctuation relations are left out of the dep output
                if token.dep_ not in ['ROOT','punct']:
                    
                    
                    
                    # (relation, (head text, adjusted head position), (child text, adjusted child position))
                    l_depTokens_tuples.append( ( token.dep_ , (token.head.text, headTokenPosition + 
                                                               getAdjustmentPosition(headTokenPosition, adjustmentDictionary)), 
                                         (token.text, childTokenPosition + 
                                          getAdjustmentPosition(childTokenPosition, adjustmentDictionary)) ) )
                    
                    
                ## For POS
                l_posTokens.append("{0}_{1}".format(token, token.tag_))  
                #print(token.tag_)

                ## For Morphologies
                l_morTokens.append(token.lemma_)
        
        
        ## Need to Parse out DEPTokens from tuples out to text
        # Output line format: relation(head-position, child-position)
        for depSet in l_depTokens_tuples:
            t1, t2, t3 = depSet
            l_depTokens.append("{0}({1}-{2}, {3}-{4})".format(t1, t2[0],t2[1],t3[0], t3[1]))
 
        for depSet in l_depTokens_latex_tuples:
            t1, t2, t3 = depSet
            l_depTokens_latex.append("{0}({1}-{2}, {3}-{4})".format(t1, t2[0],t2[1],t3[0], t3[1]))
 
        
        
        # One newline-separated string per sentence for each of the three outputs
        l_depSentences.append("\n".join(l_depTokens + l_depTokens_latex))
        l_posSentences.append("\n".join(l_posTokens))
        l_morSentences.append("\n".join(l_morTokens))
        

    d_documentData['depData'].append(l_depSentences)
    d_documentData['posData'].append(l_posSentences)
    d_documentData['morData'].append(l_morSentences)

    if writeFile:
        # Sentences are separated by a blank line in every output file
        with open(pathToFolders+'\\dep\\{0:04d}.dep'.format(docNum), "w", encoding='utf8') as text_file:
            text_file.write('\n\n'.join(l_depSentences))
        with open(pathToFolders+'\\input\\{0:04d}.input'.format(docNum), "w", encoding='utf8') as text_file:
            text_file.write('\n\n'.join(l_posSentences))
        with open(pathToFolders+'\\morph\\{0:04d}.morph'.format(docNum), "w", encoding='utf8') as text_file:
            text_file.write('\n\n'.join(l_morSentences))

        print('Files written to folder:', pathToFolders)
    return d_documentData


def replace_latex(m):
    '''
    Substitution callback: swap a matched LaTeX span for placeholder tokens.

    The cleaned LaTeX is split on ', \\\\' into individual equations; each one is
    stored in latexMap under 'LateXEquation<N>' (N = current size of the map)
    and the space-padded placeholders are returned concatenated. Spans that
    contain 'cid:' are not stored and None is returned for them.
    '''
    latexCode = cleaned_latex(m.group())

    if 'cid:' in latexCode:
        return None

    placeholders = []
    #Sometimes there are multiple equations within each block. Get those here
    for equation in latexCode.split(', \\\\'):
        tokenName = 'LateXEquation' + str(len(latexMap))
        latexMap[tokenName] = equation  ## the map value is the latex code
        placeholders.append(' ' + tokenName + ' ')

    return ''.join(placeholders)
        

def extract_and_replace_latex(doc, docNum):
    '''
    Replace LaTeX in a document with placeholders: block equations first,
    then inline ones. docNum is accepted but not used.
    '''
    for latexPattern in (latexBlock, latexInline):
        doc = latexPattern.sub(replace_latex, doc)
    return doc


def cleaned_latex(s):
    '''Normalize a raw LaTeX snippet taken from the original file format.

    Removes the $$ delimiters and the array/aligned environment wrappers,
    turns '&=&' into '=', replaces the inline delimiters \\( and \\) with plain
    parentheses, and strips surrounding whitespace. Returns the cleaned string.
    '''
    s = s.replace('$$', '')
    # Drop environment wrappers (including the array column specification)
    for wrapper in (r'\\begin{array}{.*?}', r'\\end{array}',
                    r'\\begin{aligned}', r'\\end{aligned}'):
        s = reg.sub(wrapper, '', s)
    s = s.replace('&=&', '=')
    # Backslashes are written explicitly: '\(' is an invalid escape sequence
    s = s.replace('\\(', '(')
    s = s.replace('\\)', ')')

    return s.strip()


def clean_doc(doc):
    '''
    Clean individual documents and remove citations, URLs, emails, other trivial content. Returns cleaned doc

    Steps, in order: regex substitutions for the boilerplate patterns, removal
    of funding acknowledgements found in the second half of the text, and
    repair of words that were split by a hyphen followed by whitespace.
    '''
    # Order matters: e.g. citations are replaced before empty parentheses are removed
    substitutions = (
        (re_cid, ' '),
        (re_citationsNumeric, ' NumericCitation '),
        (re_url, ' '),
        (re_intextcite, ' Citation '),
        (re_emptyCite, ' '),
        (re_emptyEg, ' '),
        (re_clickHere, ' '),
        (re_email, ' '),
        (re_emptyParens, ' '),
        (re_emptySee, ' '),
        (re_arxivHeader, ' '),
        (re_vixraHeader, ' '),
    )
    for pattern, replacement in substitutions:
        doc = pattern.sub(replacement, doc)

    #This work supported by --> matched to the end of its line
    #Only remove this when it appears in the second half of the article.
    #Each match is judged on its own position, so an occurrence in the first
    #half is kept even when another one shows up later.
    halfway = len(doc) / 2
    doc = re_sponsors.sub(
        lambda m: ' ' if m.start() > halfway else m.group(0), doc)

    #Handling hyphens - 2-28-2018
    for m in re_hyphenatedWords.finditer(doc):
        match = m.group(0)

        dehyphenated = match.replace(' ', '')       # "inter-esting"
        mergedWord = dehyphenated.replace('-', '')  # "interesting"

        # Keep the hyphen only when the merged form is unknown but every
        # hyphen-separated part is itself in the vocabulary; otherwise merge.
        keepHyphen = (mergedWord not in nlp.vocab
                      and all(part in nlp.vocab for part in dehyphenated.split('-')))
        doc = doc.replace(match, dehyphenated if keepHyphen else mergedWord)

    return doc
  
    
def find_matches(allDocs,regPat):
    '''
    Debugging aid: for every match of regPat print the document index, the
    match start offset and the document length, then print the total count.
    '''
    total = 0
    for docIndex, text in enumerate(allDocs):
        textLength = len(text)
        for hit in reg.finditer(regPat, text):
            print(docIndex, hit.start(), textLength)
            total += 1
    print('length:', total)


def get_symbol_and_type(s):
    '''
    Split a string such as "Symbol('x')" into its value and a type prefix.

    Returns (text between the first "(" and the first ")", first three
    characters of s).
    '''
    openIdx = s.find("(")
    closeIdx = s.find(")")
    return s[openIdx + 1:closeIdx], s[:3]


def latexParsing(token, tokenPos):
    '''
    LaTeX parsing function for DIM files

    Looks up the LaTeX stored in latexMap for the placeholder token, parses it
    with sympy and converts the srepr() string into dependency/POS/morphology
    entries.

    Parameters:
        token -- placeholder token; token.text is the key into latexMap
        tokenPos -- position of the placeholder in the sentence; equation-internal
                    positions (starting at 1) are shifted by tokenPos - 1

    Returns (l_depTokens, l_posTokens, l_morTokens):
        l_depTokens -- list of (relation, (governor, position), (dependent, position))
        l_posTokens -- list of '<position>\\t<text>_<TAG>' strings
        l_morTokens -- list of symbol values / operator names
    All three are empty when the LaTeX is unknown or cannot be parsed.
    '''
    # Each line gets stored as a separate li in the list
    l_depTokens = []
    l_posTokens = []
    l_morTokens = []
    stringRep = ''

    # Try parsing the latex code as is. Parsing is best-effort, so any ordinary
    # failure (unknown key, parser error) is tolerated - but not
    # KeyboardInterrupt/SystemExit, which a bare except would also swallow.
    try:
        expr = parse_latex(latexMap[token.text])
        stringRep = srepr(expr)
    except Exception:
        # Good chance the problem is the leading and trailing parens - remove them and try again
        try:
            expr = parse_latex(latexMap[token.text].lstrip('(').rstrip(')'))
            stringRep = srepr(expr)
        except Exception:
            pass

    # Nothing parseable: the equation contributes no tokens
    if stringRep == '':
        return l_depTokens, l_posTokens, l_morTokens

    # Problematic artefact from sympy parsing
    stringRep = stringRep.replace(", precision=53", "")

    # Call gov_dep function to get list of dependencies, objects
    l_dependencies = gov_dep(stringRep)

    # Maps position -> token text for every governor/dependent seen
    dictAll = {}

    if len(l_dependencies) > 0:
        # Do the D in DIM
        for head, tail in l_dependencies:
            dictAll[head[1]] = head[0]
            dictAll[tail[1]] = tail[0]

            l_depTokens.append((get_rel(head[0]),
                                (head[0], head[1] + tokenPos - 1),
                                (tail[0], tail[1] + tokenPos - 1)))

            # Keep track of govs for debugging
            listGovs.append(head[0])
            listGovs.append(tail[0])
    else:
        # We're dealing with just a symbol or integer
        listGovs.append(stringRep)
        dictAll[1] = stringRep

    # Do the IM in DIM
    for key, val in dictAll.items():
        if '(' in val:
            # Leaf value such as Symbol('x'): tag with the type prefix
            symbol, symbolType = get_symbol_and_type(val)
            l_posTokens.append('{}\t{}_{}'.format(key, symbol, symbolType[0:5].upper()))
            l_morTokens.append(symbol)
        else:
            # Operator: the POS tag is the upper-cased relation name, so the
            # categories stay in step with get_rel
            l_posTokens.append('{}\t{}_{}'.format(key, val, get_rel(val).upper()))
            l_morTokens.append(val)

    return l_depTokens, l_posTokens, l_morTokens


def find_parens(s):
    '''
    Map the index of every "(" in s to the index of its matching ")".

    Returns an OrderedDict sorted by the opening index.
    Raises IndexError when the parentheses are unbalanced; the message names
    the index of the offending parenthesis.
    '''
    toret = OrderedDict()
    pstack = []  # indices of "(" still waiting for their ")"

    for i, c in enumerate(s):
        if c == '(':
            pstack.append(i)
        elif c == ')':
            if len(pstack) == 0:
                # A ")" with nothing open before it
                raise IndexError(
                    "No matching opening parens for closing parens at: {} for string: {}".format(i, s))
            toret[pstack.pop()] = i

    if len(pstack) > 0:
        # Report where the unmatched "(" is, not where the string ends
        raise IndexError(
            "No matching closing parens for opening parens at: {} for string: {}".format(pstack[-1], s))

    return OrderedDict(sorted(toret.items()))

def gov_dep(s, i=1):
    '''
    Turn a nested call-style string (e.g. "Add(Symbol('x'), Integer(1))")
    into a flat list of (governor, dependent) pairs.

    s -- the string to analyse; it is ignored (empty result) when it contains
         no "(" or starts with a quote
    i -- number given to the governor of this call; dependents are numbered
         relative to it

    Returns a list of ((governor text, number), (dependent text, number))
    tuples, including the pairs found by recursing into nested arguments.
    find_parens raises IndexError when s has unbalanced parentheses.
    '''
    results = []
    
    # ignore inputs that don't match the formula syntax - there are some "true" values here, and we don't want to 
    # recurse into our "Symbol('x')" etc. tokens
    if "(" in s and not s.startswith("'"):
        # Get an OrderedDict of all our parentheses pairs
        parens = find_parens(s)

        # get our parent/governor token
        # (the first "(" in the string and its matching ")")
        p1 = next(iter(parens))
        p2 = parens.pop(p1)
        
        # if it's "Function" we need to include the next parenthetical(s) in the title
        # and skip it/them so we don't try to interpret the contents as dependencies
        if s[:p1] == "Function":
            gov = (s[:p2+1], i)
            
            # discard pairs until the first one that opens after the title's ")"
            while True:
                p1 = next(iter(parens))
                p_2 = parens.pop(p1)
                
                if p1 > p2:
                    p2 = p_2
                    break
        else:
            
            gov = (s[:p1], i)
            
        # Once we've got our parent/governor, grab the children/dependents
        # and add those dependencies to our list
        while parens:
            # get next token as a child/dependent
            p3 = next(iter(parens))
            p4 = parens.pop(p3)
            
            # if there's a ', ' preceding us we need to index from that, not the 
            # parent parenthesis mark
            if ", " in s[:p3]:
                dep_p1 = s[:p3].rfind(", ")+2
            else:
                dep_p1 = p1+1

            # Again, if this is 'Function' include the next parenthetical portion
            if s[dep_p1:p3] == "Function":
                dep = (s[dep_p1:p4+1], i+1)
                
                while True:
                    p3 = next(iter(parens))
                    p_4 = parens.pop(p3)

                    if p3 > p4:
                        p4 = p_4
                        break
            else:                  
                ##NQ - getting value of certain things
                # Symbol/Integer/Float keep their argument, e.g. "Symbol('x')";
                # any other dependent is recorded by name only
                if s[dep_p1:p3] in ['Symbol', 'Integer', 'Float']:
                    dep = (s[dep_p1:p4+1], i+1)
                else:
                    dep = (s[dep_p1:p3], i+1)
                    

            # add dependency pair to results!
            results.append((gov, dep))

            # if there are more tokens
            if len(parens) > 0:
                # and the next is a child/dependent of the current child/dependent token
                if next(iter(parens)) < p4:
                    # recurse
                    results += gov_dep(s[dep_p1:p4+1], i+1)
                    
                    # and then clean up the parentheticals we just covered recursively 
                    # so we don't try to parse them again at this level
                    for p in [key for key in parens if key < p4]:
                        del parens[p]
            
            # keep track of our token counts so we're numbering things right
            # NOTE(review): results is cumulative, so this adds the total number
            # of pairs found so far on every pass - confirm the resulting
            # numbering (it can skip values) is what the DEP files expect
            i += len(results)
    
    return results


def get_rel(gov):
    """Return the relation name for a governor token.

    Not yet exhaustive of options - the relation is needed for the final
    dependency format: relation(gov-#, dep-#). Governors outside the known
    groups fall back to "transform".
    """
    relationTable = (
        ("compare", ('Equality', 'StrictGreaterThan', 'StrictLessThan', 'Approx', 'approx')),
        ("combine", ('Mul', 'Add', 'Pow')),
        ("function", ('Function',)),
    )
    for relation, governors in relationTable:
        if gov in governors:
            return relation
    return "transform"

In [None]:
# thisDocumentData = create_parse_files(nlp(allDocs2[1540]), 9999, True, '..\\..\\data\\processed with equations2-28\\')

###  Load documents

In [None]:
# No pickle path is passed, so load_data reads the JSON datastore:
# jsonObj is the parsed JSON object and allDocs the list of document texts.
jsonObj, allDocs = load_data()

### Clean documents

In [None]:
# gc is used below; import it here so this cell does not depend on a later
# cell having been run first.
import gc

# Reuse the cached cleaned documents when available; otherwise clean every
# document and cache the result.
try:
    # NOTE(review): pickle.load can execute arbitrary code - only load cache
    # files produced by this notebook.
    with open('allDocsClean.pkl', "rb") as f:
        allDocsClean = pickle.load(f)
    print('Loaded pickle!')
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: rebuild it
    print('Starting from scratch')
    allDocsClean= []
    for i, doc in enumerate(allDocs):
        # Progress output every 10 documents
        if i%10==0:
            print(i)
        # Periodically release memory held by intermediate strings
        if i%100==0:
            gc.collect()
            print(i)
        allDocsClean.append(clean_doc(doc))
        
    with open('allDocsClean.pkl', 'wb') as f:
        pickle.dump(allDocsClean, f)


### Extract LaTeX, store it in latexMap (dictionary), and replace it with LateXEquation## placeholders

In [None]:
# Replace the LaTeX in every cleaned document with placeholder tokens;
# the extracted LaTeX is collected in latexMap as a side effect.
allDocs2 = [extract_and_replace_latex(doc, docNum) for docNum, doc in enumerate(allDocsClean)]

### spaCy NLP pipeline setup, and NLPify

In [None]:
# terms = tuple(latexMap.keys())
# try:
#     entity_matcher = EntityMatcher(nlp, terms, 'EQUATION')
#     nlp.add_pipe(entity_matcher, before='ner')
#     #print(nlp.pipe_names) 
# except:
#     pass


#### Add LateXEquation placeholders to the NLP vocabulary
# Index nlp.vocab with every placeholder key; the value bound to `lex` is not
# used afterwards. Presumably the lookup itself registers the lexeme - confirm
# against the spaCy Vocab documentation.
for key in latexMap.keys():
    lex = nlp.vocab[key]

#### If I want to remove the pipeline for any reason...
#nlp.remove_pipe('entity_matcher')


#### NLPify documents
# allDocs3= []
# for i, doc in enumerate(allDocs2[1500:1600]):
#     print(i)
#     allDocs3.append(nlp(doc))

### Create DIM files

In [None]:
# documentData = create_parse_files(allDocs3, False, '..\\..\\data\\processed with equations2-27_3\\')

import gc
# One entry per document: the dict returned by create_parse_files
documentsData = []
for i, doc in enumerate(allDocs2):
    # Run the garbage collector every 100 documents
    if i%100==0:
        gc.collect()
    # Send things in batches
    nlpifiedDoc = nlp(doc)
    # writeFile=True: the dep/input/morph files for document i are written under the given folder
    thisDocumentData = create_parse_files(nlpifiedDoc, i, True, r'W:\DARPA_ASKE\CONSULTING\Nasser\Processed2-28')
    documentsData.append(thisDocumentData)

#  Keep this to fix hyphenated words -- DO NOT RUN

In [None]:
# def find_matches2(doc, regpat):
#     for m in reg.finditer(regpat, doc):
#         print( m.group(0))

In [None]:
# regPat = '\S(?=\S*[-]\s)([a-zA-Z-]+)(\s)[A-za-z]+'
# find_matches2(allDocs[0], regPat)


In [None]:
# tempDoc = allDocs[0]

# for m in reg.finditer(regPat, tempDoc):
#     match=m.group(0)
    
    
#     # Check to see if these are all words individually 
#     allWords = True
#     for i in match.replace(' ', '').split('-'):
#         allWords = allWords and (i in nlp.vocab)
    
#     if allWords: ##all words are individually words - now see if they combine to make a word
#         mergedWord = match.replace(' ', '').replace('-','')
#         print('merged word:', mergedWord)
#         if mergedWord in nlp.vocab: 
#             doc.replace(match, mergedWord)
#         else:
            
#             if mergedWord in nlp.vocab:
#                 doc.replace(match, mergedWord)
#     else:
#         print('replacing with:', ''.join(match.replace(' ', '').split('-')))
#         doc.replace(match, ''.join(match.replace(' ', '').split('-')))
        
        
    
    
#     #print(match, allWords)
    

In [None]:
# tempDoc = allDocs[0]

# for m in reg.finditer(regPat, tempDoc):
#     match=m.group(0)
    
#     mergedWord = match.replace(' ', '').replace('-','')
#     if mergedWord in nlp.vocab: 
#         doc.replace(match, mergedWord)
#     else:
#         allWords = True
#         for i in match.replace(' ', '').split('-'):
#             allWords = allWords and (i in nlp.vocab)
#         if allWords:
#             doc.replace(match,(match.replace(' ', '')) )
#         else:
#             print(mergedWord)
#             doc.replace(match, mergedWord)
            
        
