In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import scispacy
from spacy.matcher import Matcher

In [3]:
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token
from spacy.tokens.span import Span
from spacy.lang.en import English

In [33]:
nah_nlp = spacy.load("en_nah_sm")
nah_nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f442bce1a10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f442bc8b0b0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f442bcffe60>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f442bd05eb0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f442be438a0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7f442bcab050>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f442be439f0>)]

In [29]:
# Common Patterns Matching
NOUN_PHRASE = [ {'POS': 'DET', 'OP': '?'}, {'POS': {'IN': ['ADJ', 'NOUN', 'PRON', 'PROPN', 'NUM', 'CCONJ', 'PART']}, 'OP': '+'},
                {'POS': 'VERB', 'DEP': {'IN': ['amod', 'acl']}, 'OP': '*'}, {'POS': {'IN': ['ADJ', 'NOUN', 'PRON', 'PROPN', 'PART', 'NUM']}, 'OP': '*'} ]

NO_CONJ_NOUN_PHRASE = [ {'POS': 'DET', 'OP': '?'}, {'POS': {'IN': ['ADJ', 'NOUN', 'PRON', 'PROPN', 'NUM', 'PART']}, 'OP': '+'},
              {'POS': 'VERB', 'DEP': {'IN': ['amod', 'acl']}, 'OP': '*'}, {'POS': {'IN': ['ADJ', 'NOUN', 'PRON', 'PROPN', 'PART', 'NUM']}, 'OP': '*'} ]

In [4]:
# preprocessing sentence
def process_sentence(sentence:str)->str:
  """
  Pre-processing a sentence before extract its clauses or any after-processing.
  """
  sentence = re.sub(r"[,]*[ ]*but not only[,]*", " ", sentence)
  sentence = re.sub(r"[,]*[ ]*[Nn]ot only", " ", sentence)
  sentence = re.sub(r"[,]*[ ]but also", " and ", sentence)
  sentence = re.sub(r"[Bb]oth", " ", sentence)
  sentence = re.sub(r" +- +", r"-", sentence)
  sentence = re.sub(r" +", r" ", sentence)
  return sentence.strip()
sentence = "Increased levels    of dopamine can lead to a tic syndrome, while reduced levels of serotonin and increased dopamine can both lead to obsessive compulsive disorder."
process_sentence(sentence=sentence)

'Increased levels of dopamine can lead to a tic syndrome, while reduced levels of serotonin and increased dopamine can lead to obsessive compulsive disorder.'

## Splitting a sentence into clauses

In [53]:
# Noun chunks processing
def has_negative_conj(noun:Token)->bool:
  """
  Check if a noun has the negative conjunction
  Example: A, but not B, is somthing.
  """
  for child in noun.children:
    if child.dep_ == 'cc' and child.text in ['not']:
      return True
  return False

def is_negative_nmod(noun:Token)->bool:
  """
  Check if a noun has the negative modify
  Example: A, without B, is something.
  """
  for child in noun.children:
    if child.dep_ == 'case' and child.text in ['without', 'no', 'neither', 'except']:
      return True
    if child.dep_ in ['det', 'neg'] and child.text in ['any', 'no']:
      return True
  return False

def noun_chunks(noun:Token, chunks_for:str='subj')->str:
  """
  Get chunks from a given noun.
  """
  from_noun_dep = ['compound', 'amod', 'case', 'mark', 'dep', 'nummod']
  if chunks_for == 'obj':
    conj_dep = ['conj', 'cc']
    from_noun_dep += ['punct', 'appos']
  else:
    conj_dep = []

  nmod_dep = ['nmod', 'nmod:poss']
  children = [child for child in noun.children if child.dep_ in from_noun_dep + ([] if (has_negative_conj(noun=noun)) else conj_dep)]
  children += [child for child in noun.children if child.dep_ in nmod_dep and not is_negative_nmod(child)]

  idx = 0
  while idx < len(children):
    child = children[idx]
    tmp_children = [sub_child for sub_child in child.children if sub_child.dep_ in from_noun_dep + ['mwe'] + ([] if (has_negative_conj(noun=noun)) else conj_dep)]
    tmp_children += [sub_child for sub_child in child.children if sub_child.dep_ in nmod_dep and not is_negative_nmod(sub_child)]
    if len(tmp_children) > 0:
      children += tmp_children
    idx += 1
  return ' '.join([element.text for element in sorted(children + [noun], key=lambda child: child.idx)]).replace(" 's", "'s")

def is_meaning_chunk(doc:Doc, chunk:str):
  """
  Check if a given chunk is a meaning chunk in Doc.
  """
  for ent in doc.ents:
    if ent.text in chunk:
      return True
  return False

In [54]:
# Subject processing
def find_all_subj(nlp:English, sentence:str)->list:
  """
  Find all the available subjects in a given sentence.
  """
  doc = nlp(sentence)
  all_chunks_subj = []
  appos_list = []
  for token in doc:
    if token.dep_ in ['nsubj', 'nsubjpass']:
      if not has_negative_conj(noun=token):
        can_have_conj = ['conj']
      else:
        can_have_conj = []

      chunk_subj = noun_chunks(token)
      if is_meaning_chunk(doc=doc, chunk=chunk_subj):
        all_chunks_subj.append(chunk_subj)
      for child in token.children:
        if child.dep_ in ['nsubj', 'appos', 'acl:relcl'] + can_have_conj:
          child_subj = noun_chunks(child)
          if is_meaning_chunk(doc=doc, chunk=child_subj):
            all_chunks_subj.append(child_subj)

        elif child.dep_ in ['nmod']:
          if not has_negative_conj(noun=child):
            can_have_conj = ['conj']
          else:
            can_have_conj = []
          for sub_child in child.children:
            if sub_child.dep_ in ['nsubj', 'appos', 'acl:relcl'] + can_have_conj:
              sub_child_subj = noun_chunks(sub_child)
              if is_meaning_chunk(doc=doc, chunk=sub_child_subj):
                  all_chunks_subj.append(sub_child_subj)

  return all_chunks_subj

In [80]:
# clause extraction
def is_negative(verb:Token)->bool:
  """
  Check if a verb is negative.
  """
  if verb.lemma_ in ["fail", "unable"]:
    return True
  for child in verb.children:
    if child.dep_ in ['neg'] or child.lemma_ in ['neither']:
      return True
    for sub_child in child.children:
      if sub_child.dep_ in ['neg']:
        return True
  return False

def is_important(verb:Token)->bool:
  """
  Check if a verb is important
  """
#  if verb.lemma_ not in ['measure', 'characterize', 'report', 'suggest', 'justify', 'demonstrate', 'correspond',
#                         'examine', 'observe']:
  return True

def get_verb_phrase(verb:Token):
  root = verb
  if verb.dep_ not in ['cop']:
    from_verb_dep = ['cop', 'auxpass']
    children = sorted([child for child in verb.children if (child.dep_ in from_verb_dep)] + [verb], key=lambda token: token.idx)
    if len(children) == 1:
      children_string = ' '.join([child.text if child != verb else child.lemma_ for child in children])
    else:
      children_string = ' '.join([child.text if child.dep_ not in ['cop', 'auxpass'] else child.lemma_ for child in children])
  else:
    root = verb.head
    from_noun_dep = ['amod', 'det', 'compound', 'cop']
    sorted_list_from_root = sorted([c_root for c_root in root.children if c_root.dep_ in from_noun_dep] + [root], key=lambda token: token.idx)
    children_string = ' '.join([child.text if child != verb else child.lemma_ for child in sorted_list_from_root])

  for child in root.children:
    if child.dep_ in ['dobj', 'nmod', 'dep', 'xcomp', 'acl', 'attr']: # acl: used... ; amod: (agent - ROOT) effective in ...
      obj = noun_chunks(noun=child, chunks_for='obj')
      children_string += (' ' + obj)
    elif child.dep_ in ['amod']:
      for amod_child in child.children:
        if amod_child.dep_ in ['nmod', 'xcomp']:
          obj = noun_chunks(noun=amod_child, chunks_for='obj')
          children_string += (' ' + obj)

  return children_string.replace(" 's", "'s")

def find_sub_verbs(verb:Token):
  """
  Find the verbs which relate to a given verb
  """
  if verb.dep_ == 'cop':
    new_verb = verb.head
  else:
    new_verb = verb

  sub_verbs = [s_verb for s_verb in new_verb.children if s_verb.dep_ in ['conj', 'advcl'] and s_verb.pos_ in ['VERB'] and not is_negative(s_verb)]
  idx = 0
  while idx < len(sub_verbs):
    sub_verbs += [s_verb for s_verb in sub_verbs[idx].children if s_verb.dep_ in ['conj'] and s_verb.pos_ in ['VERB'] and not is_negative(s_verb)]
    idx += 1
  return [sub_verb for sub_verb in sub_verbs if is_main_verb(verb=sub_verb) and is_important(verb=sub_verb) and (not is_negative(verb=sub_verb))]

def find_subj_of(verb:Token):
  """
  Find subject for a verb.
  """
  curr_verb = verb
  while True:
    for child in curr_verb.children:
      if child.dep_ in ['nsubj', 'nsubjpass']:
        return child
    if curr_verb == curr_verb.head:
      break
    curr_verb = curr_verb.head

  return None

def is_main_adj(adj:Token):
  """
  Check if an adjective is important or not.
  Example 1: be useful => important
  Example 2: be able to => not important
  """
  for child in adj.children:
    if child.dep_ == 'xcomp' and child.pos_ not in ['NOUN']:
      return False
  
  if adj.pos_ not in ['ADJ']:
    return False
  
  return True

def is_main_verb(verb:Token):
  """
  Check if a verb is important. A verb is not important when it doesn't mean as the function, but the modifier.
  """
  if verb.dep_ in ['amod']:
    return False
  for child in verb.children:
    if child.dep_ == 'xcomp' and child.pos_ == 'VERB':
      return False
  return True

def clause_extraction(nlp:English, sentence:str)->list:
  """
  Extract the clauses in a given sentence.
  Example: A is B and A do C => [A is B ; A do C]
  """
  all_subj = find_all_subj(nlp=nlp, sentence=sentence)
  if len(all_subj) == 0:
    return []

  from_verb_dep = ['det', 'dobj', 'nmod', 'neg']
  clauses = []
  doc = nlp(sentence)
  for subj in all_subj:
    verbs_for_subj = []
    for token in doc:
      if noun_chunks(noun=token) == subj:
        verb = None
        if token.dep_ in ['nsubj', 'nsubjpass']:
          if is_negative(token) or is_negative(token.head):
            continue
          if ((not is_negative(token.head)) and is_important(verb=token.head) and (token.head.pos_ in ['VERB'] and is_main_verb(token.head))) or is_main_adj(token.head):
            verb = token.head
          elif token.head.pos_ in ['NOUN']:
            # connect 2 noun phrase with a cop, such as tobe: A is B ==> A has head is B
            main_result = [child for child in token.head.children if child.pos_ == 'VERB' and child.dep_ == 'cop']
            if len(main_result) > 0:
              verb = main_result[0]

          elif (not is_negative(token.head)) and token.head.pos_ in ['ADJ', 'VERB'] and token.head.dep_ not in ['amod'] and is_important(verb=token.head):
            # Example: The 3-NP administered animals had impaired spatial memory ==> administered is amod
            # but also verb of subj 3-NP
            main_result = [child for child in token.head.children if child.pos_ == 'VERB' and child.dep_ == 'xcomp']
            if len(main_result) > 0:
              verb = main_result[0]
          else:
            continue
        elif token.dep_ in ['appos', 'acl:relcl', 'conj']: # For appos subj 
          # Example: Omega-3, which is a PUFA ==> acl:relcl
          # Example, Omega-3, a fatty acid, ... ==> appos  
           tmp = token

           while tmp.head.pos_ not in ['VERB', 'ADJ'] and tmp != tmp.head: # Find the root of alternative subj if it doesn't has its own verb
             tmp = tmp.head
           
           if (not is_negative(tmp.head) and is_important(verb=tmp.head) and tmp.head.pos_ in ['VERB'] and is_main_verb(tmp.head)) or is_main_adj(tmp.head):
             verb = tmp.head

           elif tmp.head.pos_ in ['NOUN']:
             result = [child for child in tmp.head.children if child.pos_ == 'VERB' and child.dep_ == 'cop']
             if len(result) > 0:
               verb = result[0]
             
           elif (not is_negative(tmp.head)) and tmp.head.pos_ in ['ADJ', 'VERB'] and tmp.head.dep_ not in ['amod'] and is_important(verb=tmp.head):
             result = [child for child in tmp.head.children if child.pos_ == 'VERB' and child.dep_ == 'xcomp']
             if len(result) > 0:
               verb = result[0]
        else:
          continue

        if not verb:
          continue
        
        verbs_for_subj.append(verb)
        sub_verbs = find_sub_verbs(verb=verb)
        if token.dep_ not in ['appos', 'conj', 'acl:relcl']:
          verbs_for_subj += [s_verb for s_verb in sub_verbs if find_subj_of(verb=s_verb) and find_subj_of(verb=s_verb) == token]
        else:
          head_of_subj = token
          while head_of_subj.dep_ not in ['nsubj', 'nsubjpass']:
            head_of_subj = head_of_subj.head
          verbs_for_subj += [s_verb for s_verb in sub_verbs if find_subj_of(verb=s_verb) and find_subj_of(verb=s_verb) == head_of_subj]
    for verb in verbs_for_subj:
      verb_phrase = get_verb_phrase(verb=verb)
      if is_meaning_chunk(doc=doc, chunk=verb_phrase):
        clauses.append((subj, verb_phrase))
  return list(set(clauses))

In [81]:
sentence = "It was concluded that an anxiety state constitutes a contributing background for developing VA and it was speculated that such an anxiety state may lead to an exaggerated secretion of stress hormones, resulting in vasospasm of the coronary arteries."
print(sentence)
doc = nah_nlp(process_sentence(sentence=sentence))

for token in doc:
  print(f"{token.text:20} {token.dep_:20} {token.pos_:20} {token.head.text:20} {token.idx:5}")
[print(clause) for clause in clause_extraction(nlp=nah_nlp, sentence=process_sentence(sentence=sentence))]
print("--------")
#for ent in doc.ents:
#  print(f"{ent.text:40} {ent[-1].dep_:20} {ent[-1].pos_:20}")

It was concluded that an anxiety state constitutes a contributing background for developing VA and it was speculated that such an anxiety state may lead to an exaggerated secretion of stress hormones, resulting in vasospasm of the coronary arteries.
It                   nsubjpass            PRON                 concluded                0
was                  auxpass              VERB                 concluded                3
concluded            ROOT                 VERB                 concluded                7
that                 mark                 SCONJ                constitutes             17
an                   det                  DET                  state                   22
anxiety              compound             NOUN                 state                   25
state                nsubj                NOUN                 constitutes             33
constitutes          ccomp                VERB                 concluded               39
a                    det      

In [None]:
dep_list = ["ROOT", "acl", "acl:relcl", "acomp", "advcl", "advmod", "amod", "amod@nmod", "appos",
            "attr", "aux", "auxpass", "case", "cc", "cc:preconj", "ccomp", "compound", "compound:prt",
            "conj", "cop", "csubj", "dative", "dep", "det", "det:predet", "dobj", "expl", "intj", "mark",
            "meta", "mwe", "neg", "nmod", "nmod:npmod", "nmod:poss", "nmod:tmod", "nsubj", "nsubjpass", 
            "nummod", "parataxis", "pcomp", "pobj", "preconj", "predet", "prep", "punct", "quantmod", "xcomp"]

for dep in dep_list:
  if spacy.explain(dep):
    print(f"{dep:20} {spacy.explain(dep)}")

acl                  clausal modifier of noun (adjectival clause)
acomp                adjectival complement
advcl                adverbial clause modifier
advmod               adverbial modifier
amod                 adjectival modifier
appos                appositional modifier
attr                 attribute
aux                  auxiliary
auxpass              auxiliary (passive)
case                 case marking
cc                   coordinating conjunction
ccomp                clausal complement
compound             compound
conj                 conjunct
cop                  copula
csubj                clausal subject
dative               dative
dep                  unclassified dependent
det                  determiner
dobj                 direct object
expl                 expletive
intj                 interjection
mark                 marker
meta                 meta modifier
neg                  negation modifier
nmod                 modifier of nominal
nsubj                nomi

In [None]:
# Test for a sample
sentences = pd.read_csv("sentences.csv")['Sentence'].tolist()[:50]
for sentence in sentences:
  processed_sentence = process_sentence(sentence)
  print(processed_sentence)
  clauses = clause_extraction(nlp=nah_nlp, sentence=processed_sentence)
  if len(clauses) > 0:
    [print(clause) for clause in clauses]
  print('---')

In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD.
---
These reports are in line with data from a limited number of case studies and small controlled trials in patients with Tourette syndrome (TS), a chronic motor and vocal tic disorder often associated with comorbid obsessive compulsive behavior (OCB), reporting an improvement of tics and of comorbid OCB after use of different kinds of cannabis-based medicines including THC, cannabis extracts, and flowers.
---
Here we present the case of a 22-year-old male patient, who suffered from severe OCD since childhood and significantly improved after treatment with medicinal cannabis with markedly reduced OCD and depression resulting in a considerable improvement of quality of life.
---
In TS/CTD, RA may be regarded as a major comorbidity that requires clinica

---
🤔 🤔
There are multiple sentences which have no valid clauses (Meaning Subject and Meaning Verb phrase).

---

## Relation Extraction Function

Apply `extract_relation` function to any sentence which has at least a valid clause. 

In [30]:
# Find max relation
def find_max_relation(relations:list)->tuple:
  """
  From a list of candidate relations, get the longest relation.
  """
  max_length = 0
  max_idx = 0

  idx = 0
  while idx < len(relations):
    if relations[idx][2] - relations[idx][1] > max_length:
      max_length = relations[idx][2] - relations[idx][1]
      max_idx = idx
    idx += 1
  return relations[max_idx]

# Match groups from matcher results
def is_same_group(match_1:tuple, match_2:tuple)->bool:
  if match_1[1] >= match_2[2]:
    return False
  if match_2[1] >= match_1[2]:
    return False
  return True

def group_matches(matches:list)->list:
  groups = []
  if len(matches) > 0:
    curr_longest_match = matches[0]
    for idx in range(1, len(matches)):
      if is_same_group(match_1=matches[idx-1], match_2=matches[idx]):
        if (curr_longest_match[2] - curr_longest_match[1]) < (matches[idx][2] - matches[idx][1]):
          curr_longest_match = matches[idx]
      else:
        groups.append(curr_longest_match)
        curr_longest_match = matches[idx]
    
    # the final match
    groups.append(curr_longest_match)
        
  return groups
  
# noun phrase
def get_noun_phrases(nlp:English, clause:str, pattern:list=NOUN_PHRASE)->list:
  """
  Get all available noun phrases in a given string. 
  """
  np_matcher = Matcher(nlp.vocab)
  np_matcher.add("noun_phrase", [pattern])

  text = nlp(clause)
  np_matches = np_matcher(text)

  groups_np = []
  if len(np_matches) > 0:
    tmp_match = np_matches[0]
    is_same_group = False
    for i in range(len(np_matches)-1):
        if np_matches[i][1] == np_matches[i+1][1] or np_matches[i][2] == np_matches[i+1][2]\
            or tmp_match[1] == np_matches[i+1][1] or tmp_match[2] == np_matches[i+1][2] :
            is_same_group = True
            if (tmp_match[2] - tmp_match[1]) < (np_matches[i+1][2] - np_matches[i+1][1]):
                tmp_match = np_matches[i+1]
        else:
            groups_np.append(tmp_match)
            tmp_match = np_matches[i+1]
        if (i+1) == len(np_matches) - 1:
            groups_np.append(tmp_match)
  return group_matches(np_matches)

# Normalize relation
def simplify_relation(relation:Span)->str:
  """
  Simplify a given relation and convert it into a string.
  """
  new_relation = [token.text for token in relation if token.dep_ not in ['det', 'nummod']]
  return ' '.join(new_relation)

# Relation extraction function
def extract_relation(nlp:English, clause_tuple:tuple)->list:
  """
  Get all ( Noun phrase 01 , relation, Noun phrase 02 ) from an extracted clause tuple (NP1, VP)
  Example: ("A", "do B to get C") => [(A, do, B), (A, do B to get, C)]
  """
  results = []

  subject = clause_tuple[0]
  clause = clause_tuple[1].replace(" 's", "'s")
  # Noun Phrase
  noun_phrase = [ {'POS': 'DET', 'OP': '?'}, {'POS': {'IN': ['ADJ', 'NOUN', 'PRON', 'NUM', 'CCONJ']}, 'OP': '+'} ]
  nb_of_noun_phrase = len(get_noun_phrases(nlp=nlp, clause=clause))
  #print("Number of noun phrase:", get_noun_phrases(nlp=nlp, clause=clause))
  relation_phrases = []

  # Verb Phrase
  verb_phrase = [ {'POS': 'VERB', 'OP': '+'}, {'POS': {'IN': ['ADP', 'PART']}, 'OP': '*'}]
  linking_verb_phrase = [ {'POS': 'VERB', 'OP': '+'}, {'POS': 'ADJ', 'OP': '+'}, {'POS': {'IN': ['ADP', 'PART']}, 'OP': '*'}]

  # Added Pattern
  curr_nb = nb_of_noun_phrase - 1
  while curr_nb >= 0:
    pattern_phrase = verb_phrase + (noun_phrase + [{'POS': {'IN': ['PART', 'ADP']}}])*curr_nb
    relation_phrases.append(pattern_phrase)
    
    pattern_phrase = linking_verb_phrase + (noun_phrase + [{'POS': {'IN': ['PART', 'ADP']}}])*curr_nb
    relation_phrases.append(pattern_phrase)

    curr_nb -= 1

  # Find relation
  text = nlp(clause)
  long_relations = []
  for relation in relation_phrases:
    relation_match = Matcher(nlp.vocab)
    relation_match.add("relation_phrase", [relation])
    relations = relation_match(text)
    if len(relations) > 0:
      max_relation = find_max_relation(relations)
      if is_meaning_chunk(doc=text, chunk=text[max_relation[2]:].text):
        results.append((subject, simplify_relation(text[max_relation[1]:max_relation[2]]).replace(" 's", "'s"), text[max_relation[2]:].text.replace(" 's", "'s")))
      if is_meaning_chunk(doc=text, chunk=text[max_relation[1]:max_relation[2]].text):
        long_relations.append(text[max_relation[1]:max_relation[2]])
      break

  idx = 0
  while idx < len(long_relations):
    short_text = long_relations[idx][:-1]
    nb_of_noun_phrase = nb_of_noun_phrase = len(get_noun_phrases(nlp=nlp, clause=short_text.text))
    relation_phrases = []

    curr_nb = nb_of_noun_phrase - 1
    while curr_nb >= 0:
      pattern_phrase = verb_phrase + (noun_phrase + [{'POS': {'IN': ['PART', 'ADP']}}])*curr_nb
      relation_phrases.append(pattern_phrase)
      
      pattern_phrase = linking_verb_phrase + (noun_phrase + [{'POS': {'IN': ['PART', 'ADP']}}])*curr_nb
      relation_phrases.append(pattern_phrase)

      curr_nb -= 1

    for relation in relation_phrases:
      relation_match = Matcher(nlp.vocab)
      relation_match.add("relation_phrase", [relation])
      
      relations = relation_match(short_text)

      if len(relations) > 0:
        max_relation = find_max_relation(relations)
        if is_meaning_chunk(doc=text, chunk=short_text[max_relation[2]:].text):
          results.append((subject, simplify_relation(relation=short_text[max_relation[1]:max_relation[2]]).replace(" 's", "'s"), short_text[max_relation[2]:].text.replace(" 's", "'s")))
        if is_meaning_chunk(doc=text, chunk=short_text[max_relation[1]:max_relation[2]].text):
          long_relations.append(short_text[max_relation[1]:max_relation[2]])
        break
    idx += 1

  return results

In [None]:
sentence = process_sentence("Smoking and alcohol abuse are established risk factors for chronic pancreatitis (CP).")
clauses = clause_extraction(nlp=nah_nlp, sentence=sentence)
if len(clauses) > 0:
  for clause in clauses:
    print(extract_relation(nlp=nah_nlp, clause_tuple=clause))

[('Smoking abuse', 'be established risk factors for', 'chronic pancreatitis ( CP )')]


In [None]:
sentences = pd.read_csv("sentences.csv")['Sentence'].tolist()
no_clause = 0
no_relation = 0
has_relation = 0

for sent in sentences[:200]:
  sentence = process_sentence(sentence=sent)
  print(sentence)
  print()
  clauses = clause_extraction(nlp=nah_nlp, sentence=sentence)
  if len(clauses) > 0:
    for clause in clauses:
      relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
      if len(relations) > 0:
        for relation in relations:
          print(relation)
          has_relation += 1
        print()
      else:
        print("No relation")
        no_relation += 1
  else:
    print("No clause")
    no_clause += 1
  print("-----")
  print()

print(f"There are {no_clause} sentences has no clause and {no_relation} sentences has no relation.")
print(f"There are {has_relation} relations in {len(sentences[:200])} sentences.")

In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD.

No clause
-----

These reports are in line with data from a limited number of case studies and small controlled trials in patients with Tourette syndrome (TS), a chronic motor and vocal tic disorder often associated with comorbid obsessive compulsive behavior (OCB), reporting an improvement of tics and of comorbid OCB after use of different kinds of cannabis-based medicines including THC, cannabis extracts, and flowers.

No clause
-----

Here we present the case of a 22-year-old male patient, who suffered from severe OCD since childhood and significantly improved after treatment with medicinal cannabis with markedly reduced OCD and depression resulting in a considerable improvement of quality of life.

No clause
-----

In TS/CTD, RA may be regarded as

---
😃 😃

Only consider sentences with valid clauses, there are only **6/99** sentences doesn't have any valid tuple (NP01, relation, NP02).

---

## Non-verb Relation Extraction

Apply some common patterns to extract the relation between entities that are not connected by a verb.

In [None]:
sentence = "Also, compromised contextual and cued fear conditioning in these animals suggests a considerable memory deficit, thus potentially forming a connection to the previously determined ether lipid deficit in human patients with Alzheimer's disease."
doc = nah_nlp(sentence)
for token in doc:
  print(f"{token.text:20} {token.dep_:20} {token.pos_:20} {token.head.text}")

Also                 advmod               ADV                  compromised
,                    punct                PUNCT                compromised
compromised          ROOT                 VERB                 compromised
contextual           dobj                 NOUN                 compromised
and                  cc                   CCONJ                compromised
cued                 conj                 VERB                 compromised
fear                 compound             NOUN                 conditioning
conditioning         dobj                 NOUN                 cued
in                   case                 ADP                  animals
these                det                  DET                  animals
animals              nmod                 NOUN                 conditioning
suggests             conj                 VERB                 compromised
a                    det                  DET                  deficit
considerable         amod                 

In [32]:
def triple_generate_from_pattern(nlp:English, text:Doc, groups_matches:list, type_triple:str="left", pattern=NOUN_PHRASE)->tuple:
  """
  Generate triple from a group matches which contains the no-verb patterns.
  """
  for group in groups_matches:    
    no_verb_text = text[group[1]:group[2]]
    noun_phrases = [(no_verb_text[np[1]-1], no_verb_text[np[1]:np[2]]) for np in get_noun_phrases(nlp=nlp, clause=no_verb_text.text, pattern=pattern)]
    meaning_noun_phrases = [np for np in noun_phrases if is_meaning_chunk(doc=text, chunk=np[1].text)]
    
    if len(meaning_noun_phrases) >= 2 and len(noun_phrases) == 3:
      if type_triple == 'left':
        np_1 = noun_phrases[1][1].text
        np_2 = noun_phrases[2][1].text
        relation = "have " + noun_phrases[0][1].text + " " + noun_phrases[2][0].text
      elif type_triple == 'right':
        np_1 = noun_phrases[2][1].text
        np_2 = noun_phrases[1][1].text
        relation = "have " + noun_phrases[0][1].text + " " + noun_phrases[1][0].text
      elif type_triple == 'center':
        np_1 = noun_phrases[1][1].text
        np_2 = noun_phrases[2][1].text
        relation = "have " + noun_phrases[0][1].text + " with"
        if is_meaning_chunk(doc=text, chunk=np_1) and is_meaning_chunk(doc=text, chunk=np_2):
          return ((np_1, relation, np_2), (np_2, relation, np_1))
      else:
        np_1 = noun_phrases[0][1].text
        np_2 = noun_phrases[2][1].text
        relation = noun_phrases[1][0].text + " " + noun_phrases[1][1].text + " " + noun_phrases[2][0].text

      if is_meaning_chunk(doc=text, chunk=np_1) and is_meaning_chunk(doc=text, chunk=np_2):        
        return (np_1, relation, np_2)

    return None

def non_verb_relation_extraction(nlp:English, sentence:str)->list:
  """
  Extract relations between no-verb connected noun-phrases
  """
  text = nlp(sentence)
  results = []

  # Left -> Right
  pattern = NOUN_PHRASE + [ {'LOWER': {'IN': ['in', 'of']}} ] + NOUN_PHRASE + [ {'LOWER': {'IN': ['to', 'on', 'for', 'in', 'with']}} ] + NOUN_PHRASE
  # Left -> Right Pattern Matching
  no_verb_matcher = Matcher(nlp.vocab)
  no_verb_matcher.add("no_verb_matcher", [pattern])

  matches = no_verb_matcher(text)
  groups = group_matches(matches=matches)
  result = triple_generate_from_pattern(nlp=nlp, text=text, groups_matches=groups)
  if result:
    results.append(result)

  # Right -> Left
  pattern = NOUN_PHRASE + [ {'LOWER': {'IN': ['to', 'for', 'of', 'on']}} ] + NOUN_PHRASE + [ {'LOWER': {'IN': ['by', 'through']}} ] + NOUN_PHRASE
  # Right -> Left Pattern Matching
  no_verb_matcher = Matcher(nlp.vocab)
  no_verb_matcher.add("no_verb_matcher", [pattern])

  matches = no_verb_matcher(text)
  groups = group_matches(matches=matches)
  result = triple_generate_from_pattern(nlp=nlp, text=text, groups_matches=groups, type_triple="right")
  if result:
    results.append(result)

  # Left -> Right and Left <- Right
  pattern = NO_CONJ_NOUN_PHRASE + [{'LOWER': 'between'}] + NO_CONJ_NOUN_PHRASE + [{'LOWER': 'and'}] + NO_CONJ_NOUN_PHRASE

  # Center Pattern Matching
  no_verb_matcher = Matcher(nlp.vocab)
  no_verb_matcher.add("no_verb_matcher", [pattern])

  matches = no_verb_matcher(text)
  groups = group_matches(matches=matches)
  result = triple_generate_from_pattern(nlp=nlp, text=text, groups_matches=groups, type_triple="center", pattern=NO_CONJ_NOUN_PHRASE)
  if result:
    results += result

  # Observation Pattern
  pattern = NO_CONJ_NOUN_PHRASE + [{'LOWER': 'in'}, {'POS': {'IN': ['NOUN', 'ADJ']}, 'OP': '*'}, {'LOWER': {'IN': ['patient', 'patients', 'mice', 'rat', 'rats', 'mouse', 'animal', 'animals']}}, {'LOWER': 'with'}] + NO_CONJ_NOUN_PHRASE
  
  # Observation Pattern Matching
  no_verb_matcher = Matcher(nlp.vocab)
  no_verb_matcher.add("no_verb_matcher", [pattern])

  matches = no_verb_matcher(text)
  groups = group_matches(matches=matches)
  
  result = triple_generate_from_pattern(nlp=nlp, text=text, groups_matches=groups, type_triple="observation", pattern=NO_CONJ_NOUN_PHRASE)
  if result:
    results.append(result)

  return results

In [None]:
sentences = ["In the 1980s, there were positive outcomes in research studies using GLA in schizophrenia (Vaddadi et al., 1989).",
             "The hypothesis considered states that low omega-3 PUFA abundance in schizophrenia patients with chronic stress and related disorders is due to an underlying genetically determined abnormality.",
             "In this study, we investigated the role of DHA in stress-evoked pain chronicity using diet-induced n-3 fatty acid deficient mice.",
             "The robust relationship between low erythrocyte n-3 PUFA concentrations and SAD justifies exploration of relevant neuropathophysiological mechanisms.",
             "Also, compromised contextual and cued fear conditioning in these animals suggests a considerable memory deficit, thus potentially forming a connection to the previously determined ether lipid deficit in human patients with Alzheimer's disease."]
for sentence in sentences:
  print(sentence)
  print(non_verb_relation_extraction(nlp=nah_nlp, sentence=sentence))
  print('---')

In [None]:
sentences = pd.read_csv("sentences.csv")['Sentence'].tolist()
no_relation = 0

for sent in sentences[:200]:
  sentence = process_sentence(sentence=sent)
  print(sentence)
  print()

  # Find relation within a verb
  clauses = clause_extraction(nlp=nah_nlp, sentence=sentence)
  if len(clauses) > 0:
    for clause in clauses:
      relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
      if len(relations) > 0:
        for relation in relations:
          print(relation)
        print()
  
  # Find relation without any verb
  no_verb_triples = non_verb_relation_extraction(nlp=nah_nlp, sentence=sentence)
  if len(no_verb_triples) > 0:
    for triple in no_verb_triples:
      print(triple)
    print()
  
  # Cannot find any relation
  if len(clauses) == 0 and len(no_verb_triples) == 0:
    no_relation += 1

  print("-----")
  print()

print(f"There are {no_relation} sentences has no relation.")

In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD.

-----

These reports are in line with data from a limited number of case studies and small controlled trials in patients with Tourette syndrome (TS), a chronic motor and vocal tic disorder often associated with comorbid obsessive compulsive behavior (OCB), reporting an improvement of tics and of comorbid OCB after use of different kinds of cannabis-based medicines including THC, cannabis extracts, and flowers.

-----

Here we present the case of a 22-year-old male patient, who suffered from severe OCD since childhood and significantly improved after treatment with medicinal cannabis with markedly reduced OCD and depression resulting in a considerable improvement of quality of life.

-----

In TS/CTD, RA may be regarded as a major comorbidity that requ

-----

## Triples of Entity Construction

In [31]:
def full_entities(entity:Token):
  return ' '.join([token.text for token in sorted([child for child in entity.children if child.dep_ in ['amod', 'compound']] + [entity], key=lambda token:token.idx)])
  
def get_entities(noun_phrase:Doc):
  """
  Get named entities in a noun phrase.
  """
  entities_label = ['CHEMICAL', 'NUTRITION', 'DISEASE', 'MENTAL_HEALTH']
  full_entity = ['appos', 'acl:recl', 'nmod', 'dobj', 'conj', 'dep', 'ROOT']
  supplement_entity = ['amod', 'compound']
  
  entities = []
  for ent in noun_phrase.ents:
    if ent.label_ in entities_label:
      if ent[-1].dep_ in full_entity:
        entities.append(full_entities(ent[-1]))
      elif ent[-1].dep_ in supplement_entity:
        entities.append(ent.text + ' ' + ent[-1].head.text)
  return entities

def extract_entities_relation(nlp:English, triple:tuple):
  """
  Extract entity relation triples from a given noun phrase relation triple.
  """
  entities_1 = []
  entities_2 = []
  
  # Get entities from noun phrases group 1
  np_1 = triple[0]
  doc_1 = nlp(np_1)
  entities_1 = get_entities(doc_1)

  # Get entities from noun phrases group 2
  np_2 = triple[2]
  doc_2 = nlp(np_2)
  entities_2 = get_entities(doc_2)

  # Get only unique values of entities_1 and entities_2
  entities_1 = set(entities_1)
  entities_2 = set(entities_2)

  # Generate entities_triple
  entity_triples = []
  for ent_1 in entities_1:
    for ent_2 in entities_2:
      new_triple = (ent_1, triple[1], ent_2)
      entity_triples.append(new_triple)

  return entity_triples

In [None]:
sentences = pd.read_csv("sentences.csv")['Sentence'].tolist()
no_relation = 0
entitiy_triples = []
for sent in sentences[:200]:
  sentence = process_sentence(sentence=sent)
  print(sentence)

  # Find relation within a verb
  clauses = clause_extraction(nlp=nah_nlp, sentence=sentence)
  if len(clauses) > 0:
    for clause in clauses:
      relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
      if len(relations) > 0:
        for relation in relations:
          entity_triple = extract_entities_relation(nlp=nah_nlp, triple=relation)
          if len(entity_triple) > 0:
            for triple in entity_triple:
              entitiy_triples.append((sentence, triple[0], triple[1], triple[2]))
              print(triple)
  
  # Find relation without any verb
  no_verb_triples = non_verb_relation_extraction(nlp=nah_nlp, sentence=sentence)
  if len(no_verb_triples) > 0:
    for triple in no_verb_triples:
      entity_triple = extract_entities_relation(nlp=nah_nlp, triple=triple)
      if len(entity_triple) > 0:
        for triple in entity_triple:
          entitiy_triples.append((sentence, triple[0], triple[1], triple[2]))
          print(triple)
  
  # Cannot find any relation
  if len(clauses) == 0 and len(no_verb_triples) == 0:
    no_relation += 1
  print("---")
  
print(f"There are {no_relation} sentences has no relation.")


In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD.
---
These reports are in line with data from a limited number of case studies and small controlled trials in patients with Tourette syndrome (TS), a chronic motor and vocal tic disorder often associated with comorbid obsessive compulsive behavior (OCB), reporting an improvement of tics and of comorbid OCB after use of different kinds of cannabis-based medicines including THC, cannabis extracts, and flowers.
---
Here we present the case of a 22-year-old male patient, who suffered from severe OCD since childhood and significantly improved after treatment with medicinal cannabis with markedly reduced OCD and depression resulting in a considerable improvement of quality of life.
---
In TS/CTD, RA may be regarded as a major comorbidity that requires clinica

In [None]:
entitiy_triples

[('Flibanserin, a multifunctional serotonin receptor agonist and antagonist, is currently approved in the United States and Canada for the treatment of acquired, generalized hypoactive sexual desire disorder (HSDD) in premenopausal women.',
  'Flibanserin',
  'be approved in',
  'HSDD'),
 ('Flibanserin, a multifunctional serotonin receptor agonist and antagonist, is currently approved in the United States and Canada for the treatment of acquired, generalized hypoactive sexual desire disorder (HSDD) in premenopausal women.',
  'Flibanserin',
  'be approved in',
  'acquired generalized hypoactive sexual desire disorder'),
 ('Flibanserin, a multifunctional serotonin receptor agonist and antagonist, is currently approved in the United States and Canada for the treatment of acquired, generalized hypoactive sexual desire disorder (HSDD) in premenopausal women.',
  'serotonin agonist',
  'be approved in',
  'HSDD'),
 ('Flibanserin, a multifunctional serotonin receptor agonist and antagonist, 

# Implement for the entire sentence dataset

In [None]:
sentences_df = pd.read_csv("sentences.csv")

entitiy_triples = []

for (idx, row) in sentences_df.iterrows():
  print('Sentence', idx)
  sentence = process_sentence(sentence=row['Sentence'])
  print(sentence)

  # Find relation within a verb
  clauses = clause_extraction(nlp=nah_nlp, sentence=sentence)
  if len(clauses) > 0:
    for clause in clauses:
      relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
      if len(relations) > 0:
        for relation in relations:
          entity_triple = extract_entities_relation(nlp=nah_nlp, triple=relation)
          if len(entity_triple) > 0:
            for triple in entity_triple:
              entitiy_triples.append((row['ID_papers'], row['Sentence'], triple[0], triple[1], triple[2]))
  
  # Find relation without any verb
  no_verb_triples = non_verb_relation_extraction(nlp=nah_nlp, sentence=sentence)
  if len(no_verb_triples) > 0:
    for triple in no_verb_triples:
      entity_triple = extract_entities_relation(nlp=nah_nlp, triple=triple)
      if len(entity_triple) > 0:
        for triple in entity_triple:
          entitiy_triples.append((row['ID_papers'], row['Sentence'], triple[0], triple[1], triple[2]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentence 20267
The aim of this study was to determine a potential benefit of the specific psychoeducational intervention "Learning to Live with Cancer" (LTLWC) for patients with operated nonmetastatic breast cancer, with respect to psychological variables and endocrine and immune parameters.
Sentence 20268
The aim of the study was to analyze amygdala volume in patients with CS and its relationship with anxiety, depression, and hormone levels.
Sentence 20269
We describe an unusual and challenging clinical scenario: a patient with end-stage renal disease on hemodialysis with severely uncontrolled hypothyroidism and worsening psychosis, who refused oral and intramuscular levothyroxine, but was successfully treated with intravenous (IV) levothyroxine given on hemodialysis days.
Sentence 20270
Danzhi Xiaoyao Powder (DP), a historical traditional Chinese medicine formula, is a promising treatment for BP control in hypertensive 

In [None]:
print(f"There are {len(entitiy_triples)} relations between 2 entities in all sentences.")

There are 19012 relations between 2 entities in all sentences.


In [None]:
entities_relations_all_df = pd.DataFrame(entitiy_triples, columns=["ID_papers", "Sentence", "E1", "Relation", "E2"])
entities_relations_all_df.head(5)

Unnamed: 0,ID_papers,Sentence,E1,Relation,E2
0,10,"Flibanserin, a multifunctional serotonin recep...",Flibanserin,be approved in,HSDD
1,10,"Flibanserin, a multifunctional serotonin recep...",Flibanserin,be approved in,acquired generalized hypoactive sexual desire ...
2,10,"Flibanserin, a multifunctional serotonin recep...",serotonin agonist,be approved in,HSDD
3,10,"Flibanserin, a multifunctional serotonin recep...",serotonin agonist,be approved in,acquired generalized hypoactive sexual desire ...
4,10,"Flibanserin, a multifunctional serotonin recep...",multifunctional serotonin receptor agonist,be approved in,HSDD


In [None]:
entities_relations_all_df.to_csv("nah_data_raw.csv", index=False)

------

# Example

In [82]:
sentence = "Pioglitazone (or glitazone) ameliorated LPS-induced dopaminergic neuronal loss, as well as mitigated neurobehavioral impairments while  vitamin C administration successfully attenuated memory impairment, fatigue development and more."
sentence_processed = process_sentence(sentence=sentence)

clauses = clause_extraction(nlp=nah_nlp, sentence=sentence_processed)

for clause in clauses:
  relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
  if len(relations) > 0:
    print("Clause:", clause)
    [print("Relation:", relation) for relation in relations]
    for relation in relations:
      entity_triple = extract_entities_relation(nlp=nah_nlp, triple=relation)
      if len(entity_triple) > 0:
        for triple in entity_triple:
          print("Triple:", triple)
    print("---")

Clause: ('glitazone', 'ameliorate LPS-induced dopaminergic neuronal loss ,')
Relation: ('glitazone', 'ameliorate', 'LPS-induced dopaminergic neuronal loss ,')
Triple: ('glitazone', 'ameliorate', 'LPS-induced dopaminergic neuronal loss')
---
Clause: ('Pioglitazone', 'mitigate neurobehavioral impairments')
Relation: ('Pioglitazone', 'mitigate', 'neurobehavioral impairments')
Triple: ('Pioglitazone', 'mitigate', 'neurobehavioral impairments')
---
Clause: ('vitamin C administration', 'attenuate memory impairment , fatigue development and more')
Relation: ('vitamin C administration', 'attenuate', 'memory impairment , fatigue development and more')
Triple: ('vitamin C administration', 'attenuate', 'memory impairment')
Triple: ('vitamin C administration', 'attenuate', 'fatigue development')
---
Clause: ('glitazone', 'mitigate neurobehavioral impairments')
Relation: ('glitazone', 'mitigate', 'neurobehavioral impairments')
Triple: ('glitazone', 'mitigate', 'neurobehavioral impairments')
---
Cla

In [83]:
sentence = "Vitamin C (ascorbic acid) and vitamin E, which is a fat-soluble nutrient, have beneficials effects on psychotic disorder and improve anxiety"
for token in nah_nlp(sentence):
  print(f"{token.text:20} {token.dep_:20} {token.head.text:20}")

Vitamin              compound             C                   
C                    nsubj                have                
(                    punct                acid                
ascorbic             amod                 acid                
acid                 appos                C                   
)                    punct                acid                
and                  cc                   C                   
vitamin              compound             E                   
E                    conj                 C                   
,                    punct                C                   
which                nsubj                nutrient            
is                   cop                  nutrient            
a                    det                  nutrient            
fat-soluble          amod                 nutrient            
nutrient             acl:relcl            C                   
,                    punct                C            

In [84]:
sentence_processed = process_sentence(sentence=sentence)

clauses = clause_extraction(nlp=nah_nlp, sentence=sentence_processed)
for clause in clauses:
  relations = extract_relation(nlp=nah_nlp, clause_tuple=clause)
  if len(relations) > 0:
    [print("Relation:", relation) for relation in relations]
    for relation in relations:
      entity_triple = extract_entities_relation(nlp=nah_nlp, triple=relation)
      if len(entity_triple) > 0:
        for triple in entity_triple:
          print("Triple:", triple)
    print("---")

Relation: ('fat-soluble nutrient', 'have beneficials effects on', 'psychotic disorder')
Triple: ('fat-soluble nutrient', 'have beneficials effects on', 'psychotic disorder')
---
Relation: ('Vitamin C', 'improve', 'anxiety')
Triple: ('Vitamin C', 'improve', 'anxiety')
---
Relation: ('vitamin E', 'improve', 'anxiety')
Triple: ('vitamin E', 'improve', 'anxiety')
---
Relation: ('ascorbic acid', 'improve', 'anxiety')
Triple: ('ascorbic acid', 'improve', 'anxiety')
---
Relation: ('ascorbic acid', 'have beneficials effects on', 'psychotic disorder')
Triple: ('ascorbic acid', 'have beneficials effects on', 'psychotic disorder')
---
Relation: ('vitamin E', 'have beneficials effects on', 'psychotic disorder')
Triple: ('vitamin E', 'have beneficials effects on', 'psychotic disorder')
---
Relation: ('Vitamin C', 'have beneficials effects on', 'psychotic disorder')
Triple: ('Vitamin C', 'have beneficials effects on', 'psychotic disorder')
---
Relation: ('fat-soluble nutrient', 'improve', 'anxiety')