# Sentence Simplification

https://github.com/szymeklimek/BEng-Thesis---Semantic-Analysis/blob/89552577f77a40e8cce89296fe45aac86adda6b9/src/simple_sentences_algorithm.py

In [9]:
import json
from pycorenlp import *

In [156]:
class SimpleSentence:
    """Split a sentence into simple sentences using CoreNLP dependencies.

    One simple sentence is produced per subject (``nsubj*``) edge found in
    the ``basicDependencies`` of the first parsed sentence: the edges that
    are transitively connected to the subject edge are collected (after
    dropping clause-linking relations) and their words are re-joined in
    token order.
    """

    # Relations removed before the per-subject trees are built.
    _EXCLUDED_RELATIONS = frozenset({
        'mark', 'acl', 'appos', 'advcl', 'cc', 'ccomp', 'conj', 'dep',
        'parataxis', 'ref', 'punct', 'acl:relcl', 'det',
    })

    def __init__(self, uri="http://localhost:9000/"):
        """Create a client for the CoreNLP server reachable at *uri*."""
        self.nlp = StanfordCoreNLP(uri)
        self.prop = {
            "timeout": "50000",
            "annotators": "tokenize, pos, lemma, openie, depparse",
            "outputFormat": "json",
            "openie.max_entailments_per_clause": "1",  # no of triples from each clause
            "openie.triple.strict": "true",
            # Optional: 'openie.affinity_probability_cap': 2 / 3
        }

    def _check_if_found(self, found, idx_gov, idx_dep):
        """Return True if either token index is already in *found*."""
        return idx_gov in found or idx_dep in found

    def _get_subj_indexes(self, sentence):
        """Return positions in ``basicDependencies`` of subject edges.

        An edge counts as a subject when its relation label contains the
        substring ``'nsubj'``.
        """
        return [
            idx
            for idx, dep in enumerate(sentence['basicDependencies'])
            if 'nsubj' in str(dep['dep'])
        ]

    def _check_if_simple_sentence(self, sub_count):
        """Return True when exactly one subject index was found."""
        return len(sub_count) == 1

    def _reccurent_find_all(self, deps_to_consider, found=None, to_find=None):
        """Recursively collect edges reachable from the tokens in *to_find*.

        Args:
            deps_to_consider: list of dependency dicts still available.
            found: token indexes visited on the previous step; an edge that
                touches any of them is skipped.
            to_find: token indexes whose incident edges are wanted.

        Returns:
            A list of the matching dependency dicts, in discovery order.
        """
        # None sentinels replace the former mutable ``[]`` defaults.
        found = [] if found is None else found
        to_find = [] if to_find is None else to_find

        deps = []
        for idx, dep in enumerate(deps_to_consider):
            gov = dep['governor']
            dependent = dep['dependent']
            if gov not in to_find and dependent not in to_find:
                continue
            if self._check_if_found(found, gov, dependent):
                continue
            deps.append(dep)
            # Continue from the far end of this edge, without the edge itself.
            remaining = deps_to_consider[:idx] + deps_to_consider[idx + 1:]
            next_idx = dependent if gov in to_find else gov
            deps.extend(
                self._reccurent_find_all(remaining, to_find, [next_idx])
            )
        return deps

    def _find_connected_deps(self, sub, deps_to_consider):
        """Return the edges connected to subject edge *sub*, with *sub* last.

        *deps_to_consider* is not modified.
        """
        gov_id = sub['governor']
        dep_id = sub['dependent']

        remaining = deps_to_consider[:]
        idx_sub = None
        for idx, dep in enumerate(deps_to_consider):
            if dep['governor'] == gov_id and dep['dependent'] == dep_id:
                idx_sub = idx
        # Only drop the subject edge when it is actually present; popping
        # index 0 unconditionally would discard an unrelated edge.
        if idx_sub is not None:
            remaining.pop(idx_sub)

        deps = self._reccurent_find_all(remaining, to_find=[gov_id, dep_id])
        deps.append(sub)
        return deps

    def _get_dependencies_tree_from_compound(self, sentence, sub_indexes):
        """Build one list of connected edges per subject index.

        Args:
            sentence: parsed sentence dict containing ``basicDependencies``.
            sub_indexes: positions returned by ``_get_subj_indexes``.
        """
        deps_to_consider = [
            dep
            for dep in sentence['basicDependencies']
            if str(dep['dep']) not in self._EXCLUDED_RELATIONS
        ]
        # build deps trees for nsubjs
        return [
            self._find_connected_deps(
                sentence['basicDependencies'][sub_idx], deps_to_consider
            )
            for sub_idx in sub_indexes
        ]

    def _glue_words_into_sentences(self, dependencies):
        """Join the words referenced by *dependencies* in token-index order.

        Token index 0 (and anything below) is skipped, so the artificial
        root governor never appears in the output.
        """
        index_to_word = {}
        for dep in dependencies:
            if dep['governor'] > 0:
                index_to_word[dep['governor']] = dep['governorGloss']
            if dep['dependent'] > 0:
                index_to_word[dep['dependent']] = dep['dependentGloss']

        return " ".join(index_to_word[k] for k in sorted(index_to_word))

    def get_simple_sentences(self, text):
        """Return the simple sentences derived from *text*.

        Only the first sentence of the server response is processed. An
        empty list is returned when the response contains no sentences.
        """
        res = self.nlp.annotate(text, properties=self.prop)
        # The client may hand back raw JSON text or an already-decoded
        # object; decode only in the former case.
        if isinstance(res, (str, bytes, bytearray)):
            out = json.loads(res)
        else:
            out = res

        sentences = out['sentences']
        if not sentences:
            return []
        parsed = sentences[0]

        sub_indexes = self._get_subj_indexes(parsed)
        trees = self._get_dependencies_tree_from_compound(parsed, sub_indexes)
        return [self._glue_words_into_sentences(deps) for deps in trees]


In [169]:
def extract_triplet(text, uri="http://localhost:9000/"):
    """Return OpenIE (subject, relation, object) triples for *text*.

    Args:
        text: input text sent to the CoreNLP server.
        uri: base URL of the CoreNLP server.

    Returns:
        A list of ``(subject, relation, object)`` tuples taken from the
        first sentence of the response; an empty list when the response
        contains no sentences.
    """
    nlp = StanfordCoreNLP(uri)
    properties = {
        "timeout": "50000",
        "annotators": "tokenize, pos, lemma, openie, depparse",
        "outputFormat": "json",
        "openie.max_entailments_per_clause": "3",  # no of triples from each clause
        "openie.triple.strict": "true",
        'openie.affinity_probability_cap': 2 / 3,
    }

    res = nlp.annotate(text, properties=properties)
    # The client may hand back raw JSON text or an already-decoded object;
    # decode only in the former case.
    if isinstance(res, (str, bytes, bytearray)):
        out = json.loads(res)
    else:
        out = res

    sentences = out['sentences']
    if not sentences:
        return []
    return [
        (triple['subject'], triple['relation'], triple['object'])
        for triple in sentences[0]['openie']
    ]

In [170]:
# Instantiate the simplifier defined in this notebook (class SimpleSentence).
sc = SimpleSentence()

In [185]:
# Idea: detect whether the sentence has negative polarity, remove the negation,
# extract the triples, then add the negation back.

text = "The Bank should not share any non-public OCC information"
text_c = sc.get_simple_sentences(text)
text_c

['Bank should not share non-public OCC information']

In [172]:
# Triples for the original sentence; the recorded relation omits "not".
extract_triplet(text)

[('Bank', 'should share', 'non-public OCC information')]

In [173]:
# Triples for the simplified sentence produced above.
extract_triplet(text_c[0])

[('Bank', 'should share', 'OCC information'),
 ('Bank', 'should share', 'non-public OCC information')]

In [174]:
# Second example: a longer sentence ending in a coordination ("provider and distributor").
t2 = "bank management should conduct due diligence of foreign-based service provider before contacting with provider and distributor"

In [175]:
# Simplify the second example and display the result.
text_c = sc.get_simple_sentences(t2)
text_c

['bank management should conduct due diligence of foreign based service provider before contacting with provider']

In [176]:
# Triples for the unsimplified second example.
extract_triplet(t2)

[('bank management', 'should conduct due', 'diligence'),
 ('bank management',
  'should conduct due',
  'diligence of foreign based service provider'),
 ('bank management', 'should conduct due', 'diligence of service provider')]

In [184]:
# Example with a negated conjunct ("but not paper"); the recorded output keeps only "I love pen".
txt = "I love pen but not paper"
text_c = sc.get_simple_sentences(txt)
text_c

['I love pen']