In [1]:
import networkx as nx
import pickle
import json
import spacy
import re

from bs4 import BeautifulSoup

In [2]:
# Location of the Javadoc descriptions JSON that the next cell loads.
# NOTE(review): absolute, machine-specific Windows path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
desc_file_path = r'C:\workspace\SOworkspace\data\apidoc_description\javadoc_descriptions.json'

In [3]:
# Parse the whole descriptions file into memory; the handle is closed when the block exits.
with open(desc_file_path, mode='r', encoding='utf-8') as desc_file:
    descriptions = json.load(desc_file)

In [4]:
# Display the (key, HTML description) pair at position 1235 of the dict's iteration order.
list(descriptions.items())[1235]

('api/java.base/java/io/File.html#setExecutable(boolean)',
 '<div class="block">A convenience method to set the owner\'s execute permission for this\n abstract pathname. On some platforms it may be possible to start the Java\n virtual machine with special privileges that allow it to execute files\n that are not marked executable.\n\n <p>An invocation of this method of the form <code>file.setExcutable(arg)</code>\n behaves in exactly the same way as the invocation\n\n </p><pre><code>\n     file.setExecutable(arg, true)\n </code></pre></div>')

In [5]:
# Location of the pickled JDK graph that the next cell loads.
# NOTE(review): absolute, machine-specific Windows path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
fudan_graph_path = r'C:\workspace\SOworkspace\data\concept_map\fudan_jdk_graph.pkl'

In [6]:
# networkx.read_gpickle was removed in NetworkX 3.0; loading the file with the
# standard pickle module is the documented replacement and also works on 2.x.
# NOTE(review): unpickling can execute arbitrary code — only load graph files
# that come from a trusted source.
with open(fudan_graph_path, 'rb') as graph_file:
    fudan_graph = pickle.load(graph_file)
type(fudan_graph.nodes)

networkx.classes.reportviews.NodeView

In [10]:
# Display the attribute dict stored on node 11314.
fudan_graph.nodes[11314]

{'id': 11314,
 'properties': {'qualified_name': 'java.io.File.setExecutable(boolean)',
  'entity_category': 11,
  'full_declaration': 'public boolean setExecutable(boolean executable)',
  'api_type': 11,
  'short_description': "A convenience method to set the owner's execute permission for this abstract pathname. On some platforms it may be possible to start the Java virtual machine with special privileges that allow it to execute files that are not marked executable.\nAn invocation of this method of the form file.setExcutable(arg) behaves in exactly the same way as the invocation\nfile.setExecutable(arg, true) .",
  'id': 15938,
  'added_in_version': '1.6',
  'alias': ['File.setExecutable', 'setExecutable', 'set Executable']},
 'labels': {'code_element', 'entity', 'jdk8', 'method'}}

In [8]:
# Ids of every node whose 'qualified_name' property is exactly
# java.io.File.setExecutable(boolean). Nodes without 'properties' or without
# a 'qualified_name' entry never match.
file_renameto_method_node = [
    node_id
    for node_id, attrs in fudan_graph.nodes(data=True)
    if attrs.get('properties', {}).get('qualified_name') == 'java.io.File.setExecutable(boolean)'
]
file_renameto_method_node

[11314]

In [9]:
# Union of the 'labels' collections of all nodes; nodes without a 'labels'
# attribute contribute nothing.
label_set = {
    label
    for _, attrs in fudan_graph.nodes(data=True)
    if 'labels' in attrs
    for label in attrs['labels']
}
label_set


{'annotation class',
 'base override method',
 'class',
 'class type',
 'code_element',
 'construct method',
 'domain term',
 'entity',
 'enum class',
 'enum constants',
 'error class',
 'exception class',
 'field of class',
 'interface',
 'jdk8',
 'method',
 'operation',
 'package',
 'parameter',
 'primary type',
 'return value',
 'sentence',
 'type',
 'unknown',
 'value',
 'wikidata'}

In [33]:
# Ids of all nodes labelled 'domain term'. Nodes that carry no 'labels'
# attribute are skipped instead of raising KeyError.
term_nodes = [
    node
    for node, attrs in fudan_graph.nodes(data=True)
    if 'domain term' in attrs.get('labels', ())
]
term_nodes[0:10]

[233720,
 233721,
 233722,
 233723,
 233724,
 233725,
 233726,
 233727,
 233728,
 233729]

In [36]:
# Display the attribute dict stored on node 233724.
fudan_graph.nodes[233724]

{'id': 233724,
 'properties': {'term_name': 'newref', 'alias': {'newref'}, 'lemma': 'newref'},
 'labels': {'domain term', 'jdk8'}}

In [11]:
# Pick the raw HTML description of File.setExecutable(boolean) as the working example.
desc = descriptions['api/java.base/java/io/File.html#setExecutable(boolean)']
desc

'<div class="block">A convenience method to set the owner\'s execute permission for this\n abstract pathname. On some platforms it may be possible to start the Java\n virtual machine with special privileges that allow it to execute files\n that are not marked executable.\n\n <p>An invocation of this method of the form <code>file.setExcutable(arg)</code>\n behaves in exactly the same way as the invocation\n\n </p><pre><code>\n     file.setExecutable(arg, true)\n </code></pre></div>'

In [16]:
# Parse the description with the lxml parser and detach every <pre> element
# from the tree, then display what is left.
soup = BeautifulSoup(desc, 'lxml')
for pre_block in soup.find_all('pre'):
    pre_block.extract()
soup

<html><body><div class="block">A convenience method to set the owner's execute permission for this
 abstract pathname. On some platforms it may be possible to start the Java
 virtual machine with special privileges that allow it to execute files
 that are not marked executable.

 <p>An invocation of this method of the form <code>file.setExcutable(arg)</code>
 behaves in exactly the same way as the invocation

 </p></div></body></html>

In [22]:
from nltk.tokenize import sent_tokenize
# soup.text is already free of markup and has HTML entities decoded, so no
# regex tag stripping is applied here: a '<[^>]+>' pass over this text cannot
# remove any tag and would only delete legitimate content such as the
# "<String>" in "List<String>".
plain_text = soup.text
# Split into sentences, then collapse each sentence's internal whitespace
# (the source text is hard-wrapped) to single spaces.
sentences = [' '.join(sent.split()) for sent in sent_tokenize(plain_text)]
sentences

["A convenience method to set the owner's execute permission for this abstract pathname.",
 'On some platforms it may be possible to start the Java virtual machine with special privileges that allow it to execute files that are not marked executable.',
 'An invocation of this method of the form file.setExcutable(arg) behaves in exactly the same way as the invocation']

In [23]:
class SpacyNLPFactory:
    """
    Builds the spaCy pipelines used in this notebook and caches them on the class.

    Every ``create_*`` classmethod loads its pipeline on first use, replaces the
    pipeline's tokenizer with a custom one, stores the result in a class
    attribute and returns that same object on all later calls.
    """
    __domain_extractor_nlp = None
    __identifier_extractor_nlp = None
    __simple_nlp = None

    # All pipelines load the same, fully named model. Two of the three factory
    # methods used the bare "en" shortcut before, which spaCy v3 no longer
    # resolves; the domain extractor already used the full package name.
    _MODEL_NAME = "en_core_web_sm"

    # Passed to the tokenizer as ``token_match`` via ``.match``.
    # NOTE(review): ``.match`` only anchors at the start of the string, so this
    # also accepts any string that merely begins with "id"/"ID"/"Id" — confirm
    # that is intended.
    _ID_TOKEN_RE = re.compile(r"id|ID|Id")
    # Hyphenated words (foo-bar), apostrophe suffixes ('s) and a doubled quote.
    _HYPHEN_TOKEN_RE = re.compile(r"[A-Za-z\d]+-[A-Za-z\d]+|'[a-z]+|''")

    @classmethod
    def _load_with_custom_tokenizer(clss, token_pattern, **load_options):
        """
        Load the model and replace its tokenizer.

        The new tokenizer reuses the loaded pipeline's default prefix, infix and
        suffix rules and uses ``token_pattern.match`` as ``token_match``. No
        ``rules`` argument is passed to the tokenizer.

        :param token_pattern: compiled regex whose ``match`` becomes ``token_match``
        :param load_options: extra keyword arguments forwarded to ``spacy.load``
        :return: the loaded pipeline with the custom tokenizer installed
        """
        # todo: fix this, write a class as Spacy Component
        nlp = spacy.load(clss._MODEL_NAME, **load_options)

        prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
        infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
        suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
        nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                                  infix_finditer=infix_re.finditer,
                                                  suffix_search=suffix_re.search,
                                                  token_match=token_pattern.match)
        return nlp

    @classmethod
    def create_spacy_nlp_for_domain_extractor(clss):
        """
        Return the cached pipeline used to extract domain entities and relations,
        building it on the first call.

        :return: spaCy pipeline whose tokenizer uses the "id" token pattern
        """
        if clss.__domain_extractor_nlp is None:
            clss.__domain_extractor_nlp = clss._load_with_custom_tokenizer(clss._ID_TOKEN_RE)
        return clss.__domain_extractor_nlp

    @classmethod
    def create_spacy_nlp_for_identifier_extractor(clss):
        """
        Return the cached pipeline used by the identifier extractor, building it
        on the first call.

        :return: spaCy pipeline whose tokenizer uses the hyphen/apostrophe token pattern
        """
        if clss.__identifier_extractor_nlp is None:
            clss.__identifier_extractor_nlp = clss._load_with_custom_tokenizer(clss._HYPHEN_TOKEN_RE)
        return clss.__identifier_extractor_nlp

    @classmethod
    def create_simple_nlp_pipeline(clss):
        """
        Return the cached lightweight pipeline, building it on the first call.

        The model is loaded with the "ner" and "parser" components disabled, so
        it can still tokenize, tag and lemmatize.

        :return: spaCy pipeline whose tokenizer uses the hyphen/apostrophe token pattern
        """
        if clss.__simple_nlp is None:
            clss.__simple_nlp = clss._load_with_custom_tokenizer(clss._HYPHEN_TOKEN_RE,
                                                                 disable=["ner", "parser"])
        return clss.__simple_nlp

In [24]:
class CodeElementNameUtil:
    """
    Derives simple names, space-separated names and aliases from (qualified)
    code element names such as ``java.util.ArrayList`` or
    ``java.io.File.setExecutable(boolean)``.
    """
    # A 2 or 4 between two letters (e.g. "Path2File"); the letter after the
    # digit may not be D/d, so names like "Graphics2D" are left alone.
    PATTERN_2_4 = re.compile(r'([A-Za-z])([24])([A-CE-Za-ce-z])')
    # An upper-case run followed by a capitalised word ("XMLParser" -> "XML", "Parser").
    PATTERN_split = re.compile(r'([A-Z]+)([A-Z][a-z0-9]+)')
    # Every upper-case run, optionally preceded by one digit.
    PATTERN_split_num = re.compile(r'([0-9]?[A-Z]+)')
    # The patterns below are compiled once here instead of on every call.
    PATTERN_trailing_num = re.compile(r'([0-9]+)$')
    PATTERN_number_first = re.compile(r'(^[0-9]+[A-Z]+)', re.IGNORECASE)
    PATTERN_number_middle = re.compile(r'([A-Z]+[24][A-Z]+)', re.IGNORECASE)

    def get_simple_name_with_parent(self, name):
        """
        get the last two dot-separated parts of a name, eg.
        java.io.File.setExecutable(boolean)->File.setExecutable
        :param name: qualified name; everything from the first "(" on is ignored
        :return: "parent.child", the name itself when it has no dot, None for empty input
        """
        if not name:
            return None
        split_names = name.split("(")[0].split(".")
        if len(split_names) <= 1:
            return split_names[-1]

        child = split_names[-1].strip()
        parent = split_names[-2].strip()

        return parent + "." + child

    def simplify(self, name):
        """
        get the simple name for class, method, field, eg. java.util.ArrayList->ArrayList
        :param name: qualified name; everything from the first "(" on is ignored
        :return: the part after the last dot, stripped; None for empty input
        """
        if not name:
            return None
        return name.split("(")[0].split(".")[-1].strip()

    def uncamelize_from_simple_name(self, name):
        """
        uncamel the simple name of one name, eg. java.util.ArrayList->Array List
        :param name: qualified or simple name
        :return: space-separated name, None for empty input
        """
        if not name:
            return None
        return self.uncamelize(self.simplify(name))

    def uncamelize(self, name):
        """
        uncamel one name
        :param name: the camel style name (may include underscores)
        :return: the name with its parts separated by single spaces, None for empty input
        """
        if not name:
            return None
        sub = self.PATTERN_2_4.sub(r'\1 \2 \3', name).strip()
        sub = sub.replace('_', ' ')
        sub = self.PATTERN_split.sub(r'\1 \2', sub)
        sub = self.PATTERN_split_num.sub(r' \1', sub)
        # The steps above can leave doubled or leading spaces; normalise them.
        return re.sub(r'\s+', " ", sub).strip()

    def uncamelize_by_stemming(self, name):
        """
        uncamelize the name and remove the trailing number, eg. Student1->Student, JavaParser3->Java Parser
        :param name: camel style name
        :return: the uncamelized name, or the match of match_numer_first_and_middle
                 when there is one; None for empty input
        """
        # todo: improve this method to fix more situation, has some error for Path1->Path
        # Guard first: the regex substitution below raises TypeError on None.
        if not name:
            return None
        name = self.PATTERN_trailing_num.sub('', name)
        name = self.uncamelize(name)
        if not name:
            return None
        return self.match_numer_first_and_middle(name) or name

    def match_numer_first_and_middle(self, name):
        """
        find a number-bearing identifier in the name (case-insensitive)
        :param name: (uncamelized) name
        :return: the first leading "digits+letters" match; otherwise the first
                 "letters + 2/4 + letters" match in the name with spaces removed;
                 otherwise None
        """
        number_first = self.PATTERN_number_first.findall(name)
        if number_first:
            return number_first[0]
        number_middle = self.PATTERN_number_middle.findall(name.replace(" ", ""))
        if number_middle:
            return number_middle[0]
        return None

    def generate_aliases(self, qualified_name, include_simple_parent_name=False):
        """
        build the aliases of a code element: its simple name and its uncamelized simple name
        :param qualified_name: qualified name of the code element
        :param include_simple_parent_name: also include the "parent.child" form
        :return: list of distinct non-empty aliases (order not defined), [] for empty input
        """
        if not qualified_name:
            return []

        simple_name = self.simplify(qualified_name)
        name_list = [simple_name, self.uncamelize_from_simple_name(simple_name)]

        if include_simple_parent_name:
            name_list.append(self.get_simple_name_with_parent(qualified_name))

        return list({name for name in name_list if name})


class ConceptElementNameUtil(CodeElementNameUtil):
    """
    Alias generation for concept/term names.

    The name-splitting helpers and patterns are inherited unchanged from
    CodeElementNameUtil (they were verbatim copies before); this class adds
    "A of B" / "A's B" reordering and abbreviation generation.
    """
    # Words left out of the short-form abbreviation (full_link=False).
    ABBREVIATION_SKIP_WORDS = ("of", "the", "this", "a", "that")

    def generate_aliases(self, qualified_name, vocabulary=None, abbreviation=False):
        """
        build the aliases of a concept name
        :param qualified_name: the concept name
        :param vocabulary: phrases used for partial abbreviations (only read when abbreviation is True)
        :param abbreviation: also generate abbreviation aliases
        :return: list of distinct non-empty aliases (order not defined), [] for empty input
        """
        if not qualified_name:
            return []

        simple_name = self.simplify(qualified_name)
        separate_name = self.uncamelize_from_simple_name(simple_name)

        name_list = [
            simple_name,
            separate_name,
            self.uncamelize_by_stemming(simple_name),
            # lower-cased name with hyphens, backslashes and spaces removed
            simple_name.lower().replace("-", "").replace("\\", "").replace(" ", ""),
        ]
        name_list.extend(self.deal_with_adj(simple_name))

        if abbreviation:
            name_list.extend(self.generate_all_abbreviation_names(separate_name, vocabulary))

        return list({name for name in name_list if name})

    def generate_all_abbreviation_names(self, separate_name, vocabulary):
        """
        collect the full, short and partial abbreviations of a name
        :param separate_name: space-separated name
        :param vocabulary: phrases for partial abbreviations, or None to skip them
        :return: list of abbreviations; [] when separate_name is None
        """
        # separate_name is None when the simple name was empty (e.g. "foo.");
        # the helpers below would raise on it.
        if separate_name is None:
            return []
        result = [
            self.get_abbreviation(separate_name),
            self.get_abbreviation(separate_name, full_link=False),
        ]
        if vocabulary is not None:
            result.extend(self.get_part_abbreviation(separate_name, vocabulary))
        return result

    def deal_with_adj(self, name):
        """
        reorder possessive/"of" names: "A of B" -> "B A"; "A's B" -> "A B" and "B of A"
        :param name: the name to inspect
        :return: list of reordered variants; a separator is only handled when it
                 splits the name into exactly two parts
        """
        result = []

        # A of B type
        for separator in (" of ", " Of "):
            if separator not in name:
                continue
            words = name.split(separator)
            if len(words) != 2:
                continue
            child, parent = words
            result.append((parent + " " + child).replace("  ", " "))

        # A's B => A B
        for separator in ("'s ", "' "):
            if separator not in name:
                continue
            words = name.split(separator)
            if len(words) != 2:
                continue
            parent, child = words
            result.append((parent + " " + child).replace("  ", " "))
            result.append((child + " of " + parent).replace("  ", " "))

        return result

    def get_abbreviation(self, separate_name, full_link=True):
        """
        build the initials of a space-separated name, eg. Java Virtual Machine->JVM
        :param separate_name: space-separated name
        :param full_link: when False, words in ABBREVIATION_SKIP_WORDS are left out
        :return: upper-cased initials; the name itself when it has at most one word
        """
        if not separate_name:
            return separate_name
        # Empty tokens come from consecutive/leading/trailing spaces and have no
        # initial; indexing them raised IndexError.
        words = [word for word in separate_name.lower().split(" ") if word]
        if len(words) <= 1:
            return separate_name
        if not full_link:
            words = [word for word in words if word not in self.ABBREVIATION_SKIP_WORDS]
        return "".join(word[0].upper() for word in words)

    def get_part_abbreviation(self, separate_name, vocabulary):
        """
        :param separate_name: term name
        :param vocabulary: a list of phrase
        :return: a list of abbreviation, one per multi-word sub-phrase of the
                 name found in vocabulary, with that phrase replaced by its initials
        """
        if separate_name is None:
            return []
        separate_name_list = separate_name.split(" ")
        if len(separate_name_list) <= 1:
            return [separate_name]

        return_list = []
        for index in range(len(separate_name_list) - 1):
            for inner_index in range(index + 1, len(separate_name_list)):
                words = separate_name_list[index:inner_index + 1]
                phrase = " ".join(words)
                if phrase in vocabulary:
                    # skip empty tokens (from doubled spaces); they have no initial
                    initials = "".join(word[0].upper() for word in words if word)
                    return_list.append(separate_name.replace(phrase, initials))
        return return_list


In [40]:
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet as wn, stopwords
class EntityExtractor(object):
    """
    extract useful information from text, eg. HTML text, comment style text, normal text

    The public entry points (extract_from_sentence, extract_from_comment,
    extract_from_html) each return a pair ``(domain terms, code elements)``.
    """

    # Camel-case fallback of extract_code_element: a lower-case letter or digit
    # directly followed by an upper-case letter. Compiled once here instead of
    # inside the per-word, per-pattern loop.
    CAMEL_HUMP_PATTERN = re.compile(r'([a-z]|\d)([A-Z])')

    def __init__(self):
        self.nlp = SpacyNLPFactory.create_spacy_nlp_for_domain_extractor()
        # Finds "NP_x of NP_y" in the placeholder sentence built by extract_np_of_np.
        self.pattern = re.compile(r"NP_\w+ of NP_\w+")
        # NLTK English stopwords plus "-PRON-" (presumably spaCy's pronoun
        # lemma placeholder — TODO confirm against the spaCy version in use).
        self.stopwords = set(stopwords.words('english')) | {"-PRON-"}
        self.lemmatizer = WordNetLemmatizer()

        # Each pattern captures the code element in the named group ELE:
        #   0. a camel-case identifier, optionally followed by <...>
        #   1. a (dotted) name followed by "(" and argument-like characters
        #   2. a dotted name of at least two parts, optionally followed by <...>
        self.code_patterns = [
            re.compile(r'^(?P<ELE>[a-zA-Z0-9_]*[a-z0-9][A-Z][a-z]+[a-zA-Z0-9_]*)(<.*>)?$'),
            re.compile(r'^(?P<ELE>[a-zA-Z0-9_\.<>]+)\([a-zA-Z0-9_\,.<>)]*?$'),
            re.compile(r'^(?P<ELE>[a-zA-Z]{2,}(\.[a-zA-Z0-9_]+)+)(<.*>)?$'),
        ]

        self.camel_cache = {}
        self.CODE_NAME_UTIL = CodeElementNameUtil()

    def uncamelize(self, camel_case):
        """
        uncamelize a name through CodeElementNameUtil.uncamelize_by_stemming, memoising the result.
        :param camel_case: the camel style name
        :return: the uncamelized name
        """
        if camel_case not in self.camel_cache:
            self.camel_cache[camel_case] = self.CODE_NAME_UTIL.uncamelize_by_stemming(camel_case)
        return self.camel_cache[camel_case]

    def extract_from_sentence(self, sent):
        """
        extract concepts and code elements from one sentence.
        :param sent: the sentence text
        :return: (domain_terms, code_elements), both sets of strings
        """
        code_elements = self.extract_code_element(sent)

        domain_terms = set()
        doc = self.nlp(sent)
        for chunk in doc.noun_chunks:
            chunk = self.clean_chunk(chunk)
            if len(chunk) == 0:
                continue
            # a single common word is not a domain term
            if len(chunk) == 1 and self.is_word_common(chunk.text):
                continue
            # already reported as a code element
            if chunk.text in code_elements:
                continue
            domain_terms.update(self.extract_abbreviation_from_chunk(chunk))
            domain_terms.update(self.extract_NNPs_from_chunk(chunk))
        domain_terms.update(self.extract_np_of_np(doc))
        domain_terms = self.__post_process(domain_terms)
        return domain_terms, code_elements

    def extract_code_element(self, sent):
        """
        extract code-like words from a sentence.

        Every whitespace-separated word is stripped of leading "#(" and trailing
        ",;.!?" characters and tried against all code patterns; each match adds
        its ELE group. When the last pattern does not match, a word that
        contains a camel hump is added as a whole.
        :param sent: the sentence text
        :return: a set of code element strings
        """
        elements = set()
        for word in sent.split():
            word = word.lstrip("#(").rstrip(",;.!?")
            last_pattern_matched = False
            for pattern in self.code_patterns:
                search_rs = pattern.search(word)
                last_pattern_matched = search_rs is not None and bool(search_rs.group("ELE"))
                if last_pattern_matched:
                    elements.add(search_rs.group("ELE"))
            # The fallback is tied to the outcome of the LAST pattern only, so a
            # word matched by an earlier pattern can still be added whole.
            if self.code_patterns and not last_pattern_matched \
                    and self.CAMEL_HUMP_PATTERN.search(word) is not None:
                elements.add(word)
        return elements

    def extract_np_of_np(self, doc):
        """
        extract "<noun chunk> of <noun chunk>" phrases from a parsed sentence.

        The lemmatized sentence text is rewritten so that every cleaned noun
        chunk becomes one NP_<lemma>_<lemma> placeholder, then self.pattern is
        applied and the placeholders are turned back into words.
        :param doc: the spaCy doc of the sentence
        :return: a set of phrases
        """
        result = set([])
        sentence_text = doc[:].lemma_
        for chunk in doc.noun_chunks:
            chunk = self.clean_chunk(chunk)
            if len(chunk) == 0:
                continue
            chunk_arr = [token.lemma_ for token in chunk]
            sentence_text = sentence_text.replace(" ".join(chunk_arr), "NP_" + "_".join(chunk_arr))
        for m in self.pattern.findall(sentence_text):
            result.add(m.replace("NP_", "").replace("_", " "))
        return result

    def _is_leading_noise(self, token):
        """Whether a token is a stopword, all digits or tagged PRP (personal pronoun)."""
        return token.text.lower() in self.stopwords or token.text.isdigit() or token.tag_ == 'PRP'

    def clean_chunk(self, chunk):
        """
        remove the stopwords, digits and pronouns at the start of the chunk.
        drop the chunk entirely when what is left contains an invalid symbol.
        :param chunk: a noun chunk (spaCy span)
        :return: the trimmed chunk, or [] when nothing usable is left
        """
        if chunk.text.lower() in self.stopwords:
            return []
        while len(chunk) > 1 and self._is_leading_noise(chunk[0]):
            chunk = chunk[1:]
        if len(chunk) == 1 and self._is_leading_noise(chunk[0]):
            return []
        # only letters, digits, apostrophes, spaces and hyphens; must start and
        # end with a letter or digit
        if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9\' -]*[a-zA-Z0-9]$', chunk.text):
            return []
        return chunk

    def is_word_common(self, word):
        """
        check if the word is common word: a stopword, or an alphabetic word
        whose noun lemma has at least one WordNet synset.
        :param word: the word to check
        :return: True when the word is common
        """
        if word in self.stopwords:
            return True
        if re.match(r'[a-zA-Z]+[a-zA-Z]$', word):
            lemma = self.lemmatizer.lemmatize(word, pos='n')
            return len(wn.synsets(lemma)) > 0
        return False

    def extract_abbreviation_from_chunk(self, chunk):
        """
        collect tokens that look like abbreviations: two or more upper-case
        letters optionally followed by digits.
        :param chunk: iterable of tokens
        :return: a set of token texts
        """
        return {token.text for token in chunk if re.match(r'[A-Z]{2,}[0-9]*$', token.text)}

    def extract_NNPs_from_chunk(self, chunk):
        """
        collect runs of two or more consecutive tokens tagged NNP*.
        :param chunk: a noun chunk (spaCy span)
        :return: a set of strings, one per run
        """
        result = set([])
        p = 0
        while p < (len(chunk) - 1):
            if chunk[p].tag_.startswith('NNP'):
                # scan forward to the end of the NNP run that starts at p
                for i in range(p + 1, len(chunk)):
                    if not chunk[i].tag_.startswith('NNP'):
                        # run ended before i
                        t_w = chunk[p:i]
                        p = i
                        if len(t_w) > 1:
                            result.add(self.__chunk_lemmatize(t_w))
                        break
                    elif i == len(chunk) - 1:
                        # run extends to the end of the chunk
                        t_w = chunk[p:]
                        p = i
                        if len(t_w) > 1:
                            result.add(self.__chunk_lemmatize(t_w))
                        break
            else:
                p = p + 1
        return result

    def __chunk_lemmatize(self, chunk):
        """
        lemmatize the text of the chunk as a noun.

        NOTE(review): the whole chunk text is handed to the lemmatizer as one
        string, not only its last word — confirm which of the two is intended.
        :param chunk: a span of tokens
        :return: the lemmatizer's result for chunk.text
        """
        return self.lemmatizer.lemmatize(chunk.text, pos='n')

    def __post_process(self, result):
        """drop single-character and all-digit items from the result set."""
        return {item for item in result if not (len(item) == 1 or item.isdigit())}

    def extract_from_comment(self, comment):
        """
        extract domain_terms, code_elements from comment text
        :param comment: the comment text; leading/trailing "/" and "*" characters are removed
        :return: (domain_terms, code_elements); two empty sets for an empty comment
        """
        comment = re.sub(r'\s+', ' ', comment.strip().strip("/*").strip())
        if len(comment) == 0:
            return set(), set()
        return self.extract_from_sentence(comment)

    def extract_from_html(self, html):
        """
        extract terms and code elements from an HTML fragment.

        The text of every <tt> element with at most three words is taken as a
        term; the plain text of the fragment is then processed like a sentence.
        :param html: the HTML text
        :return: (terms, code_elements)
        """
        soup = BeautifulSoup(html, "lxml")
        # find_all is the current Beautiful Soup name for the deprecated findAll
        terms = {tt.get_text() for tt in soup.find_all("tt")}
        terms = {tt for tt in terms if len(tt.split()) <= 3}
        sent = re.sub(r'\s+', ' ', soup.get_text().strip().strip("/*").strip())
        # same empty-text guard as extract_from_comment
        if len(sent) == 0:
            return terms, set()
        domain_terms, code_elements = self.extract_from_sentence(sent)
        terms.update(domain_terms)
        return terms, code_elements


In [41]:
# Build the extractor once; its constructor obtains the domain-extractor spaCy
# pipeline from SpacyNLPFactory and loads the NLTK stopword list.
extractor = EntityExtractor()

In [42]:
# Run the extractor sentence by sentence and merge the per-sentence results.
terms, codes = set(), set()
for sentence in sentences:
    domain_terms, code_elements = extractor.extract_from_sentence(sentence)
    terms |= domain_terms
    codes |= code_elements
terms, codes

(set(), {'file.setExcutable', 'file.setExcutable(arg)'})

In [29]:
# English translation of the note kept in the string below:
#   Code elements can be extracted and used to inject new relations into the graph...
#   This can be added to later work; see the attempt above.
#   With this the preliminary domain-term extraction is done; how to fuse the
#   terms and link them to wiki will be decided later...
'''
code element可以被抽取出来，给图谱注入新的关系...
这个可以加在之后的工作内容里，见上方尝试
至此初步的domain term抽取工作算是完成了，后面怎么fuse跟链接wiki之后再说...
'''

'\ncode element可以被抽取出来，给图谱注入新的关系...\n这个可以加在之后的工作内容里，见上方尝试\n'