In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Remove stopwords from a list of words
def remove_stopwords(words_list):
    
    stopwords = open("stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords:
        stopwords_list.append(word.replace('\n', ''))
    stopwords.close()
    
    new_words_list = []
    for word in words_list:
        word_lower = word.lower()
        if word_lower not in stopwords_list:
            new_words_list.append(word_lower)
    return new_words_list

In [3]:
def get_signature(sense):
    signature = []
    for word in tokenize_sentence(sense.definition()):  # definition tokenization
        signature.append(word)
    for example in sense.examples():  # example tokenization
        for word in tokenize_sentence(example):
            # Merge definition and examples
            signature.append(word)
    return signature  

In [4]:
# Tokenizza la frase in input e ne affettua anche la lemmatizzazione della sue parole
def tokenize_sentence(sentence):
    words_list = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(str(sentence))):
        if tag[1][:2] == "NN":
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.NOUN))
        elif tag[1][:2] == "VB":
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.VERB))
        elif tag[1][:2] == "RB":
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.ADV))
        elif tag[1][:2] == "JJ":
            words_list.append(lmtzr.lemmatize(tag[0], pos=wn.ADJ))
    return words_list

In [5]:
from collections import Counter
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

In [6]:
# dato un soggetto cerca l'oggetto della frase rispetto al verbo considerato
def search_obj(sentence, head_verb, pattern):
    for token in sentence:
        if token.head.text == head_verb and token.dep_ == 'dobj' and len(
                pattern) < 2 and token.text != '\n  ' and token.text != '\n':
            dependency2 = token.text, token.tag_, token.head.text, token.dep_
            return dependency2

In [7]:
# dato un oggetto cerca il soggetto della frase rispetto al verbo considerato
def search_subj(sentence, head_verb, pattern):
    for token in sentence:
        if token.head.text == head_verb and token.dep_ == 'nsubj' and len(
                pattern) < 2 and token.text != '\n  ' and token.text != '\n':
            dependency1 = token.text, token.tag_, token.head.text, token.dep_
            return dependency1

In [8]:
# prende in input un pattern e restituisce un nuovo pattern
# composto come segue: word_super_sense1 , word_super_sense2
# in pratica disambigua i termini in base alla sentence a cui fanno riferimento
def disambiguate_terms(pattern):
    dep1 = pattern[0]
    dep2 = pattern[1]

    sentence = pattern[2]

    w1 = dep1[0]
    w2 = dep2[0]

    # WSD with nltk Lesk
    return lesk(tokenize_sentence(sentence), w1, 'n'), lesk(tokenize_sentence(sentence), w2, 'n')

In [9]:
# Calculates semantic clusters
def compute_semantic_clusters(patterns):
    new_patterns = []
    for pattern in patterns:
        
        best_sense1, best_sense2 = disambiguate_terms(pattern)
        
        if best_sense1 and best_sense2:
            # supersense1, supersense2
            new_patterns.append((best_sense1._lexname, best_sense2._lexname))

    # Create a dict of Counter. Res like ((supersense1, supersensse2), frequency)
    patterns_Counter = dict(Counter(new_patterns))
    semantic_clusters = []
    
    for key in patterns_Counter.keys():
        result = patterns_Counter[key]
        percentage = result / len(new_patterns)
        # Cluster is a grouping of (subject, object) tuples with frequency associated
        cluster = key, format(percentage * 100, '.2f') + '%'
        semantic_clusters.append(cluster)

    # sort clusters by frequency
    semantic_clusters = sorted(semantic_clusters, key=lambda x: x[1], reverse=True)
    return semantic_clusters

In [10]:
import spacy
from xml.dom import minidom

In [11]:

VERB = "buy"
CORPUS = "buy_corpus.xml"
all_verb_forms = ['buy', 'buys']

"""
VERB = "eat"
CORPUS = "eat_corpus.xml"
all_verb_forms = ['eat', 'eats']
"""
# Valenza
VERB_ARGUMENTS = 2 

In [12]:

nlp = spacy.load('en_core_web_sm')

# Use minidom to parse XML file
mydoc = minidom.parse(CORPUS)

# Obtain all sentences 
sentences = mydoc.getElementsByTagName('line')

print("Verb: ", VERB)
print("Verb forms: ", all_verb_forms)

patterns = []
for sentence in sentences:
    # nlp do dependecies parsing and PoS
    sentence_parsed = nlp(sentence.firstChild.data.replace("<s>", "").replace("</s>", "").replace("     ", ""))
    pattern = []

    for token in sentence_parsed:
        # if token is a subject we can find subjects
        if token.head.text in all_verb_forms and token.dep_ == 'nsubj' and len(
                pattern) < VERB_ARGUMENTS and token.text != '\n  ' and token.text != '\n':
            dependency1 = token.text, token.tag_, token.head.text, token.dep_
            pattern.append(dependency1)
            dependency2 = search_obj(sentence_parsed, token.head.text, pattern)
            if dependency2:
                pattern.append(dependency2)
    if len(pattern) == VERB_ARGUMENTS:  # append iff we have 2 arguments
        pattern.append(sentence_parsed)
        patterns.append(pattern)

Verb:  buy
Verb forms:  ['buy', 'buys']


In [13]:
print("Numero di pattern trovati: ", len(patterns))
# Obtain semantic clusters with frequency 
semantic_clusters = compute_semantic_clusters(patterns)

# Check if probabilities are consistent
print(sum(float(semantic_clusters[i][1][0:-1]) for i in range(0, len(semantic_clusters))))

# Print clusters
for cluster in semantic_clusters:
    print(cluster)

Numero di pattern trovati:  2709
99.8599999999997
(('noun.quantity', 'noun.cognition'), '7.12%')
(('noun.group', 'noun.artifact'), '4.71%')
(('noun.quantity', 'noun.communication'), '3.71%')
(('noun.person', 'noun.artifact'), '3.71%')
(('noun.act', 'noun.substance'), '2.81%')
(('noun.group', 'noun.communication'), '2.31%')
(('noun.substance', 'noun.artifact'), '2.31%')
(('noun.quantity', 'noun.person'), '2.11%')
(('noun.quantity', 'noun.act'), '2.01%')
(('noun.quantity', 'noun.artifact'), '11.43%')
(('noun.quantity', 'noun.attribute'), '1.81%')
(('noun.person', 'noun.cognition'), '1.81%')
(('noun.quantity', 'noun.food'), '1.71%')
(('noun.person', 'noun.communication'), '1.40%')
(('noun.group', 'noun.cognition'), '1.40%')
(('noun.person', 'noun.attribute'), '1.30%')
(('noun.quantity', 'noun.group'), '1.20%')
(('noun.group', 'noun.possession'), '1.00%')
(('noun.person', 'noun.phenomenon'), '1.00%')
(('noun.quantity', 'noun.phenomenon'), '1.00%')
(('noun.communication', 'noun.attribute'),