In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter
from tabulate import tabulate

lemmatizer = WordNetLemmatizer()

In [1]:
def read_data_to_dict():
    return pd.read_excel('../resources/definizioni.xlsx', index_col=0).T.to_dict('list')

In [2]:
def words_frequency(dictionary: dict, n_common):
    frequencies = dict()
    for concept, definitions in dictionary.items():
        words = list()
        for definition in definitions[:len(definitions) - 1]:
            if definition == definition:
                words.extend(
                    [word for word in word_tokenize(definition.lower()) if
                     not word in string.punctuation and not word in stopwords.words('english')])
            else:
                definitions.remove(definition)
        frequencies[concept] = [t[0] for t in Counter(words).most_common(n_common)]
    return frequencies

In [3]:
def get_definitions_overlap_score(dictionary, common_words):
    occurrences_score = dict()
    for (concept, definitions), words in zip(dictionary.items(), common_words.values()):
        occurrences = list()
        for word in words:
            counter = 0
            for definition in definitions:
                if word in word_tokenize(definition.lower()): counter += 1
            occurrences.append(counter)
        occurrences_score[concept] = round(sum([x / len(definitions) for x in occurrences]) / len(occurrences), 3)
    return occurrences_score

In [4]:
def get_concept_overlap_score(score):
    overlap_dict = dict()
    overlap_dict['concrete-concrete'] = (score['Brick'] + score['Person']) / 2
    overlap_dict['abstract-abstract'] = (score['Emotion'] + score['Revenge']) / 2
    overlap_dict['generic-generic'] = (score['Emotion'] + score['Person']) / 2
    overlap_dict['specific-specific'] = (score['Brick'] + score['Revenge']) / 2
    return overlap_dict

In [6]:
df_dict = read_data_to_dict()
df_dict_freq = words_frequency(df_dict, 5)
def_overlap_score = get_definitions_overlap_score(df_dict, df_dict_freq)
concept_overlap_score = get_concept_overlap_score(def_overlap_score)

NameError: name 'pd' is not defined

In [7]:
def print_results(df_dict_freq, def_overlap_score, concept_overlap_score):
    to_print = list()
    for key, value in df_dict_freq.items():
        table = [key]
        table.extend(value)
        to_print.append(table)
    print(f'-------- The five most common words in each definition by concept --------')
    print()
    print(tabulate(to_print, headers=['Concept', 1, 2, 3, 4, 5], tablefmt='orgtbl'))
    print()
    print()

    print(f'-------- Definitions overlap score for each concept --------')
    print()
    to_print = [[key, val] for key, val in def_overlap_score.items()]
    print(tabulate(to_print, headers=['Concept', 'Score'], tablefmt='orgtbl'))
    print()
    print()

    print(f'-------- Concept overlap score --------')
    print()
    to_print = [[key, val] for key, val in concept_overlap_score.items()]
    print(tabulate(to_print, headers=['Concept', 'Score'], tablefmt='orgtbl'))
    print()
    print()
print_results(df_dict_freq, def_overlap_score, concept_overlap_score)

-------- The five most common words in each definition by concept --------

| Concept   | 1       | 2      | 3        | 4         | 5            |
|-----------+---------+--------+----------+-----------+--------------|
| Emotion   | feeling | human  | feel     | something | state        |
| Person    | human   | person | living   | entity    | individual   |
| Revenge   | someone | anger  | feeling  | action    | reaction     |
| Brick     | used    | object | material | build     | construction |


-------- Definitions overlap score for each concept --------

| Concept   |   Score |
|-----------+---------|
| Emotion   |   0.253 |
| Person    |   0.252 |
| Revenge   |   0.267 |
| Brick     |   0.523 |


-------- Concept overlap score --------

| Concept           |   Score |
|-------------------+---------|
| concrete-concrete |  0.3875 |
| abstract-abstract |  0.26   |
| generic-generic   |  0.2525 |
| specific-specific |  0.395  |


