In [2]:
! rm -R ./mcc_nlp_task_nap
! rm -R ./corpus
! git clone https://github.com/dayan3847/mcc_nlp_task_nap
! cp -R ./mcc_nlp_task_nap/corpus/ ./

# ! python -m spacy download es_core_news_sm

rm: cannot remove './corpus': No such file or directory
Cloning into 'mcc_nlp_task_nap'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 5), reused 89 (delta 3), pack-reused 0[K
Unpacking objects: 100% (91/91), 837.13 KiB | 6.64 MiB/s, done.


In [11]:
from networkx import Graph
from typing import List
from matplotlib import pyplot as plt
import os
import nltk
import xlrd
import networkx as nx

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
class Graphs:

    def __init__(self):
        self.graph_frequency = Graph()
        self.graph_time = Graph()
        self.graph_association = Graph()


In [6]:
class Definition:

    def __init__(self):
        self.word_input: str = ''
        self.word_outputs: List[List[str]] = []


In [10]:

class DataSet:

    def __init__(self):
        self.graphs: Graphs | None = None
        self.definitions: List[Definition] = []
        self.path_file_nap: str = './corpus/NAP.xls'
        self.path_folder_definitions: str = './corpus/freeling_definitions/'
        self.ignore_words: List[str] = ['--PALABRAS--', '', '=', '*']

    def import_graphs(self):
        if self.graphs is not None:
            return self.graphs

        self.graphs = Graphs()
        workbook = xlrd.open_workbook(self.path_file_nap)
        sheet = workbook.sheet_by_index(0)
        count_rows = sheet.nrows
        word_input: str = ''
        for row in range(count_rows):
            cell = sheet.cell(row, 0)
            cell_value: str = str(cell.value).strip()
            if '======' == cell_value:
                word_input = ''
                continue
            elif cell_value in self.ignore_words:
                continue
            elif '' == word_input:
                word_input = cell_value
            else:
                # frequency
                frequency: float = float(sheet.cell(row, 1).value)
                frequency_weight: float = 1 / frequency
                self.graphs.graph_frequency.add_edge(word_input, cell_value, weight=frequency_weight)
                # time
                time: float = float(sheet.cell(row, 2).value)
                self.graphs.graph_time.add_edge(word_input, cell_value, weight=time)
                # association
                association: float = float(sheet.cell(row, 3).value)
                association_weight: float = 100 - association
                self.graphs.graph_association.add_edge(word_input, cell_value, weight=association_weight)
        return self.graphs

    def clean_lematize(self, sentence: str):
        sentence = sentence.strip()
        result: str = ''
        stopwords = nltk.corpus.stopwords.words('spanish')
        words = sentence.split()
        for word in words:
            if word in stopwords:
                continue
            # doc = nlp(word)
            # result += doc[0].lemma_+ " "
            result += word + ' '
        return result

    def import_definitions(self) -> List[Definition]:
        if len(self.definitions) > 0:
            return self.definitions
        directory: str = self.path_folder_definitions
        self.definitions: List[Definition] = []
        for file_name in os.listdir(directory):
            if not file_name.endswith('.txt'):
                continue

            file_data = open(directory + file_name, encoding="utf8")
            lines = file_data.readlines()
            definition = Definition()
            definition.word_input = str(lines[0]).lower().strip()
            for line in lines[1:]:
                line = line.strip()
                if '' == line:
                    continue
                line = self.clean_lematize(line)
                definition.word_outputs.append(line.split())
            self.definitions.append(definition)

        return self.definitions


In [12]:


def draw(graph: nx.Graph):
    nx.draw(
        graph,
        with_labels=True,
        font_weight='bold',
        node_size=1000,
        node_color='green',
    )
    plt.show()


In [14]:

def reduce_graph(graph: nx.Graph, subset: list) -> nx.Graph:
    sub_graph = nx.Graph()
    for node in subset:
        if node not in graph.nodes():
            continue
        for neighbor in graph.neighbors(node):
            weight = graph[node][neighbor]['weight']
            sub_graph.add_edge(node, neighbor, weight=weight)
    return sub_graph


In [15]:

def bt_centrality(graph: nx.Graph, subset: List[str]) -> dict:
    sub_graph = reduce_graph(graph, subset)
    result = nx.betweenness_centrality(sub_graph, normalized=True, weight="weight")
    for w in subset:
        if w in result:
            result.pop(w)
    result = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))
    return result


In [17]:


def build_definitions_graph(graphs: Graphs, definitions: List[Definition]):
    for definition in definitions:
        print('\033[32m' + f'Input: {definition.word_input}' + '\033[0m')
        for word_output in definition.word_outputs:
            print('\033[33m' + f'Output: {word_output}' + '\033[0m')
            sub_graph = reduce_graph(graphs.graph_frequency, word_output)
            print('Frequency ' + str(sub_graph))
            btc = bt_centrality(sub_graph, word_output)
            print('\033[35m' + str(btc) + '\033[0m')
            draw(sub_graph)
            sub_graph = reduce_graph(graphs.graph_time, word_output)
            print('Time ' + str(sub_graph))
            btc = bt_centrality(sub_graph, word_output)
            print('\033[35m' + str(btc) + '\033[0m')
            draw(sub_graph)
            sub_graph = reduce_graph(graphs.graph_association, word_output)
            print('Association ' + str(sub_graph))
            btc = bt_centrality(sub_graph, word_output)
            print('\033[35m' + str(btc) + '\033[0m')
            draw(sub_graph)


In [20]:

if __name__ == '__main__':
    data_set = DataSet()
    # Import Graphs
    the_graphs = data_set.import_graphs()
    print('Frequency ' + str(the_graphs.graph_frequency))
    print('Time ' + str(the_graphs.graph_time))
    print('Association ' + str(the_graphs.graph_association))
    # Import Definitions
    the_definitions = data_set.import_definitions()
    print('Definitions ' + str(the_definitions))
    # Build Graphs
    build_definitions_graph(the_graphs, the_definitions)


Output hidden; open in https://colab.research.google.com to view.