<a href="https://colab.research.google.com/github/faguilarc/RNN_v1/blob/master/Training_en_google_collab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
from pandas import DataFrame


class AddressParser:
    """Classify raw address strings with a trained model and export the result.

    The parser only orchestrates: ``model.predict`` produces a probability
    matrix and one of the ``decoder.decoder_to_*_address_model`` methods turns
    that matrix plus the original strings into classified address objects.
    """

    # The annotations are quoted (forward references) so that defining this
    # class does not require NeuralParser / Decoder to be defined beforehand.
    def __init__(self, model: 'NeuralParser', decoder: 'Decoder'):
        """Store the predictor and the decoder used by every ``process_*`` call."""
        self.model = model
        self.decoder = decoder

    def process_address(self, address_list: list):
        """Predict ``address_list`` and decode it with the first address model."""
        probability_matrix = self.model.predict(address_list)

        return self.decoder.decoder_to_first_address_model(probability_matrix, address_list)

    def process_address_data_frame(self, address: DataFrame):
        """Predict and decode the addresses stored in the first column of ``address``.

        Every cell is converted with ``str``; cells whose text is empty are
        skipped. The remaining strings are handled like ``process_address``.
        """
        # Read the first column positionally. Mixing index labels with
        # ``iloc`` would hit the wrong rows (or raise IndexError) as soon as
        # the frame does not carry the default 0..n-1 index.
        first_column = address.iloc[:, 0] if len(address.index) else ()
        address_list = [text for text in map(str, first_column) if text]
        probability_matrix = self.model.predict(address_list)

        return self.decoder.decoder_to_first_address_model(probability_matrix, address_list)

    @staticmethod
    def to_xlsx(list_address: list, name_file='Results'):
        """Export classified addresses to ``<name_file>.xlsx``.

        Raises:
            NotImplementedError: if ``list_address`` is empty.

        Lists whose first element is not a ``ClassifiedAddressOne`` are
        currently ignored (nothing is written).
        """
        if len(list_address) == 0:
            raise NotImplementedError('The list should have at least one address')
        elif isinstance(list_address[0], ClassifiedAddressOne):
            AddressParser.__export_one(list_address, name_file=name_file, format='xlsx')
        # TODO(FRANK): add the branch for ClassifiedAddresTwoAndThree instances

    @staticmethod
    def to_csv(list_address: list, name_file='Results'):
        """Export classified addresses to ``<name_file>.csv``.

        Raises:
            NotImplementedError: if ``list_address`` is empty.

        Lists whose first element is not a ``ClassifiedAddressOne`` are
        currently ignored (nothing is written).
        """
        if len(list_address) == 0:
            raise NotImplementedError('The list should have at least one address')
        elif isinstance(list_address[0], ClassifiedAddressOne):
            AddressParser.__export_one(list_address, name_file=name_file, format='csv')
        # TODO(FRANK): add the branch for ClassifiedAddresTwoAndThree instances

    @staticmethod
    def __export_one(list_address: list, name_file='Results', format='xlsx'):
        """Write first-model addresses to disk, one row per address.

        Each address attribute is a sequence of tokens that is joined with a
        single space to fill its column.

        Args:
            list_address: objects exposing ``principal_street``,
                ``first_side_street``, ``second_side_street``, ``building``,
                ``apartment``, ``locality``, ``municipality``, ``province``
                and ``reserve_word``.
            name_file: output path without extension.
            format: ``'xlsx'`` or ``'csv'``.

        Raises:
            NotImplementedError: for any other ``format``.
        """
        # (column header, attribute name) in the order the columns are written.
        columns = (
            ('Principal Street', 'principal_street'),
            ('First Side Street', 'first_side_street'),
            ('Second Side Street', 'second_side_street'),
            ('Building', 'building'),
            ('Apartment', 'apartment'),
            ('Locality', 'locality'),
            ('Municipality', 'municipality'),
            ('Province', 'province'),
            ('Reserved Word', 'reserve_word'),
        )
        df = DataFrame({
            header: [' '.join(getattr(address, attribute)) for address in list_address]
            for header, attribute in columns
        })

        if format == 'xlsx':
            # The context manager saves and closes the workbook;
            # ``ExcelWriter.save()`` no longer exists in pandas >= 2.0.
            with pd.ExcelWriter(name_file + '.xlsx', engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name=name_file)
        elif format == 'csv':
            # Same conventions as the xlsx branch: add the extension (unless
            # the caller already supplied it) and leave the index out.
            csv_path = name_file if name_file.lower().endswith('.csv') else name_file + '.csv'
            df.to_csv(csv_path, index=False)
        else:
            raise NotImplementedError('This export format is not implemented')

    # def process_address_three(self, address_list: list):
    #     probability_matrix = self.model.predict(address_list)
    #
    #     return self.decoder.decoder_to_third_address_model(probability_matrix, address_list)

    def process_address_two(self, address_list: list):
        """Predict ``address_list`` and decode it with the second address model."""
        probability_matrix = self.model.predict(address_list)

        return self.decoder.decoder_to_second_address_model(probability_matrix, address_list)







In [7]:
import re
import tensorflow as tf


class AddressCleaner:
    """Factory for the text-standardization callables applied to addresses."""

    @staticmethod
    def cleaner_method(method='custom_standardization'):
        """Return the cleaning callable registered under ``method``.

        Args:
            method: ``'custom_standardization'`` or
                ``'custom_standardization_v2'``.

        Raises:
            NotImplementedError: for any other name.
        """
        if method == 'custom_standardization':
            return AddressCleaner.__custom_standardization
        elif method == 'custom_standardization_v2':
            return AddressCleaner.__custom_standardization_v2
        else:
            raise NotImplementedError('There is no such cleaning method')

    @staticmethod
    @tf.keras.utils.register_keras_serializable()
    def __custom_standardization(input_string):
        """Normalize address text using TensorFlow string ops.

        Steps, in order: lower-case the input; replace accented vowels and
        ``ñ`` with plain letters; surround ``,``/``;`` (both become `` , ``),
        ``#``, ``%`` and ``/`` with spaces; rewrite ``&`` as `` y ``; insert a
        space after ``entre``, ``apt`` and ``apartamento``; finally drop the
        characters rejected by the two closing regular expressions.
        """
        # (pattern, rewrite) pairs applied one after another. The order is
        # part of the behaviour and must not be changed.
        replacements = (
            ('á', 'a'), ('ä', 'a'), ('Á', 'a'), ('Ä', 'a'),
            ('é', 'e'), ('ë', 'e'), ('É', 'e'), ('Ë', 'e'),
            ('í', 'i'), ('ï', 'i'), ('Í', 'i'), ('Ï', 'i'),
            ('ó', 'o'), ('ö', 'o'), ('Ó', 'o'), ('Ö', 'o'),
            ('ú', 'u'), ('ü', 'u'), ('Ú', 'u'), ('Ü', 'u'),
            (',', ' , '),
            (';', ' , '),
            ('ñ', 'n'),
            ('entre', 'entre '),
            ('#', ' # '),
            ('%', ' % '),
            ('&', ' y '),
            ('/', ' / '),
            # NOTE(review): the '.' below is a regex wildcard, so this matches
            # 'apt' followed by ANY character and replaces that character.
            # Left unchanged so the preprocessing stays identical; confirm
            # whether r'apt\.' was intended.
            ('apt.', 'apt. '),
            ('apt', 'apt '),
            ('apartamento', 'apartamento '),
        )

        stripped_spanish = tf.strings.lower(input_string)
        for pattern, rewrite in replacements:
            stripped_spanish = tf.strings.regex_replace(stripped_spanish, pattern, rewrite)

        # NOTE(review): ' -/' inside the class is a character RANGE (space up
        # to '/'), not three literal characters. Left unchanged; confirm intent.
        stripped_spanish = tf.strings.regex_replace(stripped_spanish, '[^a-zA-Z0-9 -/]', '')

        output = tf.strings.regex_replace(
            stripped_spanish, '[%s]' % re.escape(r"""!"$&'()*+-.;<=>?@[]^_`{|}~"""), '')

        return output

    @staticmethod
    @tf.keras.utils.register_keras_serializable()
    def __custom_standardization_v2(input_string):
        """Normalize address text using plain Python string operations.

        Uses ``str.lower`` and ``re.sub``, so ``input_string`` is presumably a
        Python ``str`` rather than a tensor - TODO confirm against callers.
        """
        # Lower-case the whole string.
        lower_str = input_string.lower()

        # Remove the fractions '½' and '1/2' from the text.
        spec_text = re.sub(r'½|1/2', ' ', lower_str)

        # Replace special characters and accented vowels with spaces.
        # Raw string: the previous literal relied on the invalid escape '\.'.
        char_spvow_off_str = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', spec_text)

        # Collapse every run of non-alphanumeric characters into one space.
        clear_str = re.sub(r'[^0-9a-zA-Z]+', ' ', char_spvow_off_str)

        return clear_str


In [26]:
import pandas as pd
from keras.utils import pad_sequences, to_categorical
from pandas import DataFrame
from sklearn.model_selection import train_test_split




class DataSetAdapter:
    """Turn a word-per-row tagged frame into train/validation/test splits."""

    @staticmethod
    def adapt(data_set: DataFrame, training_percentage: float, validation_percentage: float,
              testing_percentage: float) -> 'DataSet':
        """Split ``data_set`` by sentence and wrap everything in a ``DataSet``.

        ``data_set`` must provide the columns ``'Sentence #'``, ``'Word'`` and
        ``'Tag'``. Missing values are forward-filled before rows are grouped
        by ``'Sentence #'``.

        Side effect: ``data_set`` itself is modified - ``'Tag'`` is converted
        to a categorical column and a ``'Tag_id'`` column is added.

        Args:
            data_set: frame with one row per word.
            training_percentage: fraction of sentences used for training.
            validation_percentage: fraction of sentences used for validation.
            testing_percentage: fraction of sentences used for testing.

        Raises:
            NotImplementedError: if the three fractions add up to more than 1.
        """
        # The return annotation is quoted (forward reference) so this class
        # can be defined before DataSet is.
        if training_percentage + testing_percentage + validation_percentage > 1:
            raise NotImplementedError('The sum of the percentages must correspond to 1')

        data_set['Tag'] = data_set['Tag'].astype('category')
        data_set['Tag_id'] = data_set['Tag'].cat.codes
        id_to_category = pd.Series(data_set.Tag.values, index=data_set.Tag_id).to_dict()
        n_tag = len(id_to_category)
        input_dim = len(set(data_set['Word'].to_list())) + 1
        # ``fillna(method='ffill')`` is deprecated in recent pandas releases;
        # ``ffill`` is the direct replacement.
        data_fillna = data_set.ffill(axis=0)
        # Group by sentence and collect each column into a list per sentence.
        group_addres = data_fillna.groupby(
            ['Sentence #'], as_index=False
        )[['Word', 'Tag', 'Tag_id']].agg(lambda x: list(x))

        features, targets = group_addres['Word'].tolist(), group_addres['Tag_id'].tolist()

        # First split: hold the test sentences out.
        train_features, test_features, train_targets, test_targets = train_test_split(
            features, targets,
            train_size=training_percentage + validation_percentage,
            test_size=testing_percentage,
            random_state=42,
            shuffle=True,
        )
        # Second split: carve validation out of the remainder, so the fraction
        # is rescaled relative to training + validation.
        validation_share = validation_percentage / (training_percentage + validation_percentage)
        train_features, val_features, train_targets, val_targets = train_test_split(
            train_features, train_targets,
            train_size=1 - validation_share,
            test_size=validation_share,
            random_state=42,
            shuffle=True,
        )

        # The vocabulary is the list of all sentences of the training set.
        vocabulary_word = DataSetAdapter.__get_sentence(train_features)

        train_features_sentence = DataSetAdapter.__get_sentence(train_features)
        test_features_sentence = DataSetAdapter.__get_sentence(test_features)
        val_features_sentence = DataSetAdapter.__get_sentence(val_features)

        max_len_word = max(len(words) for words in features)
        # Punctuation tokens are counted as characters too.
        max_len_characters = max(len(''.join(words)) for words in features)
        max_len_trigram = max_len_characters

        train_targets = DataSetAdapter.__get_tags(train_targets, n_tag, max_len_word)
        test_targets = DataSetAdapter.__get_tags(test_targets, n_tag, max_len_word)
        val_targets = DataSetAdapter.__get_tags(val_targets, n_tag, max_len_word)

        return DataSet(vocabulary_word, max_len_characters, max_len_trigram, max_len_word, n_tag, id_to_category,
                       input_dim,
                       train_features_sentence, test_features_sentence, val_features_sentence, train_targets,
                       test_targets, val_targets)

    @staticmethod
    def __get_sentence(words_list):
        """Join each word list into a one-element list holding the sentence.

        Every word is followed by a single space, so a non-empty sentence
        ends with a trailing space and an empty word list yields ``['']``.
        """
        return [[''.join(word + ' ' for word in feature)] for feature in words_list]

    @staticmethod
    def __get_tags(data, n_tag, max_len, value=None):
        """Pad tag-id sequences to ``max_len`` and one-hot encode them.

        Sequences are padded at the end with ``value``, which defaults to
        ``n_tag - 1``.
        """
        if value is None:
            value = n_tag - 1

        tags = pad_sequences(data, maxlen=max_len, dtype='int32', padding='post', value=value)
        tags = to_categorical(tags, num_classes=n_tag)

        return tags


In [15]:
from random import randrange
from random import choice
import itertools as itt
import math
import random as rm

from pandas import DataFrame
# from spellchecker import SpellChecker
# from fuzzywuzzy import fuzz


class Generator:
    """Shared building blocks for generating synthetic, noisy address samples."""

    # Letters used for synthetic building identifiers.
    _BUILDING_LETTERS = ('A', 'B', 'C', 'D', 'F', 'G', 'H')
    # Letters used for synthetic apartment identifiers.
    _APARTMENT_LETTERS = ('A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'R', 'S', 'T')
    # Look-alike substitutions. The previous literal listed 'a' twice
    # ('a': 'e' and 'a': '@'); only the last entry was ever effective, so it
    # is the one kept here.
    _SIMILAR_CHARS = {'e': 'a', 'i': 'l', 'l': 'i', 'o': 'u', 'u': 'o', 'a': '@', '0': '@'}

    def generate_non_standardization(self, components, probability=100):
        """Return ``components`` in a random order with the given probability.

        Args:
            components: sequence of address components.
            probability: chance (1-100) of reordering. Defaults to 100 so the
                method can also be called with ``components`` only.

        Returns:
            ``components`` itself when no reordering happens, otherwise a
            tuple holding a uniformly random permutation of it.
        """
        if rm.randint(1, 100) > probability:
            return components
        # Draw one permutation directly instead of materializing all n! of
        # them just to pick a single entry.
        return tuple(rm.sample(components, len(components)))

    def generate_prefix_randomly(self, list_prefix, probability):
        """Pick a random prefix with the given probability (1-100).

        Returns:
            ``[[token, 'rw'], ...]`` for the chosen prefix split on
            whitespace, or ``[]`` when no prefix is added or ``list_prefix``
            is empty.
        """
        if rm.randint(1, 100) <= probability and list_prefix:
            prefix = rm.choice(list_prefix)
            return [[item, 'rw'] for item in prefix.split()]
        return []

    def generate_spelling_errors(self, word):
        """Return ``word``, altered by one random spelling error about 34% of the time.

        The error is chosen with equal chance among duplicating a character,
        omitting a character, a misspelling (currently a no-op) and replacing
        a character with a look-alike. The result is always a string.
        """
        apply_value = randrange(100)
        if apply_value > 65:
            rand = randrange(100)
            if rand < 25:
                # Duplicate character
                word = self.__duplicate_character(word)
            elif rand < 50:
                # Omit character
                word = self.__omit_character(word)
            elif rand < 75:
                # Misspelling
                word = self.__misspelling(word)
            else:
                # Replace similar character
                word = self.__similar_character(word)

        return word

    def generate_data_frame(self, address_list, words_list, tags_list):
        """Assemble the three parallel lists into a word-per-row DataFrame."""
        return DataFrame({
            'Sentence #': address_list,
            'Word': words_list,
            'Tag': tags_list
        })

    def add_new_address(self, components, address_number, address_list, words_list, tags_list):
        """Append one address, word by word, to the three parallel lists.

        ``components`` is a sequence of components, each a sequence of
        ``[word, tag]`` pairs. Pairs whose word is ``'nan'`` are skipped. The
        first emitted word is labelled ``'Sentence <address_number>'`` in
        ``address_list`` and the following ones ``None``, so the three lists
        always grow by the same amount - also when nothing is emitted.
        """
        # Flatten the components into a single list of [word, tag] pairs.
        georeferential_elements_list = [item for element in components for item in element]

        # NOTE: the drawn amount is never decremented, so it only acts as an
        # on/off switch for error injection in this address.
        var_aleatory = rm.randint(1, 6)
        amount_errors = var_aleatory if rm.randint(0, 4) == 1 else 0

        sentence_label = 'Sentence ' + str(address_number)
        for compound_items in georeferential_elements_list:
            if compound_items[0] == 'nan':
                continue
            word = str(compound_items[0])
            if amount_errors > 0 and rm.randint(1, 4) == 1:
                word = self.generate_spelling_errors(word)

            words_list.append(word)
            tags_list.append(str(compound_items[1]))
            # Only the first word of a sentence carries the label.
            address_list.append(sentence_label)
            sentence_label = None

    def is_empty(self, entity):
        """Return True for ``None``, ``'nan'`` and empty or whitespace-only strings."""
        # ``None`` is tested first; calling len() on it would raise TypeError.
        return entity is None or entity == 'nan' or not entity.split()

    def generate_building_syntetic(self):
        """Return a random building identifier.

        45%: a number in 101-99999; 10%: a single letter; 45%: a number in
        101-9999 with one letter inserted at a random position.
        """
        random_value = rm.randint(1, 100)
        if random_value <= 45:
            # only numbers
            return str(rm.randint(101, 99999))
        if random_value <= 55:
            # only letter
            return rm.choice(self._BUILDING_LETTERS)

        # numbers and letters
        letter = rm.choice(self._BUILDING_LETTERS)
        letter_position = rm.randint(0, 4)
        number = str(rm.randint(101, 9999))
        return number[:letter_position] + letter + number[letter_position:]

    def generate_apartment_syntetic(self):
        """Return a random apartment identifier.

        50%: a number in 10-101; 50%: a number in 1-90 with one letter
        inserted at a random position.
        """
        random_value = rm.randint(1, 100)
        if random_value <= 50:
            # only numbers
            return str(rm.randint(10, 101))

        # numbers and letters
        letter = rm.choice(self._APARTMENT_LETTERS)
        letter_position = rm.randint(0, 4)
        number = str(rm.randint(1, 90))
        return number[:letter_position] + letter + number[letter_position:]

    def divide_equally(self, number):
        """Split ``number`` into three integer parts that add up to it.

        A remainder of 1 goes to the first part; a remainder of 2 goes to the
        first and the third part.
        """
        part, remainder = divmod(number, 3)
        if remainder == 0:
            return part, part, part
        if remainder == 1:
            return part + 1, part, part
        return part + 1, part, part + 1

    def __duplicate_character(self, word):
        """Repeat one randomly chosen character; empty words pass through."""
        if not word:
            return word
        char_index = randrange(len(word))
        return word[:char_index] + word[char_index] + word[char_index:]

    def __omit_character(self, word):
        """Drop one randomly chosen character.

        Words shorter than two characters are returned unchanged so the
        result is never an empty token.
        """
        if len(word) < 2:
            return word
        char_index = randrange(len(word))
        return word[:char_index] + word[char_index + 1:]

    def __misspelling(self, word):
        """Placeholder for dictionary-based misspellings; returns ``word`` as is.

        The spell-checker implementation is disabled. Returning the word
        (instead of ``None``) keeps ``generate_spelling_errors`` from losing it.
        """
        # spell = SpellChecker()
        # ret_word =''
        # if word.lower() in spell:
        #     suggestions = list(spell.candidates(word.lower()))
        #     if len(suggestions) > 0:
        #         new_word = choice(suggestions)
        #         if fuzz.ratio(word.lower(), new_word) < 75:
        #             ret_word = new_word
        return word

    def __similar_character(self, word):
        """Replace one randomly chosen character with a look-alike.

        The word is returned unchanged when it is empty or when the chosen
        character has no look-alike.
        """
        if not word:
            return word
        char_index = randrange(len(word))
        new_char = self._SIMILAR_CHARS.get(word[char_index].lower())
        if new_char is None:
            return word
        return word[:char_index] + new_char + word[char_index + 1:]



In [16]:
import math
from pandas import DataFrame
import random as rm
import itertools as itt

class NoiseGenerator(Generator):
    '''
        Characterization of model 1 (components that can be permuted).

        For the locality, municipality and province there are the following basic components:
            [ prefix + locality]     [ prefix + municipality]      [ prefix + province]
        Depending on the streets, corners, side streets and building (house or building, plus apartment or floor number), addresses are divided into:

            Type 1: Street and side streets
                [ prefix + principal street]    [ prefix + side street + conjunction + prefix + side street] +-  [ prefix + building + apt.]
            Type 2: Corners
                2.1
                    [ corner + prefix + street + conjunction + prefix + street]
                2.2
                    [ prefix + street + corner + prefix + street]
            Type 2 with variations: information about buildings, houses, floor, apartment, etc.
                2.1.1
                    [ corner + prefix + street + BLDG + conjunction + prefix + street]
                    [ corner + prefix + street + conjunction + prefix + street + BLDG]
                2.2.1
                    [ prefix + street + BLDG + corner + prefix + street]
                    [ prefix + street + corner + prefix + street + BLDG]
            Type 3: Principal street only
                    [ prefix + principal street] [ prefix + building + apt.]
    '''

    def generate_noise(self, data_set: DataFrame, address_amount=None):
        """Build a tagged word-per-row frame of synthetic addresses.

        Columns 0-5 of ``data_set`` are read by position as principal street,
        first side street, second side street, locality, municipality and
        province. Each row becomes one address whose layout depends on which
        side streets are present:

        * both side streets  -> type 1 (street between two side streets),
        * exactly one        -> type 2 (corner),
        * none               -> type 3 (principal street only).

        Prefixes, building/apartment identifiers, separators and (with a 5%
        chance) a reordering of the components are drawn at random. After the
        generated rows, the addresses returned by ``__add_real_address`` are
        appended.

        Args:
            data_set: source frame, one structured address per row.
            address_amount: maximum number of rows to turn into addresses;
                ``None`` processes every row. The real addresses appended at
                the end are not counted against this limit.

        Returns:
            The frame built by ``__generate_data_frame`` from the collected
            sentence labels, words and tags.
        """
        print('Generate Noise II -- Type One')
        address_number = 0
        address_list = []
        words_list = []
        tags_list = []

        # Bound once here: zero-argument super() is not available inside the
        # nested helpers below.
        make_prefix = super().generate_prefix_randomly

        def tagged(text, tag):
            """Split ``text`` on whitespace and pair every token with ``tag``."""
            return [[item, tag] for item in text.split()]

        def conjunction_for(next_prefix, next_street):
            """Return the 'and' connector: 'e' before a bare street starting with 'i', else 'y'."""
            return [['e' if (len(next_prefix) == 0 and next_street[0] == 'i') else 'y', 'rw']]

        def building_component(building_only_chance):
            """Return a building component, followed by an apartment unless the draw says building only."""
            identification_building = self.__generate_building_syntetic()
            building_part = make_prefix(BUILDING_PREFIX, 90) + tagged(identification_building, 'building')
            if rm.randint(1, 100) <= building_only_chance:
                return building_part
            # Contain apartment
            identification_apartment = self.__generate_apartment_syntetic()
            apartment_prefix = make_prefix(BUILDING_SUBDIVISION_PREFIX, 100)
            return building_part + apartment_prefix + tagged(identification_apartment, 'apartment')

        # Rows are addressed by position: ``iloc`` is positional, so looping
        # over index labels breaks on any frame without the default index.
        for position in range(len(data_set.index)):
            if address_amount is not None and address_number == address_amount:
                break
            components = []

            principal_street = str(data_set.iloc[position, 0])
            first_side_street = str(data_set.iloc[position, 1])
            second_side_street = str(data_set.iloc[position, 2])
            locality = str(data_set.iloc[position, 3])
            municipality = str(data_set.iloc[position, 4])
            province = str(data_set.iloc[position, 5])

            # TODO: remember the suffixes, e.g. "5ta Ave." ; "primera avenida"
            # Decide whether the address is type 1, 2 or 3.
            if not self.__is_empty(first_side_street) and not self.__is_empty(second_side_street):
                # Type 1: principal street between two side streets.
                principal_street_prefix = make_prefix(STREET_NAME_PREFIX, 50)
                first_side_street_prefix = make_prefix(STREET_NAME_PREFIX, 50)
                second_side_street_prefix = make_prefix(STREET_NAME_PREFIX, 50)
                between_prefix = make_prefix(BETWEEN_PREFIX, 100)

                side_streets_component = (
                    between_prefix
                    + first_side_street_prefix + tagged(first_side_street, 'first_side_street')
                    + conjunction_for(second_side_street_prefix, second_side_street)
                    + second_side_street_prefix + tagged(second_side_street, 'second_side_street')
                )

                components.append(principal_street_prefix + tagged(principal_street, 'principal_street'))

                # The side streets go either before or after the optional
                # building component.
                side_streets_first = rm.randint(1, 100) < 15
                if side_streets_first:
                    components.append(side_streets_component)
                if rm.randint(1, 100) <= 50:
                    # Contain building
                    components.append(building_component(40))
                if not side_streets_first:
                    components.append(side_streets_component)
            elif not self.__is_empty(first_side_street) or not self.__is_empty(second_side_street):
                # Type 2: corner of the principal street and the one side
                # street available (always tagged 'first_side_street').
                side_street = first_side_street if self.__is_empty(second_side_street) else second_side_street

                principal_street_prefix = make_prefix(STREET_NAME_PREFIX, 50)
                side_street_prefix = make_prefix(STREET_NAME_PREFIX, 50)
                principal_part = principal_street_prefix + tagged(principal_street, 'principal_street')
                side_part = side_street_prefix + tagged(side_street, 'first_side_street')

                corner_leads = rm.randint(1, 100) <= 50
                corner_prefix = make_prefix(CORNER_CONNECTOR_PREFIX, 100)
                if corner_leads:
                    # Type 2.1: corner word first, streets joined by a conjunction.
                    head = corner_prefix + principal_part
                    tail = conjunction_for(side_street_prefix, side_street) + side_part
                else:
                    # Type 2.2: corner word between the two streets.
                    head = principal_part
                    tail = corner_prefix + side_part

                if rm.randint(1, 100) <= 50:
                    # Contain building (types 2.1.1 / 2.2.1)
                    identification_building = self.__generate_building_syntetic()
                    building_part = make_prefix(BUILDING_PREFIX, 90) + tagged(identification_building, 'building')
                    if rm.randint(1, 100) <= 50:
                        # Building right after the principal street.
                        components.append(head + building_part + tail)
                    else:
                        # Building after the side street.
                        components.append(head + tail + building_part)
                else:
                    # Not contain building
                    components.append(head + tail)
            else:
                # Type 3: principal street only.
                principal_street_prefix = make_prefix(STREET_NAME_PREFIX, 70)
                components.append(principal_street_prefix + tagged(principal_street, 'principal_street'))
                if rm.randint(1, 100) <= 50:
                    # Contain building
                    components.append(building_component(30))

            # Basic components: locality, municipality and province.
            if not self.__is_empty(locality):
                between_prefix = [[' , ', 'rw']] if rm.randint(1, 100) < 70 else []
                locality_prefix = make_prefix(LOCALITY_PREFIX, 35)
                components.append(between_prefix + locality_prefix + tagged(locality, 'locality'))
            if not self.__is_empty(municipality):
                between_prefix = [[' , ', 'rw']] if rm.randint(1, 100) < 75 else []
                municipality_prefix = make_prefix(MUNICIPALITY_PREFIX, 8)
                components.append(between_prefix + municipality_prefix + tagged(municipality, 'municipality'))
            if not self.__is_empty(province):
                province_prefix = make_prefix(PROVINCE_PREFIX, 3)
                between_prefix = [[' , ', 'rw']] if rm.randint(1, 100) < 85 else []
                components.append(between_prefix + province_prefix + tagged(province, 'province'))

            # Permutation of the components. The 5% gate is applied here, so
            # the helper is told to always permute; calling it without a
            # probability raised TypeError.
            if rm.randint(1, 100) <= 5:
                components = super().generate_non_standardization(components, 100)

            address_number += 1
            self.__add_new_address(components, address_number, address_list, words_list, tags_list)

        # Adding real address
        real_address = self.__add_real_address()
        for address in real_address:
            address_number += 1
            self.__add_new_address(address, address_number, address_list, words_list, tags_list)

        return self.__generate_data_frame(address_list, words_list, tags_list)

    def __generate_data_frame(self, address_list, words_list, tags_list):
        return DataFrame({
            'Sentence #': address_list,
            'Word': words_list,
            'Tag': tags_list
        })

    def __add_new_address(self, components, address_number, address_list, words_list, tags_list):
        address_list.append('Sentence ' + str(address_number))

        # breaking down
        georeferential_elements_list = []
        for element in components:
            georeferential_elements_list += element

        count = 0
        var_aleatory = rm.randint(1, 6)
        amount_errors = var_aleatory if rm.randint(0, 4) == 1 else 0
        for compound_items in georeferential_elements_list:
            if compound_items[0] != 'nan':
                word = str(compound_items[0])
                if amount_errors > 0 and rm.randint(1, 4) == 1:
                    word = super().generate_spelling_errors(word)

                words_list.append(word)
                tags_list.append(str(compound_items[1]))

                if count != 0 and len(words_list) == len(address_list) + 1:
                    address_list.append(None)
            count += 1

    def __is_empty(self, entity):

        return len(entity) == 0 or len(entity.split()) == 0 or entity == 'nan' or entity is None

    def __generate_building_syntetic(self):
        random_value = rm.randint(1, 100)
        if random_value <= 45:
            # only numbers
            return str(rm.randint(101, 99999))
        elif random_value <= 55:
            # only letter
            letters = ['A', 'B', 'C', 'D', 'F', 'G', 'H']
            return letters[rm.randint(0, len(letters) - 1)]
        else:
            #  numbers and letters
            letters = ['A', 'B', 'C', 'D', 'F', 'G', 'H']
            letter = letters[rm.randint(0, len(letters) - 1)]
            letter_position = rm.randint(0, 4)
            number = str(rm.randint(101, 9999))

            name = number[0: letter_position] + letter + number[letter_position:]
            return name

    def __generate_apartment_syntetic(self):
        random_value = rm.randint(1, 100)
        if random_value <= 50:
            # only numbers
            return str(rm.randint(10, 101))
        else:
            #  numbers and letters
            letters = ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'R', 'S', 'T']
            letter = letters[rm.randint(0, len(letters) - 1)]
            letter_position = rm.randint(0, 4)
            number = str(rm.randint(1, 90))

            return number[0: letter_position] + letter + number[letter_position:]

    def __add_real_address(self):
        """Return a fixed set of hand-tagged addresses.

        Every entry is a list holding a single component, and that component
        is the full address as a list of ``[word, tag]`` pairs.  The tags used
        are 'rw', 'principal_street', 'first_side_street', 'second_side_street',
        'building', 'apartment', 'locality', 'municipality' and 'province'.

        :return: list of addresses in the same nested shape that
                 ``__add_new_address`` takes as ``components``
        """
        return [
            [[['calle', 'rw'], ['30', 'principal_street'], ['959', 'building'], ['e', 'rw'], ['entre', 'rw'], ['avenida', 'rw'], ['26', 'first_side_street'], ['y', 'rw'], ['47', 'second_side_street'], ['Plaza', 'municipality'], ['de', 'municipality'], ['la', 'municipality'], ['Revolucion', 'municipality'], ['La', 'province'], ['Habana', 'province']]],
            # NOTE(review): 'Marianao' appears twice in a row as municipality below - confirm whether one of them was meant to be a locality.
            [[['ave', 'rw'], ['67', 'principal_street'], ['no', 'rw'], ['13613', 'building'], ['e', 'rw'], ['136', 'first_side_street'], ['y', 'rw'], ['138', 'second_side_street'], ['Marianao', 'municipality'], ['Marianao', 'municipality'], ['La', 'province'], ['HAbana', 'province'],]],
            [[['calle', 'rw'], ['Gomez', 'principal_street'], ['2', 'building'], ['E', 'building'], ['entre', 'rw'], ['calle', 'rw'], ['Marti', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['Washington', 'second_side_street'], ['reparto', 'rw'], ['barrio', 'rw'], ['Azul', 'locality'], ['Arroyo', 'municipality'], ['Naranjo', 'municipality'], ['La', 'province'], ['Habana', 'province']]],
            [[['San', 'principal_street'], ['Juan', 'principal_street'], ['de', 'principal_street'], ['Dios', 'principal_street'], ['Edif', 'rw'], ['108', 'building'], ['apto', 'rw'], ['15', 'apartment'], ['entre', 'rw'], ['aguacate', 'first_side_street'], ['y', 'rw'], ['Compostela', 'second_side_street'], ['La', 'municipality'], ['Habana', 'municipality'], ['Vieja', 'municipality'], ['La', 'province'], ['Habana', 'province']]],
            [[['avenida', 'rw'], ['del', 'principal_street'], ['sur', 'principal_street'], ['entre', 'rw'], ['primelles', 'first_side_street'], ['y', 'rw'], ['Lazada', 'second_side_street'], [',', 'rw'], ['Norte', 'locality'], ['III', 'locality'], [',', 'rw'], ['CERRO', 'municipality'], [',', 'rw'], ['LA', 'province'], ['HABANA', 'province'],]],
            [[['San', 'principal_street'], ['Juan', 'principal_street'], ['DE', 'principal_street'], ['dios', 'principal_street'], ['entre', 'rw'], ['aguacate', 'first_side_street'], ['y', 'rw'], ['compostela', 'second_side_street'], [',', 'rw'], ['La', 'municipality'], ['Habana', 'municipality'], ['Vieja', 'municipality'], [',', 'rw'], ['La', 'province'], ['Habana', 'province'],]],
            [[['27', 'principal_street'], ['b', 'principal_street'], ['entre', 'rw'], ['230', 'first_side_street'], ['y', 'rw'], ['234', 'second_side_street'], [',', 'rw'], ['La', 'locality'], ['Coronela', 'locality'], [',', 'rw'], ['La', 'municipality'], ['Lisa', 'municipality'], [',', 'rw'], ['La', 'province'], ['Habana', 'province'],]],
            [[['calle', 'rw'], ['REYES', 'principal_street'], ['entre', 'rw'], ['c', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['Altarriba', 'second_side_street'], ['Edificio', 'rw'], ['319', 'building'], ['Apto', 'rw'], ['9', 'apartment'], ['Barrio', 'rw'], ['Lawton', 'locality'], ['Diez', 'municipality'], ['de', 'municipality'], ['Octubre', 'municipality'], ['La', 'province'], ['Habana', 'province'],]],
            [[['calle', 'rw'], ['real', 'principal_street'], ['#', 'rw'], ['360', 'building'], ['poblado', 'rw'], ['bacuranao', 'locality'], [',', 'rw'], ['guanabacoa', 'municipality'], [',', 'rw'], ['La', 'province'], ['Habana', 'province'],]],
            [[['calle', 'rw'], ['82', 'principal_street'], ['E', 'rw'], ['/', 'rw'], ['calle', 'rw'], ['5D', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['7', 'second_side_street'], ['Edificio', 'rw'], ['iacc', 'building'], ['#', 'rw'], ['5d14', 'building'], [',', 'rw'], ['apto', 'rw'], ['8', 'apartment'], ['repto', 'rw'], ['villa', 'locality'], ['panamericana', 'locality'], [',', 'rw'], ['La', 'municipality'], ['Habana', 'municipality'], ['del', 'municipality'], ['Este', 'municipality'], [',', 'rw'], ['La', 'province'], ['Habana', 'province'],]],
            [[['calle', 'rw'], ['5ta', 'principal_street'], ['num', 'rw'], ['5800', 'building'], ['Bajo', 'rw'], ['entre', 'rw'], ['calle', 'rw'], ['b', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['c', 'second_side_street'], [',', 'rw'], ['SAN', 'municipality'], ['MIGUEL', 'municipality'], ['DEL', 'municipality'], ['PADRON', 'municipality'], [',', 'rw'], ['LA', 'province'], ['HABANA', 'province'],]],
            [[['calle', 'rw'], ['A', 'principal_street'], ['no', 'rw'], ['48', 'building'], ['y', 'rw'], ['apto', 'rw'], ['1', 'apartment'], ['e', 'rw'], ['entre', 'rw'], ['calle', 'rw'], ['pinar', 'first_side_street'], ['del', 'first_side_street'], ['rio', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['woodberry', 'second_side_street'], ['reparto', 'rw'], ['callejas', 'locality'], ['ARROYO', 'municipality'], ['NARANJO', 'municipality'], ['LA', 'province'], ['HABANA', 'province']]],
            [[['calle', 'rw'], ['7ma', 'principal_street'], ['e', 'rw'], ['entre', 'rw'], ['calle', 'rw'], ['l', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['10', 'second_side_street'], ['edificio', 'rw'], ['10103', 'building'], ['apto', 'rw'], ['23', 'apartment'], ['reparto', 'rw'], ['Altahabana', 'locality'], ['BOYEROS', 'municipality'], ['LA', 'province'], ['HABANA', 'province'],]],
            [[['avenida', 'rw'], ['27', 'principal_street'], ['b', 'principal_street'], ['entre', 'rw'], ['calle', 'rw'], ['230', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['234', 'second_side_street'], ['edificio', 'rw'], ['22', 'building'], ['apto', 'rw'], ['18', 'apartment'], ['reparto', 'rw'], ['la', 'locality'], ['coronela', 'locality'], ['la', 'municipality'], ['lisa', 'municipality'], ['La', 'province'], ['Habana', 'province'],]],
            [[['avenida', 'rw'], ['27', 'principal_street'], ['b', 'principal_street'], ['e', 'rw'], ['entre', 'rw'], ['calle', 'rw'], ['230', 'first_side_street'], ['y', 'rw'], ['calle', 'rw'], ['234', 'second_side_street'], ['Edificio', 'rw'], ['10', 'building'], ['Apto', 'rw'], ['19', 'apartment'], ['reparto', 'rw'], ['la', 'locality'], ['coronela', 'locality'], ['la', 'municipality'], ['lisa', 'municipality'], ['La', 'province'], ['Habana', 'province'],]],
            [[['calle', 'rw'], ['100', 'principal_street'], ['5907', 'building'], ['bajos', 'rw'], ['entre', 'rw'], ['ave', 'rw'], ['59', 'first_side_street'], ['y', 'rw'], ['61', 'second_side_street'], ['Marianao', 'municipality'], ['La', 'province'], ['HABANA', 'province'],]],
            [[['Cisneros', 'principal_street'], ['21', 'building'], ['Altos', 'rw'], ['e', 'rw'], ['entre', 'rw'], ['arnao', 'first_side_street'], ['y', 'rw'], ['cortez', 'second_side_street'], ['ARROYO', 'municipality'], ['NARANJO', 'municipality'], ['LA', 'province'], ['HABANA', 'province'],]],
            [[['avenida', 'rw'], ['47', 'principal_street'], ['4003', 'building'], ['e', 'rw'], ['entre', 'rw'], ['calle', 'rw'], ['40', 'first_side_street'], ['y', 'rw'], ['avenida', 'rw'], ['41', 'second_side_street'], ['reparto', 'rw'], ['kohly', 'locality'], ['playa', 'municipality'], ['la', 'province'], ['habana', 'province'],]],
            [[['calle', 'rw'], ['59', 'principal_street'], ['no', 'rw'], ['10814A', 'building'], ['e', 'rw'], ['entre', 'rw'], ['108', 'first_side_street'], ['y', 'rw'], ['110', 'second_side_street'], ['Apto', 'rw'], ['3', 'apartment'], ['marianao', 'municipality'], ['la', 'province'], ['habana', 'province']]],
        ]


In [17]:
from pandas import DataFrame
import random as rm

class NoiseGeneratorModelTwo(Generator):

    def generate_noise(self, data_set: DataFrame,type=None, address_amount=None):
        """
        Generate a tagged corpus of synthetic type-two addresses.

        :param data_set: corpus the address values are drawn from; kept in ``self.data``

        :param type: kind of examples to produce
                "ce"  -> correct addresses
                "ace" -> almost correct addresses
                "uce" -> uncorrect addresses
                "eq"  -> output of the equilibrated generator, which mixes the three kinds

        :param address_amount: number of addresses to generate; kept in ``self.address_amount``

        :return: the value built by ``generate_data_frame`` of the parent class

        :raises NotImplementedError: when ``type`` is none of the four codes above
        """
        self.address_amount = address_amount
        self.data = data_set
        self.type = type

        # The balanced mix has its own generator and builds its own result.
        if type == 'eq':
            return self.__generate_equilibrated_examples_type_two()

        # The remaining kinds differ only in the routine producing the components.
        if type == 'ce':
            build_components = self.__generate_correct_example_type_two
        elif type == 'ace':
            build_components = self.__generate_almost_correct_examples_type_two
        elif type == 'uce':
            build_components = self.__generate_uncorrect_examples_type_two
        else:
            raise NotImplementedError('There is no such kind of example')

        sentence_ids, tokens, labels = [], [], []
        for position in range(self.address_amount):
            parts = build_components()
            # addresses are numbered from 1
            super().add_new_address(parts, position + 1, sentence_ids, tokens, labels)

        return super().generate_data_frame(sentence_ids, tokens, labels)


    def __generate_equilibrated_examples_type_two(self):
        """Generate a corpus mixing correct, almost correct and uncorrect addresses.

        ``self.address_amount`` is handed to ``divide_equally`` of the parent
        class; the three values it returns are used, in order, as the number
        of correct, almost correct and uncorrect addresses to produce.
        Addresses are numbered consecutively from 1 across the three groups.

        :return: the value built by ``generate_data_frame`` of the parent class
        """
        print('Generate_random_noise_type_two')

        sentence_ids, tokens, labels = [], [], []
        quota_correct, quota_almost, quota_uncorrect = super().divide_equally(self.address_amount)

        # (how many, which routine) for every group, in generation order
        schedule = (
            (quota_correct, self.__generate_correct_example_type_two),
            (quota_almost, self.__generate_almost_correct_examples_type_two),
            (quota_uncorrect, self.__generate_uncorrect_examples_type_two),
        )

        serial = 0
        for quota, build_components in schedule:
            for _ in range(quota):
                parts = build_components()
                serial += 1
                super().add_new_address(parts, serial, sentence_ids, tokens, labels)

        return super().generate_data_frame(sentence_ids, tokens, labels)

    def __generate_correct_example_type_two(self):
        """Assemble the components of one correct type-two address.

        Components are appended in this order: building, optional apartment,
        locality, municipality, province.  Each component is a list of
        ``[word, tag]`` pairs: the prefix returned by ``generate_prefix_randomly``
        followed by the words of the value tagged with the component name.
        Locality, municipality and province are drawn independently from the
        matching columns of ``self.data``; a value the parent's ``is_empty``
        reports as empty produces no component at all.

        NOTE(review): assumes the parent's ``is_empty`` reports '' and 'nan'
        as empty - confirm against the Generator class.

        :return: the component list after ``generate_non_standardization(components, 30)``
        """

        def tagged(text, tag):
            # One [word, tag] pair per whitespace-separated word of text.
            return [[item, tag] for item in text.split()]

        components = []

        building = super().generate_building_syntetic()
        locality = str(rm.choice(self.data['locality']))
        municipality = str(rm.choice(self.data['municipality']))
        province = str(rm.choice(self.data['province']))

        building_form = super().generate_prefix_randomly(BUILDING_PREFIX_CORRECT, 100)
        number_form = super().generate_prefix_randomly(PROPERTY_PREFIX_CORRECT, 100)
        apartment_form = super().generate_prefix_randomly(APARTMENT_PREFIX_CORRECT, 100)
        zone_form = super().generate_prefix_randomly(ZONE_PREFIX_CORRECT, 100)
        locality_form = super().generate_prefix_randomly(LOCALITY_PREFIX_CORRECT, 100)
        municipality_prefix = super().generate_prefix_randomly(MUNICIPALITY_PREFIX_CORRECT, 100)
        province_prefix = super().generate_prefix_randomly(PROVINCE_PREFIX_CORRECT, 100)

        # COMPONENT 1  --- [building_form],[number_form],[building] ---
        # A purely alphabetic building is a name and never takes the number
        # word (e.g. "Edif Girasol"); otherwise the number word is added when
        # the draw exceeds 50 (e.g. "Edif No 456" or "Edif 34B").
        if building.isalpha() or rm.randint(0, 100) <= 50:
            building_prefix = building_form
        else:
            building_prefix = building_form + number_form
        components.append(building_prefix + tagged(building, 'building'))

        # COMPONENT 2  --- [apartment_form],[number_form],[apartment] ---
        # This component is optional
        if rm.randint(0, 100) > 50:
            apartment_num = super().generate_apartment_syntetic()
            if rm.randint(0, 100) > 50:
                apartment_prefix = apartment_form + number_form
            else:
                apartment_prefix = apartment_form
            components.append(apartment_prefix + tagged(apartment_num, 'apartment'))

        # COMPONENT 3  --- [locality_form],[locality] ---
        if not super().is_empty(locality):
            locality_words = locality.split()
            if 'alamar' in locality.lower() and len(locality_words) != 1:
                # Multi-word Alamar localities are split into the locality
                # itself and its zone (every word other than 'alamar').
                zone_words = [word for word in locality_words if word.lower() != 'alamar']

                if 'micro' in ' '.join(zone_words).lower():
                    # a 'micro...' zone never takes the number word
                    zone_prefix = zone_form
                elif rm.randint(0, 100) > 50:
                    zone_prefix = zone_form + number_form
                else:
                    zone_prefix = zone_form

                alamar_part = locality_form + tagged('Alamar', 'locality')
                zone_part = zone_prefix + [[item, 'locality'] for item in zone_words]

                # locality first or zone first, decided by one more draw
                if rm.randint(0, 50) > 25:
                    components.append(alamar_part + zone_part)
                else:
                    components.append(zone_part + alamar_part)
            else:
                components.append(locality_form + tagged(locality, 'locality'))

        # COMPONENT 4  --- [municipality_form],[municipality] ---
        if not super().is_empty(municipality):
            components.append(municipality_prefix + tagged(municipality, 'municipality'))

        # COMPONENT 5  --- [province_form],[province] ---
        if not super().is_empty(province):
            components.append(province_prefix + tagged(province, 'province'))

        # Permutation between components; applied whether or not a province exists.
        components = super().generate_non_standardization(components, 30)

        return components

    def __generate_almost_correct_examples_type_two(self):
        """Assemble the components of one almost correct type-two address.

        Same layout as the correct example (building, optional apartment,
        locality, municipality, province), but the prefixes are requested from
        the general prefix tables with the values 70/70/70/60/60/60/40 passed
        to ``generate_prefix_randomly``, and the final
        ``generate_non_standardization`` call receives 50.
        Each component is a list of ``[word, tag]`` pairs.  A value the
        parent's ``is_empty`` reports as empty produces no component at all.

        NOTE(review): assumes the parent's ``is_empty`` reports '' and 'nan'
        as empty - confirm against the Generator class.

        :return: the component list after ``generate_non_standardization(components, 50)``
        """

        def tagged(text, tag):
            # One [word, tag] pair per whitespace-separated word of text.
            return [[item, tag] for item in text.split()]

        components = []

        building = super().generate_building_syntetic()
        locality = str(rm.choice(self.data['locality']))
        municipality = str(rm.choice(self.data['municipality']))
        province = str(rm.choice(self.data['province']))

        building_form = super().generate_prefix_randomly(BUILDING_PREFIX, 70)
        number_form = super().generate_prefix_randomly(PROPERTY_PREFIX, 70)
        apartment_form = super().generate_prefix_randomly(APARTMENT_PREFIX, 70)
        zone_form = super().generate_prefix_randomly(ZONE_PREFIX, 60)
        locality_form = super().generate_prefix_randomly(LOCALITY_PREFIX, 60)
        municipality_prefix = super().generate_prefix_randomly(MUNICIPALITY_PREFIX, 60)
        province_prefix = super().generate_prefix_randomly(PROVINCE_PREFIX, 40)

        # COMPONENT 1  --- [building_form],[number_form],[building] ---
        # A purely alphabetic building is a name and never takes the number
        # word; otherwise the number word is added when the draw exceeds 50.
        if building.isalpha() or rm.randint(0, 100) <= 50:
            building_prefix = building_form
        else:
            building_prefix = building_form + number_form
        components.append(building_prefix + tagged(building, 'building'))

        # COMPONENT 2  --- [apartment_form],[number_form],[apartment] ---
        # This component is optional
        if rm.randint(0, 100) > 50:
            apartment_num = super().generate_apartment_syntetic()
            # number variant vs. plain variant
            if rm.randint(0, 100) > 50:
                apartment_prefix = apartment_form + number_form
            else:
                apartment_prefix = apartment_form
            components.append(apartment_prefix + tagged(apartment_num, 'apartment'))

        # COMPONENT 3  --- [locality_form],[locality] ---
        if not super().is_empty(locality):
            locality_words = locality.split()
            if 'alamar' in locality.lower() and len(locality_words) != 1:
                # Multi-word Alamar localities are split into the locality
                # itself and its zone (every word other than 'alamar').
                zone_words = [word for word in locality_words if word.lower() != 'alamar']

                if 'micro' in ' '.join(zone_words).lower():
                    # a 'micro...' zone never takes the number word
                    zone_prefix = zone_form
                elif rm.randint(0, 100) > 50:
                    zone_prefix = zone_form + number_form
                else:
                    zone_prefix = zone_form

                alamar_part = locality_form + tagged('Alamar', 'locality')
                zone_part = zone_prefix + [[item, 'locality'] for item in zone_words]

                # locality first or zone first, decided by one more draw
                if rm.randint(0, 50) > 25:
                    components.append(alamar_part + zone_part)
                else:
                    components.append(zone_part + alamar_part)
            else:
                components.append(locality_form + tagged(locality, 'locality'))

        # COMPONENT 4  --- [municipality_form],[municipality] ---
        if not super().is_empty(municipality):
            components.append(municipality_prefix + tagged(municipality, 'municipality'))

        # COMPONENT 5  --- [province_form],[province] ---
        if not super().is_empty(province):
            components.append(province_prefix + tagged(province, 'province'))

        # Permutation between components; applied whether or not a province exists.
        components = super().generate_non_standardization(components, 50)

        return components

    def __generate_uncorrect_examples_type_two(self):
        components = []

        building = super().generate_building_syntetic()
        locality = str(rm.choice(self.data['locality']))
        municipality = str(rm.choice(self.data['municipality']))
        province = str(rm.choice(self.data['province']))

        building_form = super().generate_prefix_randomly(BUILDING_PREFIX, 55)
        number_form = super().generate_prefix_randomly(PROPERTY_PREFIX, 55)
        apartment_form = super().generate_prefix_randomly(APARTMENT_PREFIX, 55)
        zone_form = super().generate_prefix_randomly(ZONE_PREFIX, 45)
        locality_form = super().generate_prefix_randomly(LOCALITY_PREFIX, 45)
        municipality_prefix = super().generate_prefix_randomly(MUNICIPALITY_PREFIX, 35)
        province_prefix = super().generate_prefix_randomly(PROVINCE_PREFIX, 35)

        # For creating the first component we have to split itself in three sub-components, if the name of the building is alphanum
        # For example: Edif 456 o 34B
        # [reserved word for building]+ [reserved word for number] + [building]
        # then a random number is used to decide whether or not the number component appears
        # And if it's alpha would be like:
        # [reserved word for building] + [name]

        is_name = True if building.isalpha() else False

        # CREATING COMPONENT 1  --- [building_form],[number_form],[building] ---
        if is_name:
            components.append(
                building_form + [[item, 'building'] for item in
                                 building.split()]
            )
        else:
            # Using the random number mentioned above
            # Add property prefix
            add_num_component = True if rm.randint(0, 100) > 50 else False
            if add_num_component:
                components.append(
                    building_form + number_form + [[item, 'building'] for item in
                                                   building.split()]
                )
            else:
                components.append(
                    building_form + [[item, 'building'] for item in
                                     building.split()]
                )

        # CREATING COMPONENT 2  --- [apartment_form],[number_form],[apartment] ---
        # This component is optional

        add_apart_component = rm.randint(0, 100)
        if add_apart_component > 50:
            apartment_num = super().generate_apartment_syntetic()

            # para variante num, letra+num
            add_num_component = True if rm.randint(0, 100) > 50 else False
            if add_num_component:
                components.append(
                    apartment_form + number_form + [[item, 'apartment'] for item in
                                                    apartment_num.split()]
                )
            else:
                components.append(
                    apartment_form + [[item, 'apartment'] for item in
                                      apartment_num.split()]
                )

        # CREATING COMPONENT 3  --- [locality_form],[locality] ---
        if locality != '':
            loc_aux = ''
            loc_zone = ''
            if locality.lower().find('alamar') != -1:
                spl_loc = locality.split()
                if len(spl_loc) != 1:
                    loc_aux = 'Alamar'
                    for word in spl_loc:
                        if word.lower() != 'alamar':
                            loc_zone += word + ' '

                    # Adding the zone
                    if loc_zone.lower().find('micro') != -1:
                        if rm.randint(0, 50) > 25:
                            components.append(
                                locality_form + [[item, 'locality'] for item in loc_aux.split()] + zone_form + [
                                    [item, 'locality'] for item in
                                    loc_zone.split()]
                            )
                        else:
                            components.append(
                                zone_form + [[item, 'locality'] for item in loc_zone.split()]
                                + locality_form + [[item, 'locality'] for item in loc_aux.split()]
                            )

                    else:
                        add_num_component = True if rm.randint(0, 100) > 50 else False
                        if add_num_component:
                            if rm.randint(0, 50) > 25:
                                components.append(
                                    locality_form + [[item, 'locality'] for item in loc_aux.split()]
                                    + zone_form + number_form + [[item, 'locality'] for item in
                                                                 loc_zone.split()]
                                )
                            else:
                                components.append(
                                    zone_form + number_form + [[item, 'locality'] for item in loc_zone.split()]
                                    + locality_form + [[item, 'locality'] for item in loc_aux.split()]
                                )

                        else:
                            if rm.randint(0, 50) > 25:
                                components.append(
                                    locality_form + [[item, 'locality'] for item in loc_aux.split()] + zone_form + [
                                        [item, 'locality'] for item in
                                        loc_zone.split()]
                                )
                            else:
                                components.append(
                                    zone_form + [[item, 'locality'] for item in loc_zone.split()]
                                    + locality_form + [[item, 'locality'] for item in loc_aux.split()]
                                )
                else:
                    components.append(
                        locality_form + [[item, 'locality'] for item in locality.split()]
                    )
            else:
                components.append(
                    locality_form + [[item, 'locality'] for item in locality.split()]
                )

        # CREATING COMPONENT 4  --- [municipality_form],[municipality] ---
        if len(municipality) != 0 or not super().is_empty(municipality):
            components.append(
                municipality_prefix + [[item, 'municipality'] for item in municipality.split()]
            )

        # CREATING COMPONENT 5  --- [province_form],[province] ---
        if len(province) != 0 or not super().is_empty(province):
            components.append(
                province_prefix + [[item, 'province'] for item in province.split()]
            )

        # Permutation between components.
        components = super().generate_non_standardization(components, 50)

        return components



In [27]:
import keras
import keras.optimizers
import numpy as np
import tensorflow as tf
from keras import Sequential
from keras.layers import LSTM, Embedding, Dense, Bidirectional, Concatenate, Reshape, Flatten
from keras.layers import TextVectorization
from tensorflow.python.ops.ragged.ragged_string_ops import string_bytes_split

class DeepParserModel(NeuralParser):
    """Neural address tagger that combines character, trigram and word views of the text.

    When no model is supplied, the constructor builds and compiles a Keras
    functional model with three ``TextVectorization`` branches fed by the same
    string input:

    * character and trigram branches: Embedding(25) -> Bidirectional LSTM(25),
      concatenated and passed through a Bidirectional LSTM(100), then projected
      (Flatten -> Dense -> Reshape) to ``(max_len_word, 100)``;
    * word branch: Embedding(100);
    * both are concatenated and fed to a Bidirectional LSTM (``merge_mode='sum'``)
      followed by a softmax ``Dense`` layer with ``n_tag`` units.
    """

    def __init__(self, data_set: DataSet, cleaner_method, model=None):
        """Build a new network from ``data_set`` or wrap an already built Keras model.

        Args:
            data_set: supplies the vocabulary, the sequence lengths and the number of tags.
            cleaner_method: callable used as the ``standardize`` step of every
                ``TextVectorization`` layer.
            model: optional pre-built ``keras.Model``; when given, no network is created.

        Raises:
            NotImplementedError: if ``model`` is given but is not a ``keras.Model``.
        """
        self.data = data_set
        self.cleaner_method = cleaner_method
        if model is None:
            output_dim = 100

            inputs = keras.Input(shape=(1,), dtype="string")

            # The vocabulary size is read from the vectorization layer itself
            # (last layer of each wrapper) instead of by its auto-generated name:
            # Keras numbers those names globally ('text_vectorization',
            # 'text_vectorization_1', ...), so a name lookup only works for the
            # first model created in a session.
            tv_by_character = self.__create_layer_vectorization(name='TextVectorization_Character',
                                                                max_len=data_set.get_max_len_character(),
                                                                split=string_bytes_split)
            vocab_size_character = len(tv_by_character.layers[-1].get_vocabulary())
            layer_tv_character = tv_by_character(inputs)

            tv_by_trigram = self.__create_layer_vectorization(name='TextVectorization_Trigram',
                                                              max_len=data_set.get_max_len_trigram(),
                                                              split=string_bytes_split,
                                                              ngrams=3)
            vocab_size_trigram = len(tv_by_trigram.layers[-1].get_vocabulary())
            layer_tv_by_trigram = tv_by_trigram(inputs)

            tv_by_word = self.__create_layer_vectorization(name='TextVectorization_Word',
                                                           max_len=data_set.get_max_len_word())
            vocab_size_word = len(tv_by_word.layers[-1].get_vocabulary())
            layer_tv_by_word = tv_by_word(inputs)

            embedding_character = Embedding(vocab_size_character, 25, name='Embedding_Character')
            layer_embedding_character = embedding_character(layer_tv_character)
            blstm_character = Bidirectional(LSTM(units=25, return_sequences=True, dropout=0.6, recurrent_dropout=0.1),
                                            merge_mode='concat')
            layer_blstm_character = blstm_character(layer_embedding_character)

            embedding_trigram = Embedding(vocab_size_trigram, 25, name='Embedding_Trigram')
            layer_embedding_trigram = embedding_trigram(layer_tv_by_trigram)
            blstm_trigram = Bidirectional(LSTM(units=25, return_sequences=True, dropout=0.6, recurrent_dropout=0.1),
                                          merge_mode='concat')
            layer_blstm_trigram = blstm_trigram(layer_embedding_trigram)

            embedding_word = Embedding(vocab_size_word, output_dim, name='Embedding_Word')
            layer_embedding_word = embedding_word(layer_tv_by_word)
            concat = Concatenate()([layer_blstm_character, layer_blstm_trigram])
            blstm_concat = Bidirectional(
                LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1),
                merge_mode='concat')
            layer_blstm_concat = blstm_concat(concat)

            # ********* PROJECTION LAYER ***************
            # Re-shape the character/trigram features to one vector per word
            # position so they can be concatenated with the word embeddings.
            projection = Flatten()(layer_blstm_concat)
            projection = Dense(units=data_set.get_max_len_word() * 100)(projection)
            projection = Reshape((data_set.get_max_len_word(), 100))(projection)
            # ********* PROJECTION LAYER ***************

            concat_2 = Concatenate()([projection, layer_embedding_word])
            blstm_concat_2 = Bidirectional(
                LSTM(units=data_set.get_n_tag(), return_sequences=True, dropout=0, recurrent_dropout=0),
                merge_mode='sum')
            layer_blstm_concat_2 = blstm_concat_2(concat_2)

            output = Dense(data_set.get_n_tag(), activation='softmax')(layer_blstm_concat_2)
            model = keras.Model(inputs, output, name='Model')

            # Optimiser
            opt = keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
            # Accuracy tells you how many times the ML model was correct overall.
            # Precision is how good the model is at predicting a specific category.
            # Recall tells you how many times the model was able to detect a specific category.
            metrics = [tf.metrics.CategoricalAccuracy(), tf.metrics.Precision(), tf.metrics.Recall()]

            model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=metrics)
            model.summary()
            self.model = model
        elif not isinstance(model, keras.Model):
            # isinstance on the public base class replaces an exact-type check
            # against the private path keras.engine.functional.Functional.
            raise NotImplementedError('Model variable could be Keras.Model instance')
        else:
            self.model = model

    def __create_layer_vectorization(self, name, max_len, ngrams=None, split="whitespace"):
        """Return a ``Sequential`` named ``name`` wrapping one adapted ``TextVectorization`` layer.

        The layer standardizes with ``self.cleaner_method``, emits integer ids
        padded/truncated to ``max_len`` and is adapted on the data set vocabulary.
        """
        vectorize_layer = TextVectorization(
            standardize=self.cleaner_method,
            output_mode="int",
            output_sequence_length=max_len,
            ngrams=ngrams,
            split=split
        )

        vectorize_layer.adapt(self.get_data().get_vocabulary())

        vectorize_layer_model = Sequential(name=name)
        vectorize_layer_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
        vectorize_layer_model.add(vectorize_layer)

        return vectorize_layer_model

    def train(self, batch_size=1200, epochs=50):
        """Fit the model on the training split, validating on the validation split."""
        x = np.asarray(self.data.get_x_train_sentence_values())
        x_val = np.asarray(self.data.get_x_val_sentence_values())
        print(self.data.get_y_train_values())
        self.model.fit(x, self.data.get_y_train_values(), batch_size=batch_size,
                       verbose=1, epochs=epochs, validation_data=(x_val, self.data.get_y_val_values()))

    def predict(self, address_list: list):
        """Return the model output for ``address_list`` rounded to two decimals."""
        print(self.data.get_id_to_category())
        result = self.model.predict(address_list)

        return np.round(result, decimals=2)

    def evaluate(self):
        """Run ``model.evaluate`` on the test split of the data set."""
        self.model.evaluate(
            np.asarray(self.data.get_x_test_sentence_values()),
            self.data.get_y_test_values())

    def get_cleaner_method(self):
        """Return the text standardization callable."""
        return self.cleaner_method

    def get_data(self) -> DataSet:
        """Return the data set currently attached to the parser."""
        return self.data

    def get_model(self) -> keras.Model:
        """Return the underlying Keras model."""
        return self.model

    def set_data(self, data: DataSet) -> None:
        """Replace the attached data set.

        Raises:
            NotImplementedError: if ``data`` has a different number of tags than
                the data set currently attached.
        """
        if data.get_n_tag() == self.data.get_n_tag():
            self.data = data
        else:
            raise NotImplementedError('The number of tags does not correspond to the trained network')



In [20]:
from abc import ABC, abstractmethod

import tensorflow as tf




class NeuralParser(ABC):
    """Abstract interface every neural address parser must implement."""

    @abstractmethod
    def predict(self, address_list: list):
        """Run the parser over ``address_list`` and return its prediction."""

    @abstractmethod
    def train(self, batch_size=1200, epochs=50):
        """Train the parser with the given batch size and number of epochs."""

    @abstractmethod
    def evaluate(self):
        """Evaluate the parser."""

    @abstractmethod
    def get_cleaner_method(self):
        """Return the text cleaning callable used by the parser."""

    @abstractmethod
    def get_data(self) -> DataSet:
        """Return the data set attached to the parser."""

    @abstractmethod
    def get_model(self) -> tf.keras.Model:
        """Return the underlying Keras model."""

    @abstractmethod
    def set_data(self, data: DataSet) -> None:
        """Attach a new data set to the parser."""




In [19]:
import pandas as pd


class DataSet:
    """Plain container for a vocabulary, sequence limits, tag metadata and data splits.

    All values are stored exactly as received; the accessors return them unchanged.
    """

    def __init__(self, vocabulary_word, max_len_character, max_len_trigram, max_len_word, n_tag, id_to_category, input_dim,
                 x_train_sentence, x_test_sentence, x_val_sentence, y_train, y_test, y_val):
        # Sequence limits (kept as public attributes).
        self.max_len_character = max_len_character
        self.max_len_trigram = max_len_trigram
        self.max_len_word = max_len_word

        # Vocabulary and tag metadata.
        self.__vocabulary_word = vocabulary_word
        self.__n_tag = n_tag
        self.__id_to_category = id_to_category
        self.__input_dim = input_dim

        # Inputs of the train / test / validation splits.
        self.__x_train_sentence = x_train_sentence
        self.__x_test_sentence = x_test_sentence
        self.__x_val_sentence = x_val_sentence

        # Targets of the train / test / validation splits.
        self.__y_train = y_train
        self.__y_test = y_test
        self.__y_val = y_val

    # ---- split inputs -------------------------------------------------

    def get_x_train_sentence_values(self):
        """Return the training inputs."""
        return self.__x_train_sentence

    def get_x_test_sentence_values(self):
        """Return the test inputs."""
        return self.__x_test_sentence

    def get_x_val_sentence_values(self):
        """Return the validation inputs."""
        return self.__x_val_sentence

    # ---- split targets ------------------------------------------------

    def get_y_train_values(self):
        """Return the training targets."""
        return self.__y_train

    def get_y_test_values(self):
        """Return the test targets."""
        return self.__y_test

    def get_y_val_values(self):
        """Return the validation targets."""
        return self.__y_val

    # ---- vocabulary and limits ----------------------------------------

    def get_vocabulary(self):
        """Return the vocabulary passed as ``vocabulary_word``."""
        return self.__vocabulary_word

    def get_max_len_character(self):
        """Return the character-level sequence limit."""
        return self.max_len_character

    def get_max_len_trigram(self):
        """Return the trigram-level sequence limit."""
        return self.max_len_trigram

    def get_max_len_word(self):
        """Return the word-level sequence limit."""
        return self.max_len_word

    # ---- tag metadata -------------------------------------------------

    def get_n_tag(self):
        """Return the number of tags."""
        return self.__n_tag

    def get_input_dim(self):
        """Return the value passed as ``input_dim``."""
        return self.__input_dim

    def get_id_to_category(self):
        """Return the id -> category mapping."""
        return self.__id_to_category


In [18]:
# ---------------------------------------------------------------------------
# Correctly written reserved words (prefixes) for each address component.
# ---------------------------------------------------------------------------
STREET_NAME_PREFIX_CORRECT = ['calle', 'avenida', 'carretera', 'calzada', 'pasaje', 'callejon', 'callejuela',
                              'acera', 'terraplen', 'camino']
BETWEEN_PREFIX_CORRECT = ['entre']
CORNER_CONECTOR_PREFIX_CORRECT = ['esquina']
BUILDING_PREFIX_CORRECT = ['edificio', 'EDIFICIO', 'Edificio']
PROPERTY_PREFIX_CORRECT = ['número']
DISTANCE_PREFIX_CORRECT = ['kilometro', 'Kilometro', 'KILOMETRO']
DISTANCE_SPECIFICATION_PREFIX_CORRECT = ['½', '¼', '¾', '1/4', '1/2', '3/4']
LOCALITY_PREFIX_CORRECT = ['pueblo', 'Pueblo', 'poblado', 'Poblado', 'caserio', 'Caserio',
                           'reparto', 'Reparto', 'barrio', 'comunidad', 'Comunidad', 'distrito', 'Distrito']
OTHER_PREFIX_CORRECT = ['batey', 'bat', 'ciudad', 'finca', 'Finca']
PLACE_PREFIX_CORRECT = ['Bar', 'Club', 'Restaurante', 'Hotel', 'Centro comercial', 'Supermercado', 'Tienda minorista',
                        'Tienda mayorista', 'Mercado agropecuario', 'Bazar', 'Feria', 'Parque',
                        'Piscina', 'Zonas de escalada', 'Finca', 'Clínica', 'Hospital', 'Laboratorios']
ZONE_PREFIX_CORRECT = ['zona', 'Zona']
MUNICIPALITY_PREFIX_CORRECT = ['Municipio', 'municipio', 'MUNICIPIO']
# 'provincia' was misspelled as 'provicia' in this list of *correct* spellings.
PROVINCE_PREFIX_CORRECT = ['Provincia', 'provincia', 'PROVINCIA']
APARTMENT_PREFIX_CORRECT = ['apartamento']
BUILDING_SUBDIVISION_PREFIX_CORRECT = ['apto', 'bloque', 'blq', 'esc', 'escalera', 'piso', 'apartamento', 'Bloque']
CONJUNTION_CORRECT = ['y', 'e']

# ---------------------------------------------------------------------------
# Noisy variants (abbreviations, casing, misspellings) of the same prefixes.
# Repeated entries (e.g. 'calle', 'entre') are kept as they are; presumably
# they weight a random choice toward the common form - confirm against the
# code that samples from these lists.
# ---------------------------------------------------------------------------
STREET_NAME_PREFIX = ['calle', 'CALL', 'ave', 'avenida', 'ave.', 'Ave', 'Ave.', 'AVE', 'AVE.', 'carretera', 'ctra',
                      'Ctra.', 'Ctra', 'carr',
                      'Carr.', 'Carr', 'carret', 'Carret.', 'Carret', 'CARRET', 'CARRETE', 'calzada', 'czda.', 'calz',
                      'Calzada', 'Czda',
                      'Calz', 'czda.', 'calz.', 'Czda.', 'Calz.', 'pasaje', 'psje', 'callejon', 'cjon', 'callejuela',
                      'acera', 'terraplan', 'terr', 'Terraplen', 'camino',
                      'calle', 'calle', 'calle', 'calle', 'calle', 'calle', 'calle', 'calle', 'calle']
# Backslashes are written as '\\' so the literals no longer rely on invalid
# escape sequences ('\c'); the string values are unchanged.
BETWEEN_PREFIX = ['e/', 'e/c', '%', 'entre', 'entre', 'entre', 'entre', 'entre', 'E\\', 'E/', 'ent.', 'etr.',
                  'e\\c', '/', '\\', 'e /']
CORNER_CONNECTOR_PREFIX = ['esq', 'esquina']
LOCALITY_PREFIX = ['pueblo', 'localidad', 'Pueblo', 'poblado', 'pob', 'Poblado', 'caserio', 'cas', 'csrio', 'Caserio',
                   'batey', 'bat', 'ciudad', 'acera', 'terraplan', 'terr', 'Terraplen', 'camino']
# A missing comma used to merge 'edi' and 'EDF' into the single entry 'ediEDF'.
BUILDING_PREFIX = ['ed', 'edif', 'edf', 'edificio', 'EDIFICIO', 'Edificio', 'EDIF.', 'ED', 'e.', 'edf.',
                   'edi', 'EDF', 'Edif.', 'Edifi', 'edif.', 'ed.', '']
PROPERTY_PREFIX = ['#', 'no', 'S/n', 'S/N', 's/N', 's/n', 'nro.', 'nu', 'num', 'no.', 'num.', 'nu.', 'número',
                   'no', 'nro', '']
# Same backslash fix as above: 'K\M' and 'k\m' keep their values.
DISTANCE_PREFIX = ['Km.', 'KM.', 'Km', 'KM', 'K.', 'k.', 'kilometro', 'Kilometro', 'KILOMETRO', 'K\\M', 'K/M', 'k/m',
                   'k\\m', 'kmts', 'kmts.']
DISTANCE_SPECIFICATION_PREFIX = ['½', '¼', '¾', '1/4', '1/2', '3/4']
OTHER_PREFIX = ['batey', 'bat', 'ciudad', 'finca', 'Finca']
PLACE_PREFIX = ['Bar', 'Club', 'Restaurante', 'Hotel', 'Centro comercial', 'Supermercado', 'Tienda minorista',
                'Tienda mayorista', 'Mercado agropecuario', 'Bazar', 'Feria', 'Parque',
                'Piscina', 'Zonas de escalada', 'Finca', 'Clínica', 'Hospital', 'Laboratorios']
ZONE_PREFIX = ['', 'Zn.', 'zn.', 'zon.', 'z.', 'zona', 'zna.', 'za', 'zo.', 'Zona']
# A missing comma used to merge 'MUNICIPIO' and 'M.' into 'MUNICIPIOM.'.
MUNICIPALITY_PREFIX = ['Mun.', 'mun.', 'Mun', 'mun', 'Municipio', 'municipio', 'MUNICIPIO', 'M.', 'm.', 'mcpio.',
                       'Mno.', ' ']
PROVINCE_PREFIX = ['Prov.', 'prov.', 'PROV.', 'Prov', 'prov', 'Provincia', 'provicia', 'PROVINCIA', 'Pro.', 'PRO.',
                   'Pro', 'PRO', 'pro', 'prcia', 'provin.', 'prv.', '']
APARTMENT_PREFIX = ['apart.', 'apt.', 'apto', 'apto.', 'apartamento', 'apt', 'ap', 'aptto', '']
BUILDING_SUBDIVISION_PREFIX = ['apto', 'bloque', 'blq', 'esc', 'escalera', 'piso', 'Apto', 'apartamento', 'Bloque',
                               'apto.', 'Apto.', 'apart', 'APTO', 'apt', 'apto']
CONJUNCTION = ['y', 'e']




In [21]:
import pickle

class DataSetManage:
    """Helpers to persist ``DataSet`` objects with pickle and to export data frames."""

    @staticmethod
    def save(data_set: DataSet, route_and_name: str):
        """Pickle ``data_set`` to ``route_and_name + '.pickle'``.

        Raises:
            NotImplementedError: if ``route_and_name`` is not a string.
        """
        if not isinstance(route_and_name, str):
            raise NotImplementedError('route variable could be string instance')
        with open(route_and_name + '.pickle', "wb") as file:
            pickle.dump(data_set, file)

    @staticmethod
    def load(route_and_name) -> DataSet:
        """Load the object pickled at ``route_and_name + '.pickle'``.

        Raises:
            NotImplementedError: if ``route_and_name`` is not a string, or if the
                file does not contain a data set (it unpickles to ``None``).
        """
        # Validate before touching the file system. The previous check
        # (`type(route_and_name) is None`) could never be true and ran only
        # after the file had already been opened.
        if not isinstance(route_and_name, str):
            raise NotImplementedError('route variable could be string instance')

        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load data sets from trusted locations.
        with open(route_and_name + '.pickle', "rb") as file:
            data_set = pickle.load(file)

        if data_set is None:
            raise NotImplementedError('route and name data set is not found')

        return data_set

    @staticmethod
    def export_data(df, filename, file_format='csv'):
        """Write ``df`` to ``filename`` + extension as CSV or XLSX, without the index.

        Any other ``file_format`` value writes nothing.
        """
        if file_format == 'csv':
            df.to_csv(filename + '.csv', index=False)
        elif file_format == 'xlsx':
            df.to_excel(filename + '.xlsx', index=False)

In [22]:
import numpy as np

class Decoder:
    """Turns the per-word probability matrix of the model into classified addresses."""

    def __init__(self, id_to_cat: dict, cleaner_method):
        """Store the inverse (category -> id) of ``id_to_cat`` and the text cleaner.

        Args:
            id_to_cat: mapping from tag id to category name.
            cleaner_method: callable applied to each raw address; its result must
                expose ``.numpy()`` returning UTF-8 bytes.
        """
        self.cat_to_id = {v: k for k, v in id_to_cat.items()}
        self.cleaner_method = cleaner_method

    def _group_words_by_tag(self, raw_address, text_address):
        """Return ``{tag id: [words]}`` for one address.

        Each word of the cleaned text is assigned to the tag with the highest
        probability in the matching row of ``raw_address``. Words beyond the
        number of rows have no prediction and are skipped, where indexing the
        matrix by word position would raise ``IndexError``.
        """
        components = {tag_id: [] for tag_id in self.cat_to_id.values()}

        cleaned_text = self.cleaner_method(text_address)
        words = str(cleaned_text.numpy().decode('utf-8')).split()

        for word, probabilities in zip(words, raw_address):
            # argmax returns the first maximum, like list.index(max(...)).
            components[int(np.argmax(probabilities))].append(word)

        return components

    def _words_for(self, components, category):
        """Return the words tagged as ``category``, or ``None`` if the category is unknown."""
        if category not in self.cat_to_id:
            return None
        return components[self.cat_to_id[category]]

    def decoder_to_first_address_model(self, matrix_probability, text_address_list):
        """Decode every address into a ``ClassifiedAddressOne``.

        Args:
            matrix_probability: one probability matrix (rows = word positions,
                columns = tag ids) per address.
            text_address_list: the raw address texts, in the same order.

        Returns:
            A list with one ``ClassifiedAddressOne`` per decoded address. Each
            field holds the list of words tagged with that category, or ``None``
            when the category is not in the id/category mapping.
        """
        list_address_classified = []
        for raw_address, text_address in zip(matrix_probability, text_address_list):
            components = self._group_words_by_tag(raw_address, text_address)
            list_address_classified.append(
                ClassifiedAddressOne(
                    principal_street=self._words_for(components, 'principal_street'),
                    first_side_street=self._words_for(components, 'first_side_street'),
                    second_side_street=self._words_for(components, 'second_side_street'),
                    locality=self._words_for(components, 'locality'),
                    municipality=self._words_for(components, 'municipality'),
                    province=self._words_for(components, 'province'),
                    building=self._words_for(components, 'building'),
                    apartment=self._words_for(components, 'apartment'),
                    reserve_word=self._words_for(components, 'rw')))

        return list_address_classified

    def decoder_to_second_address_model(self, matrix_probability, text_address_list):
        """Decode every address into a ``ClassifiedAddressTwo``.

        Same inputs and conventions as ``decoder_to_first_address_model``; the
        result is built positionally from locality, municipality, province,
        building, apartment and reserved words.
        """
        list_address_classified = []
        for raw_address, text_address in zip(matrix_probability, text_address_list):
            components = self._group_words_by_tag(raw_address, text_address)
            list_address_classified.append(
                ClassifiedAddressTwo(
                    self._words_for(components, 'locality'),
                    self._words_for(components, 'municipality'),
                    self._words_for(components, 'province'),
                    self._words_for(components, 'building'),
                    self._words_for(components, 'apartment'),
                    self._words_for(components, 'rw')))

        return list_address_classified

In [23]:
!pip install dill


Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.6


In [24]:
import tensorflow as tf
import dill


class NeuralParserManage:
    """Saves and restores a neural parser: Keras model, data set and cleaner method."""

    @staticmethod
    def save_neural_parser(neural_parser: NeuralParser, route='assets/trained_models/model_type_one/',
                           name='model_pretrained'):
        """Persist ``neural_parser`` under ``route/name``.

        Three artifacts are written: ``model`` (Keras, ``save_format="tf"``),
        ``data_set`` (via ``DataSetManage.save``) and ``cleaner_method`` (dill).

        Raises:
            NotImplementedError: if ``route`` or ``name`` is not a string, is
                empty, or contains only white space.
        """
        if not isinstance(route, str):
            raise NotImplementedError('route variable could be string instance')
        if not isinstance(name, str):
            raise NotImplementedError('name_model variable could be string instance')
        # save model
        route = NeuralParserManage.__reformat_and_validate_route(route)
        name = NeuralParserManage.__reformat_and_validate_name_model(name)
        dir_model = route + '/' + name + '/model'
        dir_data_set = route + '/' + name + '/data_set'
        dir_cleaner_method = route + '/' + name + '/cleaner_method'

        neural_parser.get_model().save(dir_model, save_format="tf")
        DataSetManage.save(neural_parser.get_data(), route_and_name=dir_data_set)

        # Save cleaner method with dill; the context manager closes the file
        # handle, which the bare open() call never did.
        with open(dir_cleaner_method, 'wb') as file:
            dill.dump(neural_parser.get_cleaner_method(), file)

    @staticmethod
    def load_neural_parser(route='default', name='default') -> NeuralParser:
        """Rebuild a ``DeepParserModel`` from the artifacts stored under ``route/name``."""
        file_path = route + '/' + name
        # load data set
        data = DataSetManage.load(file_path + '/data_set')

        # load address cleaner
        # NOTE(security): dill.load executes arbitrary code from the file;
        # only load parsers from trusted locations.
        with open(file_path + '/cleaner_method', 'rb') as file:
            cleaner_method = dill.load(file)
        # load keras model; the cleaner and the split function are custom
        # objects referenced by the saved TextVectorization layers
        model = tf.keras.models.load_model(
            file_path + '/model',
            custom_objects={cleaner_method.__name__: cleaner_method, 'string_bytes_split': string_bytes_split})
        return DeepParserModel(data, cleaner_method=cleaner_method, model=model)

    @staticmethod
    def __reformat_and_validate_route(route: str):
        """Drop one trailing '/' from ``route`` and reject empty or blank values."""
        if len(route) == 0:
            # The message used to name 'name_model' although it validates the route.
            raise NotImplementedError('route variable cannot be an empty text')

        if route.endswith('/'):
            route = route[:-1]

        if len(route.split()) == 0:
            raise NotImplementedError('route variable cannot be an text with only withe space')

        return route

    @staticmethod
    def __reformat_and_validate_name_model(name_model: str):
        """Drop one trailing and one leading '/' from ``name_model`` and reject empty or blank values."""
        if len(name_model) == 0:
            raise NotImplementedError('name_model variable cannot be an empty text')
        if name_model.endswith('/'):
            name_model = name_model[:-1]
        # startswith() is safe on an empty string, so a name of '/' reaches the
        # validation below instead of failing with IndexError on name_model[0].
        if name_model.startswith('/'):
            name_model = name_model[1:]

        # Repeating validation because the previous steps may have removed characters
        if len(name_model) == 0:
            raise NotImplementedError('name_model variable cannot be an empty text')
        if len(name_model.split()) == 0:
            raise NotImplementedError('name_model variable cannot be an text with only withe space')

        return name_model


In [35]:
# Notebook cell: read the corpus, generate noisy addresses, adapt them into a
# data set and save it to disk.
print('Init')
# Load the corpus from the Excel file
data = pd.read_excel('./corpus/corpus_short_havana.xlsx')
generator = NoiseGeneratorModelTwo()
# Generate 20000 addresses with noise type 'eq'
data_with_noise = generator.generate_noise(data,type='eq', address_amount=20000)
# 0.80 / 0.05 / 0.15 are presumably the split fractions - confirm their order against DataSetAdapter.adapt
data_set = DataSetAdapter().adapt(data_with_noise, 0.80, 0.05, 0.15)
# Written to './dataset/EQ5_20000.pickle' (DataSetManage.save appends the extension)
DataSetManage.save(data_set, './dataset/EQ5_20000')



Init
Generate_random_noise_type_two


In [36]:
# Notebook cell: load the five previously saved data sets (EQ1..EQ5) from ./dataset.
print('Loading Datasets')
eq1_data = DataSetManage.load('./dataset/EQ1_20000')
eq2_data = DataSetManage.load('./dataset/EQ2_20000')
eq3_data = DataSetManage.load('./dataset/EQ3_20000')
eq4_data = DataSetManage.load('./dataset/EQ4_20000')
eq5_data = DataSetManage.load('./dataset/EQ5_20000')


Loading Datasets


In [None]:
# Notebook cell: build a new DeepParserModel on the EQ1 data set, train it and save it.
print('Creating Neural Model')

# No pre-built model is passed, so the constructor builds and compiles the network.
model = DeepParserModel(eq1_data, AddressCleaner.cleaner_method('custom_standardization'))

print('Correct Examples Triaining')

# 20 epochs with batches of 1000 samples
model.train(batch_size=1000, epochs=20)
# Artifacts are stored under ./trained_models/MCT_T2_1000_20_20k
NeuralParserManage.save_neural_parser(model, route='./trained_models/',
                                      name='MCT_T2_1000_20_20k')
print('Saved MCT_T2_1000_20_20k ')

Creating Neural Model
Model: "Model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 TextVectorization_Character (S  (None, 109)         0           ['input_1[0][0]']                
 equential)                                                                                       
                                                                                                  
 TextVectorization_Trigram (Seq  (None, 109)         0           ['input_1[0][0]']                
 uential)                                                                                         
                                                                        