In [1302]:
import json
import re

FILE_BASE_PATH                              = "./data/data/ende/"
DEFAULT_OUTPUT_DATASET_FILE_NAME            = "dataset.txt"
CLEAR_HTML_TAG_REGULAR_EXPRESSION           = re.compile("<.*?>")
CLEAR_SPECIAL_HTML_CHARS_REGULAR_EXPRESSION = re.compile("(&)\w+?;")
UKNOWN_CHARS_REGULAR_EXPRESSION             = re.compile("[^a-zA-Z0-9 \t.,-ä)'$’\"\%(#—“”!–öü+éá‘€*ó•\−―ʼäëïöüçáéíóúñ✦„─]")

In [1303]:

def readJsonFile(path): 
    file = open(path, encoding="utf-8")

    fileAsJson = json.load(file)

    file.close()

    return fileAsJson

def removeHtmlTagsFromString(originalString):
    return re.sub(CLEAR_HTML_TAG_REGULAR_EXPRESSION, "", originalString)

def removeHtmlSpecialCharsFromString(originalString):
    return re.sub(CLEAR_SPECIAL_HTML_CHARS_REGULAR_EXPRESSION, "", originalString)

def removeUnknowChars(originalString): 

    if(len(re.findall(UKNOWN_CHARS_REGULAR_EXPRESSION, originalString))):
        #print("removing string: ", originalString)
        return ""
    
    return originalString

def clearString(originalString):
    

    x = removeHtmlTagsFromString(originalString)
    x = removeHtmlSpecialCharsFromString(x)
    x = removeUnknowChars(x)
    x = x.replace("\t", " ") 
    x = x.replace("voilà!", "voila")
    x = x.lower()
    return x

def readJsonClean(jsonText):
    return {key : clearString(value) for key, value in jsonText["text"].items()}

def createPairsArray(source, target):
    return [f"{value}\t{target.get(key)}\n" for key, value in source.items()]
    
def createDatasetFile(source, target, destinationFileName=DEFAULT_OUTPUT_DATASET_FILE_NAME):
    outputFile = open(destinationFileName, "w", encoding="utf-8")

    i = 0
    for key, value in source.items():
        #if(len(target.get(key)) < 3 or len(value) < 3):
        #    print(f"{value}\t{target.get(key)}\n")
        outputFile.write(f"{value}\t{target.get(key)}\n")  

    outputFile.close()

def createDatasetFileFromRows(rows, destinationFileName=DEFAULT_OUTPUT_DATASET_FILE_NAME):
    outputFile = open(destinationFileName, "w", encoding="utf-8")

    i = 0
    for row in rows:
        #if(len(target.get(key)) < 3 or len(value) < 3):
        #    print(f"{value}\t{target.get(key)}\n")
        outputFile.write(row) 
 

    outputFile.close()


In [1304]:
sourFilePath   = FILE_BASE_PATH + "ende_en_dev.json"
targetFilePath = FILE_BASE_PATH + "ende_de_dev.json"

In [1305]:

jsonSource = readJsonFile(sourFilePath)
jsonTarget = readJsonFile(targetFilePath) 

jsonSourceClean = readJsonClean(jsonSource)
jsonTargetClean = readJsonClean(jsonTarget)

createDatasetFile(jsonSourceClean, jsonTargetClean)
 

In [1296]:
sourFilePath   = FILE_BASE_PATH + "ende_en_train.json"
targetFilePath = FILE_BASE_PATH + "ende_de_train.json"

In [1297]:
import string
import tensorflow as tf
jsonSource = readJsonFile(sourFilePath)
jsonTarget = readJsonFile(targetFilePath) 

jsonSourceClean = readJsonClean(jsonSource)
jsonTargetClean = readJsonClean(jsonTarget)  

createDatasetFile(jsonSourceClean, jsonTargetClean, "train.txt") 
pairsArray = createPairsArray(jsonSourceClean, jsonTargetClean) 

In [1306]:
import io
import random

from tensorflow.keras import layers
#text_file = "./train.txt"
#with open(text_file, encoding='utf-8') as f:
#    lines = f.read().split("\n")[:-1]


lines = pairsArray

text_pairs = []

for line in lines:
    english, port = line.split("\t")
    port = "[start] " + port + " [end]"
    text_pairs.append((english, port))

train_english_texts = [pair[0] for pair in text_pairs] 
vocab_size = 15000    # O modelo apneas vai conhecer 15000 palavras
sequence_length = 200  # cada frase vai ter 20 palavrasg


source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length
)

max = len(train_english_texts)
a = 1
b = max 
current = b // 2

hasError = False

while(( b- a) > 1):
    half = a + (b - a) // 2
    #print("\ncurrent half: ",half)
    try:
        source_vectorization = layers.TextVectorization(
            max_tokens=vocab_size,
            output_mode="int",
            output_sequence_length=sequence_length
        )
        
        #print(f"lookup 1 {a} {half}")
        source_vectorization.adapt(train_english_texts[a : half])
        source_vectorization.get_vocabulary() 
    except UnicodeDecodeError:
        hasError = True
        print(f"the error is between {a} {half}  ")
        b = half
        continue
    
    if(a >= b):
        break

    try:
        source_vectorization = layers.TextVectorization(
            max_tokens=vocab_size,
            output_mode="int",
            output_sequence_length=sequence_length
        )
        #print(f"lookup 2 {half} {b}")
        source_vectorization.adapt(train_english_texts[half:b])
        source_vectorization.get_vocabulary() 
    except UnicodeDecodeError: 
        hasError = True
        print(f"the error is between {half} {b}")
        a = half 

    if(not hasError):
        print("no enconding error found in the text")
        break

no enconding error found in the text


In [1299]:
print(len(train_english_texts))
print(train_english_texts[a :b+1 ])
pairsArray[a]

100611
['voilà! joined reports and lightning experience together at last.', 'to hide the summary row, select hide summary from the options menu.']


'voilà! joined reports and lightning experience together at last.\tvoilà! verbundene berichte und lightning experience sind endlich vereint.\n'

In [1300]:
train_english_texts.pop(a)
pairsArray.pop(a)

'voilà! joined reports and lightning experience together at last.\tvoilà! verbundene berichte und lightning experience sind endlich vereint.\n'

In [1301]:
createDatasetFileFromRows(pairsArray, "train.txt")