In [1]:
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.src.utils import pad_sequences
from custom_preprocessor import preprocess
from fs.osfs import OSFS
import json

# Prediction

In [13]:
def get_columns(directory):
    """Return the names of the entries directly inside ``./{directory}``.

    The listing is also printed, so the cell output shows which entries
    were found.

    Args:
        directory: Directory to list, given relative to the current
            working directory.

    Returns:
        The list of entry names returned by ``OSFS.listdir('.')``.
    """
    with OSFS(f'./{directory}') as directory_fs:
        # List the directory once so the printed names and the returned
        # names always come from the same snapshot.
        entries = directory_fs.listdir('.')
    print(entries)
    return entries

def load_tokenizer_from_file(title):
    """Rebuild the tokenizer whose configuration was saved under ``title``.

    Reads ``./tokenizer_configs/tokenizer_config_{title}.json``, decodes it
    with ``json.load`` and hands the decoded value to ``tokenizer_from_json``.

    Args:
        title: Suffix identifying which tokenizer configuration file to read.

    Returns:
        The tokenizer object produced by ``tokenizer_from_json``.
    """
    config_path = f'./tokenizer_configs/tokenizer_config_{title}.json'
    with open(config_path, 'r') as config_file:
        serialized_config = json.load(config_file)
        restored_tokenizer = tokenizer_from_json(serialized_config)
    return restored_tokenizer

def tokenize(text, max_length, title):
    """Convert one text into a padded integer sequence.

    Args:
        text: The text to encode.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        title: Identifies which saved tokenizer to load.

    Returns:
        The result of ``pad_sequences`` applied to the single encoded text.
    """
    # The tokenizer works on a batch, so the text is wrapped in a list.
    sequences = load_tokenizer_from_file(title).texts_to_sequences([text])
    return pad_sequences(sequences, maxlen=max_length)
    

def predict(text, max_length, language):
    """Classify ``text`` with the model saved for ``language``.

    The text is preprocessed, tokenized and padded, then fed to the model
    loaded from ``./models/model_{language}.keras``. Every class label is
    printed together with its score as a percentage rounded to one decimal.

    Args:
        text: The raw text to classify.
        max_length: Padding length forwarded to ``tokenize``.
        language: Key selecting the tokenizer config, the model file and the
            ``data_{language}`` directory whose entry names are the labels.

    Returns:
        The class label with the highest model score.
    """
    processed_text = preprocess(text)
    if language == 'families':
        print(f"Text before preprocessing: {text}")
        print(f"Text after preprocessing: {processed_text}")
    padded_sequence = tokenize(processed_text, max_length, language)
    model = load_model(f'./models/model_{language}.keras')
    # Only one text is submitted, so the first output row holds its scores.
    scores = np.array(model.predict([padded_sequence]))[0]
    # NOTE(review): this assumes the sorted entry names of data_{language}
    # line up with the model's output order; confirm against the training code.
    class_labels = np.unique(get_columns(f'data_{language}'))

    result = {}
    for index, score in enumerate(scores):
        result[class_labels[index]] = round(score * 100, 1)

    # Print keys and values of the result dictionary
    for key, value in result.items():
        print(f"{key}: {value}%")

    # Choose the winner from the raw scores: after rounding to one decimal,
    # two close classes can tie and the first label would win regardless of
    # which score was actually higher.
    return class_labels[int(np.argmax(scores))]

# Sentence to classify.
text_prediction = "Hvorfor er der så varmt herinde"

# Stage 1: predict the language family.
family = predict(text_prediction, 40, 'families')
print(family)

# Stage 2: run the family-specific model. Each family maps to the padding
# length and the model key used for that second prediction.
_family_routes = {
    'Germanic': (49, 'germanic'),
    'Slavic': (48, 'slavic'),
    'Hellenic': (48, 'hellenic'),
    'Romance': (48, 'romance'),
    'Uralic': (48, 'uralic'),
}
if family in _family_routes:
    _max_length, _model_key = _family_routes[family]
    predicted_language = predict(text_prediction, _max_length, _model_key)



Text before preprocessing: Hvorfor er der så varmt herinde
Text after preprocessing: hvorfor er der sa varmt herinde hvorfor er der




['Hellenic', 'Romance', 'Germanic', 'Slavic', 'Uralic']
Germanic: 100.0%
Hellenic: 0.0%
Romance: 0.0%
Slavic: 0.0%
Uralic: 0.0%
Germanic
['Dutch', 'German', 'English', 'Norwegian', 'Danish', 'Swedish']
Danish: 53.1%
Dutch: 0.4%
English: 0.2%
German: 1.1%
Norwegian: 22.2%
Swedish: 23.0%
