In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Dropout, Flatten
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.src.utils import pad_sequences
from sklearn.model_selection import train_test_split
from custom_preprocessor import preprocess
from fs.osfs import OSFS
import json

# Prediction

In [5]:
import time


def get_columns(directory):
    """Return the names of the entries directly inside ``./{directory}``.

    The listing is also printed so that it appears in the notebook output.

    Args:
        directory: Path of the directory to inspect, relative to the current
            working directory.

    Returns:
        The list produced by ``OSFS.listdir('.')`` for that directory. No
        sorting is applied here; callers that need a stable order must sort
        the result themselves.
    """
    with OSFS(f'./{directory}') as data_fs:
        # List once: the printed and the returned entries are then guaranteed
        # to be the same object, and the directory is only scanned one time.
        entries = data_fs.listdir('.')
    print(entries)
    return entries

def load_tokenizer_from_file(title):
    """Rebuild the tokenizer whose configuration was saved under ``title``.

    Args:
        title: Suffix of the config file name; the file read is
            ``./tokenizer_configs/tokenizer_config_{title}.json``.

    Returns:
        Whatever ``tokenizer_from_json`` builds from the decoded JSON content
        of that file.
    """
    config_path = f'./tokenizer_configs/tokenizer_config_{title}.json'
    with open(config_path, 'r') as handle:
        serialized_config = json.load(handle)
    return tokenizer_from_json(serialized_config)

def tokenize(text, max_length, title):
    """Turn ``text`` into the fixed-length integer sequence a model expects.

    Args:
        text: The (already preprocessed) string to encode.
        max_length: Value forwarded to ``pad_sequences`` as ``maxlen``.
        title: Name of the saved tokenizer configuration to load.

    Returns:
        The result of ``pad_sequences`` applied to the single encoded text,
        i.e. a batch containing one sequence.
    """
    # The text is wrapped in a list because the tokenizer works on batches.
    sequences = load_tokenizer_from_file(title).texts_to_sequences([text])
    return pad_sequences(sequences, maxlen=max_length)
    

def predict(text, max_length, language, export_tfjs=True):
    """Classify ``text`` with the saved ``language`` model and return the top label.

    The function preprocesses and tokenizes the text, loads
    ``./models/model_{language}.keras``, runs a prediction and prints the
    percentage assigned to every class.

    Args:
        text: Raw input string.
        max_length: Sequence length forwarded to ``tokenize``.
        language: Model name; selects the tokenizer config, the model file and
            the ``data_{language}`` directory whose entry names are used as
            class labels.
        export_tfjs: When true (the default, matching the historical
            behaviour) the loaded model is also written to
            ``./converted_models/model_{language}`` with TensorFlow.js.
            Pass ``False`` to predict without touching the disk.

    Returns:
        The class label with the highest predicted probability.
    """
    cleaned_text = preprocess(text)
    if language == 'families':
        print(f"Text before preprocessing: {text}")
        print(f"Text after preprocessing: {cleaned_text}")
    model_input = tokenize(cleaned_text, max_length, language)
    model = load_model(f'./models/model_{language}.keras')
    print(f'tokenized values for {language} {model_input}')

    # Only one text is submitted, so the first row holds all class scores.
    probabilities = np.array(model.predict([model_input]))[0]

    # NOTE(review): labels are the alphabetically sorted entry names of the
    # data directory; this assumes the model's output order matches that
    # ordering and that the directory holds nothing but class folders —
    # confirm against the training code.
    class_labels = np.unique(get_columns(f'data_{language}'))
    result = {
        class_labels[index]: round(probability * 100, 1)
        for index, probability in enumerate(probabilities)
    }

    if export_tfjs:
        # Imported lazily: tensorflowjs is only needed for the export step.
        import tensorflowjs as tfjs
        tfjs.converters.save_keras_model(model, f'./converted_models/model_{language}')

    # Print keys and values of the result dictionary
    for label, percentage in result.items():
        print(f"{label}: {percentage}%")

    # Decide on the raw probabilities rather than the rounded percentages, so
    # two classes that round to the same value cannot swap places.
    return class_labels[int(np.argmax(probabilities))]

# Two-stage prediction: first pick the language family, then run the model
# dedicated to that family to pick the concrete language.
text_prediction = "hvordan gar det med dig"
family = predict(text_prediction, 40, 'families')
print(family)

# Family label -> (model name, sequence length passed to predict()).
FAMILY_MODELS = {
    'Germanic': ('germanic', 49),
    'Slavic': ('slavic', 48),
    'Hellenic': ('hellenic', 48),
    'Romance': ('romance', 48),
    'Uralic': ('uralic', 48),
}

# Start from None so a family without a dedicated model does not leave
# `predicted_language` undefined for later cells.
predicted_language = None
family_model = FAMILY_MODELS.get(str(family))
if family_model is not None:
    model_name, model_max_length = family_model
    predicted_language = predict(text_prediction, model_max_length, model_name)

Text before preprocessing: hvordan gar det med dig
Text after preprocessing: hvordan gar det med dig hvordan gar det med dig
tokenized values for families [[ 2 17  3  9  2 14  1  7  2 13  1 14  2 14  4 17  2 15 18  6  9 14  3  5
   2 17  3  9  2 14  1  7  2 13  1 14  2 14  4 17]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
['Hellenic', 'Romance', 'Germanic', 'Slavic', 'Uralic']




failed to lookup keras version from the file,
    this is likely a weight only file
Germanic: 100.0%
Hellenic: 0.0%
Romance: 0.0%
Slavic: 0.0%
Uralic: 0.0%
Germanic
tokenized values for germanic [[ 0  0 12 17 10  6  7  4  3  1 13  4  6  1  7  2  5  1 14  2  7  1  7  8
  13  1 12 17 10  6  7  4  3  1 13  4  6  1  7  2  5  1 14  2  7  1  7  8
  13]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step




['Dutch', 'German', 'English', 'Norwegian', 'Danish', 'Swedish']
failed to lookup keras version from the file,
    this is likely a weight only file
Danish: 51.1%
Dutch: 0.0%
English: 0.0%
German: 0.0%
Norwegian: 30.2%
Swedish: 18.7%
