In [998]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dropout, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences
from sklearn.model_selection import train_test_split

# Get data
Pandas is used to load the data from the project directory.
The data is a .csv file that was created from book texts and prepared so that the text is listed in the file word-by-word, each entry labelled with its language.

In [999]:
# Load the labelled text table and shuffle all rows with a fixed seed so the
# row order is the same on every run.
df = pd.read_csv('dataset.csv').sample(frac=1.0, random_state=42)
df.head()

Unnamed: 0,Word,Language
59324,you doing checking for letter,English
111343,professoren rystede,Danish
63047,excellent” said percy,English
111514,apropos skal,Danish
138835,mano pendía un enorme bolso rojo,Spanish


# Prepare data
Divide the data into X and y: X is the input and y is the desired output.
One-hot encode y with Pandas so that each language is represented as its own category.

In [1000]:
# Getting X - the input values
words = df['Word'].astype(str).tolist()  # force every entry to str without writing into a slice of df
tokenizer = Tokenizer(char_level=True)  # Character-level: every character gets its own integer id
tokenizer.fit_on_texts(words)  # Learn the character vocabulary from the data
X_sequences = tokenizer.texts_to_sequences(words)  # Encode each text as a list of character ids
input_dim = max(len(seq) for seq in X_sequences)  # Longest encoded text; every row is padded to this length
X = pad_sequences(
    X_sequences,
    maxlen=input_dim,
    padding='post'
)  # Zero-pad at the end so all input rows have the same length
print(f"Rows: {len(X)}")

# Getting y - the output values
y_pre_training = df.iloc[:, -1:]  # last column holds the language label
# One-hot encode with an explicit float dtype (the pandas default is bool or uint8 depending on version)
y = pd.get_dummies(y_pre_training, dtype=np.float32).values
print(f"Amount of languages to identify: {y.shape[1]}")

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Conv1D consumes (samples, timesteps, channels): add the single channel axis to
# BOTH splits so training and validation inputs have the same rank.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

Rows: 144165
Amount of languages to identify: 13


In [1001]:
# Build and compile the classifier: Conv1D -> LSTM -> Dropout -> average pooling -> softmax.
num_languages = y.shape[1]  # one output unit per one-hot language column

model = Sequential()
# NOTE(review): the number of filters is tied to the padded sequence length
# (input_dim) — confirm this coupling is intended.
model.add(Conv1D(input_dim, 5, activation='relu'))  # Convolutional layer
model.add(LSTM(512, return_sequences=True))  # keep the full sequence for the pooling layer below
model.add(Dropout(0.6))
model.add(GlobalAveragePooling1D())  # Pooling layer
model.add(Dense(num_languages, activation='softmax'))
adam = optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [1002]:
# Train the network and evaluate on the validation split after every epoch;
# the returned History object keeps the per-epoch metrics used for plotting.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    verbose=1,
)

Epoch 1/20
[1m 30/113[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:37[0m 1s/step - accuracy: 0.2553 - loss: 2.4032

KeyboardInterrupt: 

In [None]:
%matplotlib inline
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title("Model's MSE")
plt.legend(['Train', 'Val'], loc='upper right')
plt.ylabel('Loss')
plt.xlabel('Epoch')

# Prediction

In [None]:
prediction_word = "quien eres tu"

# Encode the text exactly like the training data: same tokenizer, padding at the
# END of the sequence (training used padding='post'), plus the channel axis.
encoded_word = tokenizer.texts_to_sequences([prediction_word])
padded_word = pad_sequences(
    encoded_word,
    maxlen=input_dim,
    padding='post',
    truncating='post',  # for texts longer than input_dim keep the beginning, aligned with position 0
)
model_input = padded_word.reshape(padded_word.shape[0], padded_word.shape[1], 1)
prediction = model.predict(model_input)
output_probabilities = np.array(prediction)

# np.unique returns the labels sorted.
# NOTE(review): this is assumed to match the column order of the one-hot encoding — confirm.
class_labels = np.unique(y_pre_training)
result = [
    f"{class_labels[index]} - {probability * 100:.1f}%"
    for index, probability in enumerate(output_probabilities[0])
]

for r in result:
    print(r)


In [None]:
# Persist the trained network to disk.
# NOTE(review): the fitted tokenizer, input_dim and the label order are not
# saved here — presumably they are needed to reuse the model elsewhere; confirm.
model_path = './model.keras'
model.save(model_path)