In [248]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [249]:
# Import the data
df = pd.read_csv('/Users/elvisechefu/Desktop/language detection/languages.csv')
df.head()

Unnamed: 0,text,language
0,ich denke es handelt sich hier um ein missvers...,german
1,ich habe tom gerade erst verlassen,german
2,tom versuchte mary nur zu ärgern,german
3,tom hat mir die hand geküsst,german
4,ich wusste dass dir das gefiele,german


In [250]:
# Get all unique languages values
print(df.groupby('language').nunique())
print(f"Total Length of dataset: {len(df)}")

            text
language        
english   275687
french    169693
german    199618
spanish   118686
Total Length of dataset: 763684


In [251]:
# Check for missing values
print(df.isnull().sum())

text        0
language    0
dtype: int64


In [252]:
# Explore the columns and their types
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763684 entries, 0 to 763683
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      763684 non-null  object
 1   language  763684 non-null  object
dtypes: object(2)
memory usage: 11.7+ MB
None


In [253]:
# Encode language column
# le = preprocessing.LabelEncoder()
# le.fit(df['language'])
# df['language'] = le.transform(df['language'])
# le.classes_

def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip (columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df , dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

df = onehot_encode(
    df,
    columns=['language'],
    prefixes=['lan']
)

In [254]:
df.head()

Unnamed: 0,text,lan_english,lan_french,lan_german,lan_spanish
0,ich denke es handelt sich hier um ein missvers...,False,False,True,False
1,ich habe tom gerade erst verlassen,False,False,True,False
2,tom versuchte mary nur zu ärgern,False,False,True,False
3,tom hat mir die hand geküsst,False,False,True,False
4,ich wusste dass dir das gefiele,False,False,True,False


In [255]:
y = df.drop('text', axis=1)
X = df['text']

In [256]:
y.head()

Unnamed: 0,lan_english,lan_french,lan_german,lan_spanish
0,False,False,True,False
1,False,False,True,False
2,False,False,True,False
3,False,False,True,False
4,False,False,True,False


In [257]:
X.head()

0    ich denke es handelt sich hier um ein missvers...
1                   ich habe tom gerade erst verlassen
2                     tom versuchte mary nur zu ärgern
3                         tom hat mir die hand geküsst
4                      ich wusste dass dir das gefiele
Name: text, dtype: object

In [258]:
# Split data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y , train_size=0.3, random_state=42)

In [259]:
# Turn pandas dataframe into TensorFlow Dataset
#trainset
raw_train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
#testset
raw_test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))

In [260]:
# Create batches
batch_size = 60
raw_train_dataset = raw_train_dataset.batch(batch_size)
raw_test_dataset = raw_test_dataset.batch(batch_size)

In [261]:
# Print three labels as example
for text_batch, label_batch in raw_train_dataset.take(1):
    for i in range(3):
       print('Review:', text_batch.numpy()[i])
       print("Label:", label_batch.numpy()[i])

Review: b'the only people standing in front of the building are policemen'
Label: [ True False False False]
Review: b'jusquo\xc3\xb9 \xc3\xaatesvous pr\xc3\xaats \xc3\xa0 aller '
Label: [False  True False False]
Review: b'la cantidad de papel producido por un pa\xc3\xads est\xc3\xa1 cercanamente relacionado a sus est\xc3\xa1ndares culturales'
Label: [False False False  True]


In [262]:
# Get number of unique words in entire dataset
from collections import Counter

results = Counter()
df['text'].str.split().apply(results.update) # Very computer intensive method
len(results)


104611

In [263]:
max_features = 50000  # Total words to vectorize
sequence_length = 20  # The length of a sentence

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Take a smaller sample for adaptation
sample_size = 1000  # Choose an appropriate size based on your dataset size
subset_for_adaptation = df['text'].sample(n=sample_size)

# Adapt to the smaller subset
vectorize_layer.adapt(subset_for_adaptation)


In [264]:
# Function to vectorize text
def vectorize_text(text):
    text = tf.expand_dims(text, -1)
    return tf.dtypes.cast(vectorize_layer(text), tf.float32)

In [265]:
# Retrieve a batch from the dataset
text_batch, label_batch = next(iter(raw_test_dataset))
first_review, first_label = text_batch[0], label_batch[0]

print("Review:", first_review)
print("Label:", first_label)
print("Vectorized review:", vectorize_text(first_review))

Review: tf.Tensor(b'the grass is always greener on the other side of the fence', shape=(), dtype=string)
Label: tf.Tensor([ True False False False], shape=(4,), dtype=bool)
Vectorized review: tf.Tensor(
[[  6.   1.  14. 138.   1.  98.   6.   1.   1.  30.   6.   1.   0.   0.
    0.   0.   0.   0.   0.   0.]], shape=(1, 20), dtype=float32)


In [266]:
# Apply the TextVectorization step to the train and test dataset
train_ds = raw_train_dataset.map(lambda x,y: (vectorize_text(x), y))

test_ds = raw_test_dataset.map(lambda x ,y: (vectorize_text(x) , y))



In [267]:
# Performance measures
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

CREATE THE ACTUAL MODEL RNN Model

In [268]:
tf.keras.backend.clear_session()

embedding_dim = 16

model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim, input_length=20), 
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation=tf.nn.relu), 
    layers.Dense(64, activation=tf.nn.relu), 
    layers.Dense(32, activation=tf.nn.relu),
    layers.Dense(4, activation=tf.nn.softmax), 
    layers.Dropout(0.5),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            800016    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               2176      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 4)                 132       
                                                        

In [269]:
model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

In [270]:
epochs = 1
history = model.fit(
    train_ds,
    epochs=epochs
)



In [271]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.055499229580163956
Accuracy:  0.9773092269897461


In [272]:
model.save('my_language_detection_model.h5')


  saving_api.save_model(


In [273]:
#prediction
to_predict = ["Servus was heißt du?"]

to_predict = vectorize_layer(to_predict)
prediction = model.predict(to_predict)

classes = ["English", "French", "German", "Spanish"]

highest_prediction = tf.math.argmax(prediction, 1).numpy()

print(classes[highest_prediction[0]])
print(f"Certainty: {prediction[0][highest_prediction][0] * 100}%")

German
Certainty: 99.89362359046936%


In [276]:
#prediction test 2
to_predict = ["Stop that immediate action", "I'm going to the movies tonight"]

to_predict = vectorize_layer(to_predict)
prediction = model.predict(to_predict)

classes = ["English", "French", "German", "Spanish"]

highest_prediction = tf.math.argmax(prediction, 1).numpy()

print(classes[highest_prediction[0]])
print(f"Certainty: {prediction[0][highest_prediction][0] * 100}%")

English
Certainty: 99.82247948646545%


In [277]:
to_predict = ["i don't undertand what in the world you're saying"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")


Predicted Language: English, Certainty: 100.00%


In [278]:
to_predict = ["cállate tu boca huele"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: Spanish, Certainty: 58.31%


In [279]:
to_predict = ["ton connard pue"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: French, Certainty: 81.30%


In [281]:
to_predict = ["you are a doppelganger"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: English, Certainty: 99.99%


In [282]:
to_predict = ["you are a doppelgänger"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: English, Certainty: 99.99%


In [283]:
to_predict = ["doppelgänger"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: English, Certainty: 49.60%


In [284]:
to_predict = ["é, je veux toucher tes cheveux s'il te plaît!!!!"]
prediction = model.predict(vectorize_layer(to_predict))
print(f"Predicted Language: {classes[prediction.argmax()]}, Certainty: {100 * prediction.max():.2f}%")

Predicted Language: French, Certainty: 99.96%
