## PJ : Faris Fadhilah / 13518026 ##

### Import Libraries ###

In [1]:
# Libraries
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf # create neural networks
from tensorflow.keras import Sequential # create squential NN model
from tensorflow.keras.layers import Dense # implements the operation: output = activation(dot(input, kernel) + bias)
from tensorflow.keras.models import load_model # load saved model
from tensorflow.keras.utils import plot_model # plot model architecture
import time

### Import Dataset ###

In [2]:
# Import Dataset
data = pd.read_csv("dataset.csv")
data = data.iloc[:6000,:]
# Separate Features
X = data["Text"]
y = data["Language"]

### Label Encoding ###

In [3]:
le = LabelEncoder()
y = le.fit_transform(y)

### Text Preprocessing ##

In [4]:
# creating a list for appending the preprocessed text
data_list = []
# iterating through all the text
for text in X:
       # removing the symbols and numbers
        text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
        text = re.sub(r'[[]]', ' ', text)
        # converting the text to lower case
        text = text.lower()
        # appending to data_list
        data_list.append(text)

  import sys


### Bag of Words ###

In [5]:
cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()

### Train Test Splitting ###

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

### Define Model Multi Layer Perceptron (MLP) ###

In [7]:
# input size hyperparameter
INPUT_SIZE = x_train.shape[1]
INPUT_SIZE
# outputsize hyperparatmeter
OUTPUT_SIZE = len(data['Language'].unique())
OUTPUT_SIZE
# epochs and batch_size hyperparameters
EPOCHS = 30
BATCH_SIZE = 128
# creating the MLP model
model = Sequential([Dense(100, activation='relu', kernel_initializer='he_normal', input_shape=(INPUT_SIZE,)),Dense(80, activation='relu', kernel_initializer='he_normal'), Dense(50, activation='relu', kernel_initializer='he_normal'),Dense(OUTPUT_SIZE, activation='softmax')])
# compiling the MLP model
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Train Model ###

In [8]:
# fitting the MLP model
start_time = time.time()
hist = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.3, verbose=2)
print("Train Time Execution: %.3f seconds" % (time.time() - start_time))

Epoch 1/30
27/27 - 8s - loss: 2.2964 - accuracy: 0.5643 - val_loss: 1.2165 - val_accuracy: 0.8813 - 8s/epoch - 286ms/step
Epoch 2/30
27/27 - 6s - loss: 0.5973 - accuracy: 0.9625 - val_loss: 0.4316 - val_accuracy: 0.9486 - 6s/epoch - 236ms/step
Epoch 3/30
27/27 - 6s - loss: 0.1014 - accuracy: 0.9985 - val_loss: 0.2994 - val_accuracy: 0.9507 - 6s/epoch - 236ms/step
Epoch 4/30
27/27 - 6s - loss: 0.0149 - accuracy: 0.9997 - val_loss: 0.2691 - val_accuracy: 0.9542 - 6s/epoch - 235ms/step
Epoch 5/30
27/27 - 6s - loss: 0.0055 - accuracy: 1.0000 - val_loss: 0.2641 - val_accuracy: 0.9535 - 6s/epoch - 237ms/step
Epoch 6/30
27/27 - 6s - loss: 0.0034 - accuracy: 1.0000 - val_loss: 0.2605 - val_accuracy: 0.9542 - 6s/epoch - 232ms/step
Epoch 7/30
27/27 - 6s - loss: 0.0024 - accuracy: 1.0000 - val_loss: 0.2577 - val_accuracy: 0.9528 - 6s/epoch - 235ms/step
Epoch 8/30
27/27 - 6s - loss: 0.0019 - accuracy: 1.0000 - val_loss: 0.2550 - val_accuracy: 0.9535 - 6s/epoch - 233ms/step
Epoch 9/30
27/27 - 6s - 

### Evaluate Model ####

In [9]:
# summary of the MLP model
model.summary()
# architetcure of the MLP model
plot_model(model, show_shapes=True)
# evaluating the loss and accuracy of the model
loss, accuracy = model.evaluate(x_test, y_test, verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               10233800  
                                                                 
 dense_1 (Dense)             (None, 80)                8080      
                                                                 
 dense_2 (Dense)             (None, 50)                4050      
                                                                 
 dense_3 (Dense)             (None, 22)                1122      
                                                                 
Total params: 10,247,052
Trainable params: 10,247,052
Non-trainable params: 0
_________________________________________________________________
38/38 - 1s - loss: 0.2425 - accuracy: 0.9583 - 1s/epoch - 31ms/step


### Save Model ###

In [10]:
# saving the model
model.save('language_detection_model2.h5')

### Function Predict Sentence ###

In [11]:
# function for predicting language
def predict(text):
    x = cv.transform([text])
    lang = model.predict(x)
    lang = np.argmax(lang)
    print("This text is in",le.inverse_transform([lang])[0])

### Load Model & Predict ###

In [12]:
start_time = time.time()
# loading the model
model = load_model('language_detection_model2.h5')
# predict language
predict("We use a dataset that contains 22 selective languages from the original dataset which includes the following Languages")
print("Predict Time Execution: %.3f seconds" % (time.time() - start_time))

This text is in English
Predict Time Execution: 0.599 seconds
