In [2]:
import os
from text_processing.process_data import *
from training.train_dev_test import *

import warnings
warnings.filterwarnings('ignore')

## Data Preparation

In [3]:
# Paths: the language files live in 'lang_files/' and are resolved against the
# current working directory, so the notebook must be started from the folder
# that contains that directory.
files_folder = 'lang_files/'
files_path = os.path.join(os.getcwd(), files_folder)

# Initial (raw) data: compile every language file into a single frame and
# preview the first rows.
lang_data = compile_all_data(files_path)
lang_data.head()

Unnamed: 0,sentence,language
0,"Dili, 11 Outubru 2021 - Gabinete Apoiu Ativida...",tet
1,Treinamentu ne’e ninia objetivu prinsipál mak ...,tet
2,Iha loron daruak hosi treinamentu ne’e partisi...,tet
3,Partisipante na’in-56 iha treinamentu ne’e mai...,tet
4,Komunidade iha suku Bikeli ho Makadade agrades...,tet


In [4]:
# Data after preprocessing, with the length of each sentence added
# (the preview below shows the extra `sentence_length` column).
clean_data = clean_data_with_count(files_path)
clean_data.head()

Unnamed: 0,sentence,language,sentence_length
0,dili outubru gabinete apoiu atividade kónjuge...,tet,275
1,treinamentu ne’e ninia objetivu prinsipál mak ...,tet,232
2,iha loron daruak hosi treinamentu ne’e partisi...,tet,288
3,partisipante na’in iha treinamentu ne’e mai ho...,tet,82
4,komunidade iha suku bikeli ho makadade agrades...,tet,320


In [5]:
# Confirm that the data was cleaned: no sentence may be empty or a lone space.
# The two conditions are OR-ed on purpose. A sentence can never equal both ''
# and ' ' at the same time, so AND-ing them always selects zero rows and the
# check could never report a problem.
clean = clean_data[(clean_data['sentence'] == '') | (clean_data['sentence'] == ' ')]

# `clean` holds the offending rows, so the data is clean only when it is empty.
if len(clean) == 0:
    print("The data is cleaned.")
else:
    print("The data is NOT cleaned.")

The data is cleaned.


## Train and test model

### Split dataset

In [6]:
# Partition the cleaned data into train, development (validation) and test sets.
# NOTE(review): 0.3 and 0.5 presumably mean "hold out 30%, then halve it into
# dev and test" (the printed sizes are consistent with 70/15/15) — confirm
# against train_dev_test_split's signature.
(X_train, y_train,
 X_dev, y_dev,
 X_test, y_test) = train_dev_test_split(clean_data, 0.3, 0.5)

# Report how many samples ended up in each split.
for label, split in (
    ("Train set size:", X_train),
    ("Dev set size:", X_dev),
    ("Test set size:", X_test),
):
    print(label, len(split))

Train set size: 74258
Dev set size: 15913
Test set size: 15913


### Train and evaluate model

In [7]:
# Compare candidate classifiers across tokenisation levels on the dev set.

analyzers = ['char_wb', 'word']
model_lists = [
    LinearSVC(),
    LogisticRegression(multi_class='ovr'),
    MultinomialNB(),
]

# NOTE(review): the positional 1, 6, 1 presumably describe the n-gram sizes to
# sweep — confirm against compare_models' signature.
compare_models(
    model_lists, analyzers,
    1, 6, 1,
    X_train, y_train,
    X_dev, y_dev,
)

Model: LinearSVC()
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9798
	n_gram 2 --> accuracy:  0.9927
Analyzer: word
	n_gram 1 --> accuracy:  0.9938
	n_gram 2 --> accuracy:  0.9666
Model: LogisticRegression(multi_class='ovr')
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9789
	n_gram 2 --> accuracy:  0.9924
Analyzer: word
	n_gram 1 --> accuracy:  0.9913
	n_gram 2 --> accuracy:  0.9519
Model: MultinomialNB()
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9412
	n_gram 2 --> accuracy:  0.9911
Analyzer: word
	n_gram 1 --> accuracy:  0.9960
	n_gram 2 --> accuracy:  0.9708


In [22]:
# Train the selected configuration and check it against the dev set.

# Selected configuration: TF-IDF over character 5-grams ('char_wb' analyzer)
# feeding a multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(5, 5))
classifier = MultinomialNB()
model = train_model(vectorizer, classifier, X_train, y_train)

# Dev-set evaluation of the trained model.
evaluate_model(model, X_dev, y_dev)

Accuracy:  0.997109281719349
Confusion Matrix:  [[5050    2    3    2]
 [  14 4254    2    0]
 [  13    2 4020    1]
 [   5    0    2 2543]]
Classification Report:                precision    recall  f1-score   support

          en       0.99      1.00      1.00      5057
          id       1.00      1.00      1.00      4270
          pt       1.00      1.00      1.00      4036
         tet       1.00      1.00      1.00      2550

    accuracy                           1.00     15913
   macro avg       1.00      1.00      1.00     15913
weighted avg       1.00      1.00      1.00     15913



### Test model

In [23]:
# Evaluate the trained model on the test set, which was not used for training
# or for the dev-set evaluation above.
evaluate_model(model, X_test, y_test)

Accuracy:  0.9967950732105826
Confusion Matrix:  [[5001    3    3    3]
 [  15 4260    3    2]
 [   9    3 4008    3]
 [   3    0    4 2593]]
Classification Report:                precision    recall  f1-score   support

          en       0.99      1.00      1.00      5010
          id       1.00      1.00      1.00      4280
          pt       1.00      1.00      1.00      4023
         tet       1.00      1.00      1.00      2600

    accuracy                           1.00     15913
   macro avg       1.00      1.00      1.00     15913
weighted avg       1.00      1.00      1.00     15913



### Further test

In [24]:
# Spot-check the model on two short phrases, one prediction call per phrase.
first_phrase = "Organizasaun mundial saúde"
second_phrase = "Tribunál rekursu rejeita kandidatura partidu"

test1 = model.predict([first_phrase])
test2 = model.predict([second_phrase])
print(f"Test 1 classification: {test1}\nTest 2 classification: {test2}")

Test 1 classification: ['tet']
Test 2 classification: ['tet']


In [25]:
# Sample sentences to classify.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# decision_function cell further down reads the same variable.
input = [
    "Deklarasaun Universál Direitus Umanus",
    "Indonesia merupakan negara terluas ke-14 sekaligus",
    "A língua portuguesa, também designada português, é uma língua",
    "Deklarasaun ne'e inklui artigu 30 ne'ebé esplika Asembleia Jerál",
    "Can we feed a future population of 10 billion people a healthy?",
]

# Naive Bayes and Logistic Regression
# (this cell needs a model that provides predict_proba)
pred_probs = model.predict_proba(input)

# Print each sentence followed by one "<class label> <probability>" line per class.
for sentence, probs in zip(input, pred_probs):
    print(sentence)
    for lang, prob in zip(model.classes_, probs):
        print(lang, prob)

Deklarasaun Universál Direitus Umanus
en 1.7198102347965515e-05
id 7.656943219400005e-05
pt 0.0001802486040840422
tet 0.9997259838613715
Indonesia merupakan negara terluas ke-14 sekaligus
en 1.3050918693531165e-07
id 0.9999995025902451
pt 9.055449712082101e-08
tet 2.7634607120579004e-07
A língua portuguesa, também designada português, é uma língua
en 0.00018657712215813118
id 2.970099994212303e-05
pt 0.998175916996359
tet 0.0016078048815381433
Deklarasaun ne'e inklui artigu 30 ne'ebé esplika Asembleia Jerál
en 2.916091141335691e-08
id 4.085156091273892e-08
pt 1.4132462405706621e-07
tet 0.9999997886629056
Can we feed a future population of 10 billion people a healthy?
en 0.9999978448630447
id 1.321850425292166e-07
pt 1.061105899088093e-06
tet 9.61846016528177e-07


In [None]:
# SVM - LinearSVC
# This cell only applies to models that expose decision_function (for example
# LinearSVC). The model trained above uses MultinomialNB, which does not, so
# the call is guarded to keep "Restart & Run All" from failing here.
if hasattr(model, "decision_function"):
    pred_result = model.predict(input)
    pred_probability = model.decision_function(input)
    for i in range(len(input)):
        # .argmax() is used instead of np.argmax so the cell does not depend
        # on `np` being brought in by the star imports at the top.
        print(f"{input[i]} ---> {pred_result[i]} ---> {pred_probability[i].argmax()} --> {pred_probability[i]} ")
else:
    print("Skipped: the selected model has no decision_function; use predict_proba instead.")

### Save model [if required]

In [26]:
import joblib

# Save the trained model to a file.
model_path = 'model_best/tet-lid-model_NB_best_ng5chars.pkl'

# joblib.dump does not create missing parent folders, so make sure the target
# directory exists first (otherwise this fails on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Kept as the last expression so the list of written files is displayed.
joblib.dump(model, model_path)

['model_best/tet-lid-model_NB_best_ng5chars.pkl']

In [48]:
# Load the saved model back from its file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load model files you created yourself or otherwise trust.
saved_model = joblib.load('model_best/tet-lid-model_NB_best_ng5chars.pkl')

In [29]:
from unidecode import unidecode

# Sample headline typed with Unicode "mathematical bold" letters and curly
# quotes instead of plain ASCII characters.
text = """
𝐋𝐚 𝐏𝐚𝐫𝐭𝐢𝐬𝐢𝐩𝐚 𝐀𝐧𝐢𝐯𝐞𝐫𝐬𝐚𝐫𝐢𝐮 𝐕𝐞𝐭𝐞𝐫𝐚𝐧𝐮 𝐁𝐚 𝐃𝐚𝐥𝐚 𝐕𝐈 𝐗𝐚𝐧𝐚𝐧𝐚: “𝐇𝐚’𝐮 𝐋𝐚 𝐇𝐚𝐥𝐨 𝐁𝐮𝐚𝐭 𝐈𝐝𝐚, 𝐇𝐮𝐬𝐮 𝐁𝐚 𝐉𝐞𝐧𝐞𝐫𝐚𝐥 𝐧𝐨 𝐅𝐮𝐧𝐝𝐚𝐝𝐨𝐫 𝐒𝐢𝐫𝐚, 𝐓𝐚𝐧𝐛𝐚 𝐇𝐚’𝐮 𝐍𝐞’𝐞 𝐒𝐨𝐞𝐡𝐚𝐫𝐭𝐨 𝐊𝐞𝐝𝐮𝐚” 
"""

# Transliterate the styled characters to plain ASCII (see the output below).
# Presumably this makes the text resemble the plain sentences the model was
# trained on — TODO confirm against the preprocessing code.
plain_text = unidecode(text)
plain_text

'\nLa Partisipa Aniversariu Veteranu Ba Dala VI Xanana: "Ha\'u La Halo Buat Ida, Husu Ba Jeneral no Fundador Sira, Tanba Ha\'u Ne\'e Soeharto Kedua" \n'

In [49]:
# Classify the ASCII-transliterated headline with the reloaded model.
saved_model.predict([plain_text])

array(['tet'], dtype='<U3')