In [1]:
from text_processing.process_data import ProcessData
from training.train_model import TrainModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from text_processing.process_data import ProcessData
import warnings

# Suppress every warning for the rest of the session to keep cell outputs compact.
# NOTE(review): this is a blanket filter, so deprecation and convergence warnings
# raised by later cells are hidden as well.
warnings.filterwarnings("ignore")


## Data Preparation

In [2]:
# Instantiate the project helpers used throughout the notebook.
process_data = ProcessData()
# The models are trained on character-level and word-level features, so the
# sentence length is deliberately not normalized here.
dataset = process_data.initial_clean_data_with_count()
# TrainModel receives the dataset once and provides the split / train / evaluate helpers.
train_models = TrainModel(dataset)


In [3]:
# Preview the first rows of the dataset to sanity-check its columns.
dataset.head()


Unnamed: 0,sentence,language,sentence_length
0,dili outubru gabinete apoiu atividade kónjuge...,tet,275
1,treinamentu ne’e ninia objetivu prinsipál mak ...,tet,232
2,iha loron daruak hosi treinamentu ne’e partisi...,tet,288
3,partisipante na’in iha treinamentu ne’e mai ho...,tet,82
4,komunidade iha suku bikeli ho makadade agrades...,tet,320


In [4]:
# Confirm that data is cleaned: no sentence may be missing, empty or whitespace-only.
# The previous mask AND-ed `== ""` with `== " "`; a value can never equal both, so it
# matched no rows and the check passed unconditionally.
sentences = dataset["sentence"]
is_blank = sentences.fillna("").astype(str).str.strip().eq("")
blank_rows = dataset[is_blank]

# Plain if/else instead of try/assert/except: asserts are stripped under `python -O`.
if blank_rows.empty:
    print("The data is cleaned.")
else:
    print("The data is NOT cleaned.")


The data is cleaned.


## Train and test model

### Split dataset

In [5]:
# Partition the data into train, development (validation) and test subsets.
# NOTE(review): 0.3 and 0.5 presumably mean "hold out 30%, then split that half/half
# into dev and test" (the sizes printed below are roughly 70/15/15) — confirm against
# TrainModel.train_dev_test_split.
X_train, y_train, X_dev, y_dev, X_test, y_test = train_models.train_dev_test_split(
    0.3, 0.5
)

# Report how many samples landed in each subset.
print(f"Train set size: {len(X_train)}")
print(f"Dev set size: {len(X_dev)}")
print(f"Test set size: {len(X_test)}")


Train set size: 80362
Dev set size: 17220
Test set size: 17221


### Train and evaluate model

In [7]:
# Benchmark several classifiers with both analyzers; the dev split is passed for scoring.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument —
# verify against the version pinned for this project.
model_lists = [
    LinearSVC(),
    LogisticRegression(multi_class="ovr"),
    MultinomialNB(),
]
analyzers = ["char_wb", "word"]

# NOTE(review): the positional 1, 6, 1 look like the n-gram start, end and step
# (the log below reports n_gram 1 through 6) — confirm against TrainModel.compare_models.
train_models.compare_models(
    model_lists, analyzers, 1, 6, 1, X_train, y_train, X_dev, y_dev
)


Model: LinearSVC()
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9772
	n_gram 2 --> accuracy:  0.9909
	n_gram 3 --> accuracy:  0.9948
	n_gram 4 --> accuracy:  0.9951
	n_gram 5 --> accuracy:  0.9954
	n_gram 6 --> accuracy:  0.9948
Analyzer: word
	n_gram 1 --> accuracy:  0.9926
	n_gram 2 --> accuracy:  0.9639
	n_gram 3 --> accuracy:  0.8298
	n_gram 4 --> accuracy:  0.5306
	n_gram 5 --> accuracy:  0.4170
	n_gram 6 --> accuracy:  0.3725
Model: LogisticRegression(multi_class='ovr')
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9765
	n_gram 2 --> accuracy:  0.9912
	n_gram 3 --> accuracy:  0.9941
	n_gram 4 --> accuracy:  0.9949
	n_gram 5 --> accuracy:  0.9946
	n_gram 6 --> accuracy:  0.9931
Analyzer: word
	n_gram 1 --> accuracy:  0.9898
	n_gram 2 --> accuracy:  0.9459
	n_gram 3 --> accuracy:  0.7962
	n_gram 4 --> accuracy:  0.4697
	n_gram 5 --> accuracy:  0.3785
	n_gram 6 --> accuracy:  0.3503
Model: MultinomialNB()
Analyzer: char_wb
	n_gram 1 --> accuracy:  0.9388
	n_gram 2 --> accuracy:  0.98

In [6]:
# Fit the selected configuration on the training split, then score it on the dev split.

# Features: TF-IDF over "char_wb" 5-grams; classifier: multinomial Naive Bayes.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(5, 5))
classifier = MultinomialNB()
model = train_models.train_model(vectorizer, classifier, X_train, y_train)

# Report the metrics of the fitted model on the dev set.
train_models.evaluate_model(model, X_dev, y_dev)


Accuracy:  0.9955865272938443
Confusion Matrix:  [[5257    6    3    3]
 [  24 4766    4    0]
 [  15    7 4425    2]
 [   3    0    9 2696]]
Classification Report:                precision    recall  f1-score   support

          en       0.99      1.00      0.99      5269
          id       1.00      0.99      1.00      4794
          pt       1.00      0.99      1.00      4449
         tet       1.00      1.00      1.00      2708

    accuracy                           1.00     17220
   macro avg       1.00      1.00      1.00     17220
weighted avg       1.00      1.00      1.00     17220



### Test model

In [7]:
# Evaluate the trained model on the test split (X_test / y_test from the split cell).
train_models.evaluate_model(model, X_test, y_test)


Accuracy:  0.9959932640380931
Confusion Matrix:  [[5329    4    1    2]
 [  23 4666    2    1]
 [  20    4 4448    5]
 [   3    1    3 2709]]
Classification Report:                precision    recall  f1-score   support

          en       0.99      1.00      1.00      5336
          id       1.00      0.99      1.00      4692
          pt       1.00      0.99      1.00      4477
         tet       1.00      1.00      1.00      2716

    accuracy                           1.00     17221
   macro avg       1.00      1.00      1.00     17221
weighted avg       1.00      1.00      1.00     17221



### Further test

In [24]:
# Spot-check the model on two short phrases.
test1 = model.predict(["Organizasaun mundial saúde"])
test2 = model.predict(["Tribunál rekursu rejeita kandidatura partidu"])

# One line per phrase; the printed text is the same as a single newline-joined print.
print(f"Test 1 classification: {test1}")
print(f"Test 2 classification: {test2}")


Test 1 classification: ['tet']
Test 2 classification: ['tet']


In [25]:
# Sample sentences for a per-language probability breakdown.
# NOTE: the name `input` shadows the Python built-in; it is kept because the
# SVM cell further down reads the same variable.
input = [
    "Deklarasaun Universál Direitus Umanus",
    "Indonesia merupakan negara terluas ke-14 sekaligus",
    "A língua portuguesa, também designada português, é uma língua",
    "Deklarasaun ne'e inklui artigu 30 ne'ebé esplika Asembleia Jerál",
    "Can we feed a future population of 10 billion people a healthy?",
]

# Naive Bayes and Logistic Regression
pred_probs = model.predict_proba(input)

# Print each sentence followed by one "<language> <probability>" line per class.
for sentence, probs in zip(input, pred_probs):
    print(sentence)
    for lang, prob in zip(model.classes_, probs):
        print(lang, prob)


Deklarasaun Universál Direitus Umanus
en 1.7198102347965515e-05
id 7.656943219400005e-05
pt 0.0001802486040840422
tet 0.9997259838613715
Indonesia merupakan negara terluas ke-14 sekaligus
en 1.3050918693531165e-07
id 0.9999995025902451
pt 9.055449712082101e-08
tet 2.7634607120579004e-07
A língua portuguesa, também designada português, é uma língua
en 0.00018657712215813118
id 2.970099994212303e-05
pt 0.998175916996359
tet 0.0016078048815381433
Deklarasaun ne'e inklui artigu 30 ne'ebé esplika Asembleia Jerál
en 2.916091141335691e-08
id 4.085156091273892e-08
pt 1.4132462405706621e-07
tet 0.9999997886629056
Can we feed a future population of 10 billion people a healthy?
en 0.9999978448630447
id 1.321850425292166e-07
pt 1.061105899088093e-06
tet 9.61846016528177e-07


In [None]:
# SVM - LinearSVC: print the predicted label, the arg-max score index and the raw scores.
# numpy is imported here because `np` is not imported by any earlier cell, which
# made this cell fail with a NameError.
import numpy as np

# Not every estimator exposes decision_function (this cell targets LinearSVC), so
# skip cleanly instead of aborting a "Run All" when the current `model` lacks it.
if hasattr(model, "decision_function"):
    pred_result = model.predict(input)
    pred_probability = model.decision_function(input)
    for i in range(len(input)):
        print(
            f"{input[i]} ---> {pred_result[i]} ---> {np.argmax(pred_probability[i])} --> {pred_probability[i]} "
        )
else:
    print("Skipped: the current model has no decision_function.")



### Save model [if required]

In [26]:
import os

import joblib

# Save the model to a file. The target directory is created first so that the
# dump does not fail with FileNotFoundError on a fresh checkout.
model_path = "model_best/tet-lid-model_NB_best_ng5chars.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Kept as the last expression so the cell still displays the list of written files.
joblib.dump(model, model_path)



['model_best/tet-lid-model_NB_best_ng5chars.pkl']

In [2]:
# Load the saved model from a file.
# NOTE(review): joblib files are pickle-based, so loading one runs whatever the
# file encodes — only load artifacts you produced yourself or otherwise trust.
import joblib

saved_model = joblib.load("model_best/tet-lid-model_NB_best_ng5chars.pkl")



In [26]:
# NOTE(review): unidecode is only referenced by the disabled experiment at the
# bottom of this cell; the import is otherwise unused here.
from unidecode import unidecode

# Short inputs used to probe the saved model in the next cell.
text = ["Timor-Leste", "Timor", "Lei"]

# Disabled experiment: convert the inputs with unidecode before prediction.
# NOTE(review): `text` is a list, so unidecode would presumably have to be applied
# to each element rather than to the list itself — confirm before re-enabling.
# plain_text = unidecode(text)
# plain_text



In [27]:
# Per-language probabilities from the reloaded model for each short input.
pred_probs = saved_model.predict_proba(text)

# Print each input followed by one "<language> <probability>" line per class.
for sample, probs in zip(text, pred_probs):
    print(sample)
    for lang, prob in zip(saved_model.classes_, probs):
        print(lang, prob)


Timor-Leste
en 5.971441277656112e-07
id 2.524699071902872e-06
pt 2.5094790700185165e-06
tet 0.9999943686777295
Timor
en 0.0007093783786956153
id 0.006613671115733073
pt 0.0006555447235547801
tet 0.9920214057820171
Lei
en 0.02498752277221515
id 0.021932794497354923
pt 0.4597678041996972
tet 0.4933118785307327
