<a href="https://colab.research.google.com/github/dooryan/languageidentification/blob/main/language_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string

import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

In [None]:
def open_file(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every element keeps its line terminator, exactly as it appears in the file.
    """
    with open(filename, mode='r', encoding='utf8') as handle:
        # Iterating a text file yields one line at a time, newline included.
        return list(handle)

In [None]:
# Raw corpus lines keyed by language label, loaded in the order ceb, msk, mnb.
data_raw = {
    'ceb': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Cebuano.csv'),
    'msk': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Mansaka.csv'),
    'mnb': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Manobo-Ata.csv'),
}

In [None]:
# Display one raw Cebuano line to inspect the row layout before any cleaning.
data_raw['ceb'][1000]

'1001\tPagkakaron gipahayag sa manager sa DDPKRM nga nakahuman na siya ug 18 batches sa mga aplikante sa pagpahigayon ug phsycological exam ug isunod niini ang interview.\n'

In [None]:
def show_statistics(data, sample_index=7, sample_words=30):
    """Print corpus statistics and a short sample for every language.

    Parameters
    ----------
    data : dict
        Maps a language label to its list of sentence strings.
    sample_index : int, optional
        Position of the sentence shown as the sample extract. If the list is
        shorter than that, the last sentence is used; an empty list yields an
        empty extract instead of raising IndexError.
    sample_words : int, optional
        Maximum number of space-separated tokens kept in the sample extract.

    Returns
    -------
    None
        The report is written to standard output.
    """
    for language, sentences in data.items():

        # Whitespace-split the whole corpus so tabs and newlines also
        # separate words.
        word_list = ' '.join(sentences).split()

        number_of_sentences = len(sentences)
        number_of_words = len(word_list)
        number_of_unique_words = len(set(word_list))

        # Clamp the index so small corpora still produce a sample.
        if sentences:
            sample_sentence = sentences[min(sample_index, number_of_sentences - 1)]
        else:
            sample_sentence = ''
        # Re-join with a space: joining with '' glued the words together.
        sample_extract = ' '.join(sample_sentence.split(' ')[:sample_words])

        print(f'Language: {language}')
        print('-----------------------')
        print(f'Number of sentences\t:\t {number_of_sentences}')
        print(f'Number of words\t\t:\t {number_of_words}')
        print(f'Number of unique words\t:\t {number_of_unique_words}')
        print(f'Sample extract\t\t:\t {sample_extract}...\n')

In [None]:
# Baseline statistics for the raw, uncleaned lines of each language.
show_statistics(data_raw)

In [None]:
def text_process(text):
    """Normalise one sentence for vectorisation.

    The text is lower-cased, hyphens and newlines are turned into spaces, and
    every other ASCII punctuation character and digit is removed. Other
    whitespace (for example tabs) is left untouched.
    """
    # Start with "delete" (None) for all ASCII punctuation and digits ...
    char_map = dict.fromkeys(map(ord, string.punctuation + string.digits))
    # ... then override the two characters that must become a space instead.
    char_map[ord('-')] = ' '
    char_map[ord('\n')] = ' '

    return text.lower().translate(char_map)

In [None]:
# Clean every raw line, keeping the same language keys and sentence order.
data_preprocessed = {
    language: list(map(text_process, sentences))
    for language, sentences in data_raw.items()
}

In [None]:
# Print the raw and the cleaned statistics back to back for comparison.
for heading, dataset in (
    ('ORIGINAL STATISTICS', data_raw),
    ('PREPROCESSED STATISTICS :', data_preprocessed),
):
    print(heading)
    show_statistics(dataset)

In [None]:
# Display one cleaned Manobo-Ata sentence to check the preprocessing result.
data_preprocessed['mnb'][8000]

'magimon ko du on sundau kai to mansalinow  '

In [None]:
# Flatten the per-language dict into two parallel lists:
# sentences_train[i] is a cleaned sentence and y_train[i] its language label.
sentences_train, y_train = [], []
for language, sentences in data_preprocessed.items():
    sentences_train.extend(sentences)
    y_train.extend([language] * len(sentences))

In [None]:
# Word-count (bag-of-words) vectorizer, left on scikit-learn's default settings.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training sentences and encode them as a
# sparse count matrix in a single step; the last line displays its summary.
x_train = vectorizer.fit_transform(sentences_train)
x_train

<90004x53742 sparse matrix of type '<class 'numpy.int64'>'
	with 960838 stored elements in Compressed Sparse Row format>

In [None]:
# Fit a multinomial Naive Bayes classifier (default settings) on the word
# counts, using the language labels as targets.
naive_bayes = MultinomialNB()
naive_bayes.fit(x_train,y_train)

MultinomialNB()

In [None]:
# NOTE(review): these are the same three files that were loaded into
# `data_raw` for training, so the "validation" metrics computed from this data
# measure fit on the training set, not generalisation. Point these paths at a
# held-out split (or split the corpus before fitting) to get an honest score.
data_val = {
    'ceb': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Cebuano.csv'),
    'msk': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Mansaka.csv'),
    'mnb': open_file('/content/drive/MyDrive/Colab Notebooks/Thesis/Manobo-Ata.csv'),
}





# Apply the same cleaning used for training to every validation line.
data_val_preprocessed = {
    language: list(map(text_process, sentences))
    for language, sentences in data_val.items()
}

In [None]:
# Statistics for the cleaned validation data, one report per language.
show_statistics(data_val_preprocessed)

In [None]:
# Flatten the validation dict into parallel sentence / label lists, in the
# same way the training lists were built.
sentences_val, y_val = [], []
for language, sentences in data_val_preprocessed.items():
    sentences_val.extend(sentences)
    y_val.extend([language] * len(sentences))

In [None]:
# Encode with the already-fitted vectorizer (transform only, no refit) so the
# columns line up with the vocabulary the classifier was trained on.
x_val = vectorizer.transform(sentences_val)

In [None]:
# Predict a language label for every validation sentence and display the array.
predictions = naive_bayes.predict(x_val)
predictions

array(['ceb', 'ceb', 'ceb', ..., 'mnb', 'mnb', 'mnb'], dtype='<U3')

In [None]:
# `confusion_matrix` is imported in the notebook's top import cell.
# Rows are the true labels and columns the predicted labels, both in the
# order given by `labels`.
print(confusion_matrix(y_val, predictions, labels=['ceb', 'msk', 'mnb']))

[[27748    97    29]
 [    1 20867   585]
 [   46    22 40609]]


In [None]:
# Weighted F1: per-class F1 averaged with each class weighted by its support.
# NOTE(review): y_val is built from the same files the model was trained on,
# so this is a training-set score; confirm against a held-out split.
f1_score(y_val, predictions, average='weighted')

0.9913145155844283

In [None]:
# Persist both fitted objects: prediction needs the classifier and the
# vectorizer that produced its feature columns.
joblib.dump(naive_bayes, '/content/drive/MyDrive/Colab Notebooks/Thesis/final_model.joblib')
joblib.dump(vectorizer, '/content/drive/MyDrive/Colab Notebooks/Thesis/final_modelVec.joblib')

['/content/drive/MyDrive/Colab Notebooks/Thesis/final_modelVec.joblib']

In [None]:
# Reload the saved classifier and vectorizer; note that `vectorizer` is
# rebound here to the copy read back from disk.
# NOTE(review): joblib files are pickle-based, so only load files you trust.
model = joblib.load('/content/drive/MyDrive/Colab Notebooks/Thesis/final_model.joblib')
vectorizer = joblib.load('/content/drive/MyDrive/Colab Notebooks/Thesis/final_modelVec.joblib')

In [None]:
# Smoke test on one sentence: clean it like the training data, wrap it in a
# list (the vectorizer expects an iterable of documents), then predict.
text = [text_process("Samoka oy ayaw daw pag samok")]
text_vectorized = vectorizer.transform(text)

# predict() returns one label per document; show the only one.
model.predict(text_vectorized)[0]

'ceb'