In [2]:
import os
# Map each cleaned corpus file to the language label assigned to its lines.
label_map = {
    "nepali_cleaned.txt": "Nepali",
    "bhojpuri_cleaned.txt": "Bhojpuri",
    "maithili_cleaned.txt": "Maithili",
    "newari_cleaned.txt": "Newari",
    "tamang_cleaned.txt": "Tamang",
    "tharu_cleaned.txt": "Tharu",
}

texts = []             # one entry per non-blank line, across all files
labels = []            # language label for the entry at the same index in `texts`
loaded_languages = []  # labels whose corpus file was actually found and read

for filename, label in label_map.items():
    # Best-effort load: report a missing corpus file and carry on with the rest.
    if not os.path.exists(filename):
        print(f"File {filename} not found")
        continue

    # Read the file line by line, stripping surrounding whitespace and
    # dropping lines that are empty once stripped.
    with open(filename, encoding="utf-8") as f:
        sentences = [stripped for stripped in (line.strip() for line in f) if stripped]

    texts.extend(sentences)
    labels.extend([label] * len(sentences))
    loaded_languages.append(label)

# Report the languages that were actually read rather than len(label_map),
# so a skipped (missing) file is not counted as loaded.
print(f"Loaded {len(texts)} sentences from {len(loaded_languages)} languages.")

Loaded 165301 sentences from 6 languages.


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for testing. The split is stratified on the
# language labels and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Write each split to disk, one "__label__<language> <sentence>" line per sample
# (training file first, then the test file).
for out_path, split_texts, split_labels in (
    ("train.txt", X_train, y_train),
    ("test.txt", X_test, y_test),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            f"__label__{language} {sentence}\n"
            for sentence, language in zip(split_texts, split_labels)
        )

print(f"Training file saved: {len(X_train)} sentences")
print(f"Test file saved: {len(X_test)} sentences")

Training file saved: 132240 sentences
Test file saved: 33061 sentences


In [4]:
#Vectorize Text - character n-gram
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over character n-grams of length 3 to 5, with lowercasing enabled.
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=True)

# Learn the n-gram vocabulary and IDF weights from the training sentences only,
# then project the test sentences into that same feature space.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [5]:
#SVM

from sklearn.svm import LinearSVC
from collections import Counter

# Linear SVM on the TF-IDF character n-gram features; seeded for repeatability.
clf = LinearSVC(C=1.0, random_state=42, max_iter=5000)
clf.fit(X_train_vec, y_train)
print("SVM training completed")

print("Train size:", len(X_train))
print("Test size :", len(X_test))

# The saved output of this cell lists the per-language sample counts of both
# splits, but the statements that printed them were missing. Restore them so a
# fresh run reproduces the recorded output.
print("Train label distribution:", Counter(y_train))
print("Test label distribution :", Counter(y_test))

SVM training completed
Train size: 132240
Test size : 33061
Train label distribution: Counter({'Nepali': 40000, 'Bhojpuri': 23999, 'Maithili': 23877, 'Tharu': 17453, 'Newari': 15136, 'Tamang': 11775})
Test label distribution : Counter({'Nepali': 10000, 'Bhojpuri': 6000, 'Maithili': 5970, 'Tharu': 4363, 'Newari': 3784, 'Tamang': 2944})


In [6]:
# Spot-check samples for the SVM: one (expected language, sentence) pair per language.
new_test_data_svm = [
    ("Nepali",    "राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ"), #test data from test.txt
    ("Bhojpuri",  "पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें"),
    ("Maithili",  "टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि"),
    ("Newari",    "सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः"),
    ("Tamang",    "अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा"),
    ("Tharu",     "एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ")
]
# Fix: read from the list defined above. The previous code referenced the
# undefined name `new_test_data`, which raises NameError on a fresh kernel.
# NOTE: `texts` is reassigned here, replacing the full corpus list built when
# the data files were loaded.
true_labels = [label for label, text in new_test_data_svm]
texts = [text for label, text in new_test_data_svm]

# Convert the sample sentences to TF-IDF vectors with the already-fitted vectorizer.
X_new_vec = vectorizer.transform(texts)
predicted_labels = clf.predict(X_new_vec)

# Show each sentence with its expected and predicted language.
for text, true, pred in zip(texts, true_labels, predicted_labels):
    status = "Correct" if true == pred else "Wrong"
    print(f"Text: {text}")
    print(f"True label: {true}")
    print(f"Predicted label: {pred}   {status}")

Text: राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ
True label: Nepali
Predicted label: Nepali   Correct
Text: पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें
True label: Bhojpuri
Predicted label: Bhojpuri   Correct
Text: टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि
True label: Maithili
Predicted label: Maithili   Correct
Text: सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः
True label: Newari
Predicted label: Newari   Correct
Text: अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा
True label: Tamang
Predicted label: Tamang   Correct
Text: एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ
True label: Tharu
Predicted label: Tharu   Correct


In [7]:
#Evaluate the SVM model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the SVM on the held-out test vectors.
y_pred = clf.predict(X_test_vec)

# Compute every metric up front, then print them in a fixed order.
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)  # rows/columns follow clf.classes_
svm_report = classification_report(y_test, y_pred)          # per-class precision / recall / F1

print(f"\nAccuracy: {acc:.4f}")

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(svm_report)


Accuracy: 0.9861

Confusion Matrix:
[[5952   39    0    1    0    8]
 [  22 5916   20    1    0   11]
 [   2    5 9986    6    0    1]
 [   1   19  160 3537   14   53]
 [   0    5    0    7 2914   18]
 [   0    8    2   55    3 4295]]

Classification Report:
              precision    recall  f1-score   support

    Bhojpuri       1.00      0.99      0.99      6000
    Maithili       0.99      0.99      0.99      5970
      Nepali       0.98      1.00      0.99     10000
      Newari       0.98      0.93      0.96      3784
      Tamang       0.99      0.99      0.99      2944
       Tharu       0.98      0.98      0.98      4363

    accuracy                           0.99     33061
   macro avg       0.99      0.98      0.98     33061
weighted avg       0.99      0.99      0.99     33061



In [None]:
####################################################################################################################


In [9]:
#Multinomial Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the same TF-IDF
# training matrix as the other models (fit() returns the fitted estimator).
clf_nb = MultinomialNB().fit(X_train_vec, y_train)
print("Multinomial Naive Bayes training completed.")

Multinomial Naive Bayes training completed.


In [10]:
# Spot-check samples for Multinomial Naive Bayes: one (expected language, sentence) pair per language.
new_test_data_multinb = [
    ("Nepali",    "राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ"), #test data from test.txt
    ("Bhojpuri",  "पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें"),
    ("Maithili",  "टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि"),
    ("Newari",    "सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः"),
    ("Tamang",    "अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा"),
    ("Tharu",     "एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ")
]
# Fix: read from the list defined above. The previous code referenced the
# undefined name `new_test_data`, which raises NameError on a fresh kernel.
true_labels = [label for label, text in new_test_data_multinb]
texts = [text for label, text in new_test_data_multinb]

# Vectorize with the already-fitted TF-IDF vectorizer and classify with Naive Bayes.
X_new_vec = vectorizer.transform(texts)
predicted_labels = clf_nb.predict(X_new_vec)

# Show each sentence with its expected and predicted language.
for text, true, pred in zip(texts, true_labels, predicted_labels):
    status = "Correct" if true == pred else "Wrong"
    print(f"Text: {text}")
    print(f"True label: {true}")
    print(f"Predicted label: {pred}   {status}")

Text: राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ
True label: Nepali
Predicted label: Nepali   Correct
Text: पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें
True label: Bhojpuri
Predicted label: Bhojpuri   Correct
Text: टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि
True label: Maithili
Predicted label: Maithili   Correct
Text: सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः
True label: Newari
Predicted label: Newari   Correct
Text: अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा
True label: Tamang
Predicted label: Tamang   Correct
Text: एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ
True label: Tharu
Predicted label: Tharu   Correct


In [12]:
# Score Multinomial Naive Bayes on the held-out test vectors.
y_pred = clf_nb.predict(X_test_vec)

nb_accuracy = accuracy_score(y_test, y_pred)
nb_confusion = confusion_matrix(y_test, y_pred, labels=clf_nb.classes_)  # rows/columns follow clf_nb.classes_
nb_report = classification_report(y_test, y_pred)

print("\nAccuracy:", nb_accuracy)
print("\nConfusion Matrix:\n", nb_confusion)
print("\nClassification Report:\n", nb_report)



Accuracy: 0.9618281358700583

Confusion Matrix:
 [[5956   13   25    0    0    6]
 [  96 5314  553    1    2    4]
 [   1    1 9998    0    0    0]
 [   4    3  308 3414    3   52]
 [   0    2   13    6 2898   25]
 [  30   13   74   24    3 4219]]

Classification Report:
               precision    recall  f1-score   support

    Bhojpuri       0.98      0.99      0.99      6000
    Maithili       0.99      0.89      0.94      5970
      Nepali       0.91      1.00      0.95     10000
      Newari       0.99      0.90      0.94      3784
      Tamang       1.00      0.98      0.99      2944
       Tharu       0.98      0.97      0.97      4363

    accuracy                           0.96     33061
   macro avg       0.98      0.96      0.96     33061
weighted avg       0.96      0.96      0.96     33061



In [None]:
####################################################################################################################

In [15]:
#K-Nearest Neighbour

from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier using cosine distance between TF-IDF vectors.
# fit() returns the fitted estimator, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine').fit(X_train_vec, y_train)
print("KNN training completed.")

KNN training completed.


In [17]:
# Spot-check samples for KNN: one (expected language, sentence) pair per language.
new_test_data_knn = [
    ("Nepali",    "राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ"),
    ("Bhojpuri",  "पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें"),
    ("Maithili",  "टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि"),
    ("Newari",    "सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः"),
    ("Tamang",    "अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा"),
    ("Tharu",     "एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ")
]

# Unzip the pairs into parallel lists. Later cells read `texts`, `true_labels`
# and `X_new_vec`, so these names are kept.
true_labels = [expected for expected, _ in new_test_data_knn]
texts = [sentence for _, sentence in new_test_data_knn]

# Vectorize with the already-fitted TF-IDF vectorizer, then classify with KNN.
X_new_vec = vectorizer.transform(texts)
predicted_labels = knn.predict(X_new_vec)

# Report each sentence with its expected and predicted language.
for sentence, expected, predicted in zip(texts, true_labels, predicted_labels):
    if expected == predicted:
        status = "Correct"
    else:
        status = "Wrong"
    print(f"Text: {sentence}")
    print(f"True label     : {expected}")
    print(f"Predicted label: {predicted}   {status}")


Text: राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ
True label     : Nepali
Predicted label: Nepali   Correct
Text: पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें
True label     : Bhojpuri
Predicted label: Bhojpuri   Correct
Text: टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि
True label     : Maithili
Predicted label: Maithili   Correct
Text: सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः
True label     : Newari
Predicted label: Newari   Correct
Text: अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा
True label     : Tamang
Predicted label: Tamang   Correct
Text: एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ
True label     : Tharu
Predicted label: Tharu   Correct


In [18]:
from sklearn.metrics import accuracy_score

# Classify the held-out test vectors with KNN and report plain accuracy.
y_pred_knn = knn.predict(X_test_vec)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN accuracy:", knn_accuracy)

KNN accuracy: 0.9363903088230846


In [None]:
####################################################################################################################

In [19]:
#Decision Trees
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 50 and seeded for repeatability.
# fit() returns the fitted estimator, so construction and fitting are chained.
dt = DecisionTreeClassifier(max_depth=50, random_state=42).fit(X_train_vec, y_train)
print("Decision Tree Training Completed")


Decision Tree Training Completed


In [32]:
# Spot-check the decision tree on the sample sentences.
# NOTE: `X_new_vec`, `texts` and `true_labels` are not defined in this cell;
# they are whatever the most recently run sample-prediction cell left behind.
dt_preds = dt.predict(X_new_vec)
for text, true, pred in zip(texts, true_labels, dt_preds):
    status = "Correct" if true == pred else "Wrong"
    print(f"Text: {text}")
    print(f"True label     : {true}")
    print(f"Predicted label: {pred}   {status}")

# Fix: `y_pred_dt` was never assigned anywhere in the notebook, so the accuracy
# line raised NameError on a fresh kernel. Predict on the held-out test vectors here.
y_pred_dt = dt.predict(X_test_vec)
print("Decision Tree accuracy:", accuracy_score(y_test, y_pred_dt))

Text: राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ
True label     : Nepali
Predicted label: Nepali   Correct
Text: पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें
True label     : Bhojpuri
Predicted label: Bhojpuri   Correct
Text: टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि
True label     : Maithili
Predicted label: Maithili   Correct
Text: सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः
True label     : Newari
Predicted label: Newari   Correct
Text: अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा
True label     : Tamang
Predicted label: Tamang   Correct
Text: एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ
True label     : Tharu
Predicted label: Tharu   Correct
Decision Tree accuracy: 0.9449502434893077


In [None]:
####################################################################################################################

In [26]:
from sklearn.ensemble import RandomForestClassifier

#Random Forest

# Random forest: 200 trees, each capped at depth 50, seeded, using all CPU cores (n_jobs=-1).
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    random_state=42,
    n_jobs=-1,
)
rf_clf.fit(X_train_vec, y_train)
print("Random Forest training completed")

Random Forest training completed


In [35]:
# Classify the sample sentences with the random forest.
# NOTE: `X_new_vec`, `texts` and `true_labels` come from an earlier
# sample-prediction cell; they are not defined here.
rf_preds = rf_clf.predict(X_new_vec)

# Metrics below are computed on the handful of sample sentences only,
# not on the full held-out test split.
acc = accuracy_score(true_labels, rf_preds)
rf_report = classification_report(true_labels, rf_preds, digits=4)
cm = confusion_matrix(true_labels, rf_preds, labels=rf_clf.classes_)  # rows/columns follow rf_clf.classes_

print("\nRandom Forest Predictions")
for sentence, expected, predicted in zip(texts, true_labels, rf_preds):
    if expected == predicted:
        status = "Correct"
    else:
        status = "Wrong"
    print(f"Text: {sentence}")
    print(f"True label     : {expected}")
    print(f"Predicted label: {predicted}   {status}")

# Accuracy over the sample sentences.
print(f"Random Forest accuracy on new test set: {acc:.2f}")

# Per-class precision / recall / F1 over the sample sentences.
print("\nClassification Report:")
print(rf_report)

# Confusion matrix over the sample sentences.
print(cm)



=== Random Forest Predictions ===
Text: राप्रपा र राप्रपा नेपाल तथा सत्तारुढ गठवन्धनका अन्य दलहरु पनि नयाँ गठवन्धनमा आउने विश्वास खाँडको छ
True label     : Nepali
Predicted label: Nepali   Correct
Text: पोर्क्युपिन शाकाहारी होलें मुख्य रूप से छाल पतई आ तना के खालें
True label     : Bhojpuri
Predicted label: Bhojpuri   Correct
Text: टेकु दोभानमे ई नदी विष्णुमती नदीसँ मिलैत अछि
True label     : Maithili
Predicted label: Maithili   Correct
Text: सुनानं सुयातं नुगलय् स्‍याकूसा जितः जक मखु बरु छिपिं फुक्‍कसितं नुगलय् स्‍याकूगु थें खः
True label     : Newari
Predicted label: Newari   Correct
Text: अझन थेला थोबोहेन्‍से काङधोना खीबा मुबा
True label     : Tamang
Predicted label: Tamang   Correct
Text: एक बाट आउर टोहार ओ मोर जौन एकठो लावा सम्बन्ढ बने जाइटा यम्ने टुँ खुसि बाटो या नाइ
True label     : Tharu
Predicted label: Tharu   Correct
Random Forest accuracy on new test set: 1.00

Classification Report:
              precision    recall  f1-score   support

    Bhojpuri     1.0000    1.0000  