Install Libraries

In [None]:
# %pip install numpy
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
%pip install -U scikit-learn

Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

Import Dataset

In [None]:
# Read the raw dataset. The relative path is resolved against the notebook's
# current working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

Data-Preprocessing step
  - Omit is_ne column
    - Keeping it would likely lead to overfitting because the dataset is small: the column is too specific, and the model needs to generalize.
  - Omit rows that have null values under the 'word' column
    - Unnecessary data

In [15]:
# Keep an independent snapshot of the raw data. A plain `original_df = df`
# only aliases the same object, so columns added to `df` later would also
# appear in `original_df`.
original_df = df.copy()

# Remove the `is_ne` column; it is not used as a model input.
is_ne_drop = df.drop(columns="is_ne")

# Rows with a missing `word`, kept in their own frame so they can be inspected.
word_null_query = is_ne_drop[is_ne_drop['word'].isna()]

# Drop the rows with a missing `word`. `dropna(subset=...)` filters by value,
# so it stays correct even if the index contains repeated labels.
filtered_df = is_ne_drop.dropna(subset=['word'])
filtered_df.head()


Unnamed: 0,word_id,sentence_id,word,label,is_spelling_correct
0,45,1,Gusto,FIL,True
1,46,1,kong,FIL,True
2,47,1,intindihin,FIL,True
3,48,1,pero,FIL,True
4,49,1,hindi,FIL,True


Feature Engineering / Preprocessing:

Target Features:
 - Word length
 - Vowel / Word Ratio
 - Consonant / Word Ratio
 - Look for letters that are uncommon in Filipino words (c, x, z, f, etc.)
 - Common NGram counts for Filipino and English words
    - ng
    - ch, sh, etc.
    - Filipino prefixes (um, in, etc.)
- Check the word's part of speech (noun, verb, etc.) based on the sentence


Note: normalize data if needed (check sklearn.preprocessing)

Additional Features:
- Special character handling for words classified as OTH

In [None]:
# Number of rows in the dataset, taken from one column (nulls are included in the count).
df["is_spelling_correct"].shape[0]

Length Feature

In [None]:
# Character count of each word; anything that is not a string (e.g. a missing
# value) gets length 0.
df['word_length'] = df['word'].map(lambda token: len(token) if isinstance(token, str) else 0)

Vowel and consonant ratio

In [None]:
def vowel_consonant_ratio(word):
    """Return the number of vowels divided by the number of consonants in ``word``.

    Only alphabetic characters are counted. A vowel is one of ``a e i o u``
    in either case; every other alphabetic character counts as a consonant.
    Digits, punctuation and whitespace are ignored.

    Args:
        word: The token to analyse. Non-string values (such as NaN for a
            missing word) are accepted.

    Returns:
        float: ``vowels / consonants``. If the word has no consonants the
        result is ``1.0`` when it contains at least one vowel and ``0.0``
        otherwise. Non-string input also returns ``0.0``.
    """
    if not isinstance(word, str):  # missing values are not strings
        return 0.0

    vowels = frozenset('aeiouAEIOU')
    letters = [ch for ch in word if ch.isalpha()]
    num_vowels = sum(1 for ch in letters if ch in vowels)
    # Consonants are the remaining letters. Counting every character here
    # would turn this into a vowel-to-word-length ratio instead.
    num_consonants = len(letters) - num_vowels

    if num_consonants == 0:
        # Avoid dividing by zero for all-vowel or letter-free words.
        return 1.0 if num_vowels > 0 else 0.0
    return num_vowels / num_consonants

def vowel_word_ratio(word):
    """Return the share of characters in ``word`` that are vowels.

    A vowel is one of ``a e i o u`` in either case. The denominator is the
    full character count of the word, including any non-letter characters.

    Args:
        word: The token to analyse. Non-string values are accepted.

    Returns:
        float: ``vowels / len(word)``, or ``0.0`` for an empty string or a
        non-string value.
    """
    if isinstance(word, str) and word:
        vowel_count = len([ch for ch in word if ch in 'aeiouAEIOU'])
        return vowel_count / len(word)
    return 0.0

# Add both ratio features to the working frame, one value per row of `word`.
df['vowel_word_ratio'] = df['word'].map(vowel_word_ratio)
df['vowel_consonant_ratio'] = df['word'].map(vowel_consonant_ratio)

Train Test Split

- Use scikit-learn's `train_test_split` to split the data.
- The data should be split 80-20.

In [None]:
# Model inputs: the numeric features engineered in the earlier cells. An empty
# list here would give a zero-column X, which cannot be resampled or fitted.
feature_cols = ['word_length', 'vowel_word_ratio', 'vowel_consonant_ratio']
X = df[feature_cols]
y = df['label']  # target: the label of each word
print(len(X), len(y))

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 15% of all rows as the final test set. The remaining 85%
# is shared between training and validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Stage 2: carve the validation set out of that 85%. Since 0.1765 * 0.85 is
# about 0.15, validation is roughly 15% of the full dataset and training is
# roughly 70% (about 70/15/15 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1765,
    random_state=42,
)

Imbalance Data:

Check these techniques in imblearn library:
 - SMOTE
 - Undersampling
 - Oversampling

SMOTE Sampling

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the validation and test splits are left
# untouched.
smote_sampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote_sampler.fit_resample(X_train, y_train)

# Report how many rows the resampling produced.
print("Original training set size:", len(y_train))
print("Resampled training set size:", len(y_train_resampled))

ML Modelling

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit Gaussian Naive Bayes on the resampled training data (`fit` returns the
# fitted estimator itself).
nb_model = GaussianNB().fit(X_train_resampled, y_train_resampled)

# Score on the validation split, which was not resampled.
y_val_pred = nb_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"\nValidation Set Accuracy: {val_accuracy:.4f}")

# TODO: tune hyperparameters if needed (e.g. `var_smoothing` for GaussianNB).


Decision Trees

Model Validation:
- Confusion Matrix
- Classification report

Naive Bayes Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Evaluate the fitted Naive Bayes model on the held-out test split.
y_test_pred = nb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Set Accuracy: **{test_accuracy:.4f}**")
print("\nClassification Report:")
# zero_division=0 reports 0 for a class with no predicted samples.
print(classification_report(y_test, y_test_pred, zero_division=0))

# Use the classes the model was trained on for both the matrix and its axes.
# Without explicit labels the matrix only covers classes that appear in the
# test data, and the plot shows positional indices instead of class names.
class_labels = nb_model.classes_
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues)
plt.title("Naive Bayes Confusion Matrix")
plt.show()