In [37]:
import pandas as pd
import numpy as np
import re, string, os
import joblib

In [50]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [39]:
#setup nltk
NLTK_DATA_PATH = "nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)
nltk.download('punkt', download_dir=NLTK_DATA_PATH)
nltk.download('stopwords', download_dir=NLTK_DATA_PATH)
nltk.download('wordnet', download_dir=NLTK_DATA_PATH)
nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH)
nltk.download('punkt_tab', download_dir=NLTK_DATA_PATH)

[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [40]:
stop_words = set(stopwords.words('english'))
domain_words = {
    "airport","klia","staff","malaysia","malaysian","flight","terminal","gate","counter",
    "immigration","airline","airlines","plane","arrival","departure","queue","checkin",
    "baggage","luggage"
}
stop_words.update(domain_words)

In [41]:
lemmatizer = WordNetLemmatizer()

In [42]:
def handle_negation(text):
    text = re.sub(r"\b(not|no|never|n't)\b\s+(\w+)", r"\1_\2", text)
    return text

In [43]:
def get_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = text.encode("latin1", "ignore").decode("utf-8", "ignore")
    text = re.sub(r"\s+", " ", text).strip().lower()
    text = handle_negation(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    lemmas = [
        lemmatizer.lemmatize(tok, get_pos(tag))
        for tok, tag in tagged
        if tok not in stop_words
    ]
    return " ".join(lemmas)

In [44]:
df = pd.read_csv("training_data.csv")  # replace with your dataset filename
df['clean_text'] = df['TEXT'].apply(preprocess)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["SENTIMENT"], test_size=0.2, random_state=42
)

In [46]:
#TF-IDF Vectorixer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [47]:
# Model Training - WITH TUNING

svm = LinearSVC(random_state=42)
params = {"C": [0.1, 1, 10]}
grid = GridSearchCV(svm, params, cv=5, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_tfidf, y_train)

best_svm = grid.best_estimator_
y_pred = best_svm.predict(X_test_tfidf)

In [48]:
print("Best Params:", grid.best_params_)

Best Params: {'C': 1}


In [52]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.861003861003861


In [53]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.87      0.93      0.90       291
     Neutral       0.64      0.29      0.40        31
    Positive       0.86      0.85      0.86       196

    accuracy                           0.86       518
   macro avg       0.79      0.69      0.72       518
weighted avg       0.85      0.86      0.85       518



In [54]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[270   2  19]
 [ 14   9   8]
 [ 26   3 167]]


In [55]:
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(svm, "svm_model_tuned.pkl")

['svm_model_tuned.pkl']