# data

In [None]:
# Location of the input CSV file. Despite the "_dir" suffix this is a file path,
# not a directory; it is passed straight to pd.read_csv in the next cell.
data_dir = "/content/cleaned_data.csv"

In [None]:
import pandas as pd

# Read the CSV at `data_dir` into a DataFrame and print its first rows as a sanity check.
df = pd.read_csv(data_dir)
print(df.head())

In [None]:
# Number of rows per category, most frequent first.
category_counts = df['category'].value_counts()

# Show every category after the three most frequent ones: names first, then counts.
minority_counts = category_counts.iloc[3:]
print(minority_counts.index)
print(minority_counts.values)

In [None]:
import random
import pandas as pd
import nltk
from nltk.corpus import wordnet

# Download the NLTK resources 'wordnet' and 'omw-1.4' (a no-op when already present).
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)


def _first_synonym(word):
    """Return the first WordNet lemma for ``word`` that is not ``word`` itself, or None."""
    target = word.lower()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # Multi-word lemma names are joined with underscores; turn them back into text.
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != target:
                return candidate
    return None


def synonym_replacement(words, n):
    """Replace up to ``n`` distinct words of ``words`` with a WordNet synonym.

    Words are tried in random order. A word only counts as replaced when WordNet
    offers a lemma that actually differs from it, so a successful call with
    ``n >= 1`` changes the sentence whenever any word has a real synonym.

    Args:
        words: List of tokens. It is not modified.
        n: Maximum number of distinct words to replace. ``n <= 0`` replaces nothing.

    Returns:
        A new list of tokens in which every occurrence of each chosen word has
        been replaced by its synonym. A multi-word synonym is kept as a single
        token containing spaces.
    """
    new_words = list(words)
    if n <= 0:
        return new_words

    # De-duplicate (keeping order) so a repeated word is a single candidate and
    # is not counted twice towards ``n``.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    replaced = 0
    for w in candidates:
        synonym = _first_synonym(w)
        if synonym is None:
            # No lemma other than the word itself: try the next candidate.
            continue
        new_words = [synonym if word == w else word for word in new_words]
        replaced += 1
        if replaced >= n:
            break
    return new_words


def augment_sentence(sentence, num_aug):
    """Build ``num_aug`` augmented variants of ``sentence``.

    The sentence is split on whitespace once; each variant is produced by an
    independent ``synonym_replacement(tokens, 1)`` call and re-joined with
    single spaces.

    Args:
        sentence: Text to augment.
        num_aug: Number of variants to generate.

    Returns:
        List of ``num_aug`` strings.
    """
    tokens = sentence.split()
    return [" ".join(synonym_replacement(tokens, 1)) for _ in range(num_aug)]


def auto_balance(df, text_col, label_col):
    """Oversample minority classes with augmented text until every class matches the largest.

    For each label with fewer rows than the most frequent label, the class's
    texts are cycled through in order and one augmented sentence (via
    ``augment_sentence``) is appended per missing row.

    Args:
        df: Input DataFrame. It is not modified.
        text_col: Name of the column holding the text to augment.
        label_col: Name of the column holding the class label.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the original rows followed by
        the augmented rows. Augmented rows only carry ``text_col`` and
        ``label_col``; any other column is left missing for them.

    Warns:
        UserWarning: when a minority class has no non-empty string text to
            augment from; that class is left unbalanced.
    """
    import warnings
    from itertools import cycle, islice

    counts = df[label_col].value_counts()
    if counts.empty:
        return df.reset_index(drop=True)
    max_count = counts.max()

    new_rows = []

    for label, count in counts.items():
        need = int(max_count - count)
        if need <= 0:
            continue

        # Only real, non-blank strings can be tokenized; missing values would
        # crash the augmenter.
        sources = [
            text
            for text in df.loc[df[label_col] == label, text_col]
            if isinstance(text, str) and text.strip()
        ]
        if not sources:
            # Nothing to augment from: cycling an empty pool would never finish.
            warnings.warn(
                f"auto_balance: class {label!r} has no usable text in "
                f"{text_col!r}; leaving it unbalanced."
            )
            continue

        # Walk the class's sentences round-robin until the deficit is filled.
        for sentence in islice(cycle(sources), need):
            aug_sent = augment_sentence(sentence, 1)[0]
            new_rows.append({text_col: aug_sent, label_col: label})

    if not new_rows:
        return df.reset_index(drop=True)

    augmented_df = pd.DataFrame(new_rows)
    final_df = pd.concat([df, augmented_df], ignore_index=True)
    return final_df


In [None]:
# Seed Python's RNG first: the augmentation shuffles candidate words with
# `random`, so without a fixed seed the balanced frame changes on every run.
random.seed(42)
df_balanced = auto_balance(df, text_col='clean_text', label_col='category')

In [None]:
# Preview the first rows of the balanced frame.
df_balanced.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Class distribution of the original (unbalanced) data.
category_counts = df['category'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=category_counts.index,
    y=category_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Số lượng mẫu trong từng chủ đề')
ax.set_xlabel('Chủ đề')
ax.set_ylabel('Số lượng')
# Category names are long, so draw the x tick labels vertically.
plt.xticks(rotation=90)
fig.savefig("barchart.png")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Class distribution after balancing.
category_counts = df_balanced['category'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=category_counts.index,
    y=category_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Số lượng mẫu trong từng chủ đề')
ax.set_xlabel('Chủ đề')
ax.set_ylabel('Số lượng')
# Category names are long, so draw the x tick labels vertically.
plt.xticks(rotation=90)
fig.savefig("barchart2.png")
plt.show()

# BoW + MNB

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Drop rows with a missing text or label before vectorizing, as the TF-IDF + LR
# section does; CountVectorizer rejects NaN documents.
mnb_data = df_balanced.dropna(subset=['clean_text', 'category'])
X = mnb_data['clean_text']
y = mnb_data['category']

# NOTE(review): df_balanced already contains augmented copies, so a near-duplicate
# of a test sentence can end up in the training split and inflate the scores
# below. Consider splitting first and balancing only the training part.

# Stratified 90/10 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Fit the vectorizer on the training split only, then reuse its vocabulary for the test split.
vectorizer = CountVectorizer(max_features=30000, ngram_range=(1,2), min_df=2, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec  = vectorizer.transform(X_test)

# Train
clf_mnb = MultinomialNB(alpha=1.0)
clf_mnb.fit(X_train_vec, y_train)

# Evaluate
y_pred = clf_mnb.predict(X_test_vec)
print("Acc:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the model and the vectorizer.
joblib.dump(vectorizer, "countvec.joblib")
joblib.dump(clf_mnb, "mnb_model.joblib")

# Reload both artifacts and classify one new piece of text.
vec = joblib.load("countvec.joblib")
model = joblib.load("mnb_model.joblib")
new_text = "Tesla shares surged more than 12% in pre-market trading on Monday after the electric vehicle maker reported record quarterly deliveries that far exceeded Wall Street expectations, despite ongoing supply chain challenges and factory shutdowns in China."
new_vec = vec.transform([new_text])
pred = model.predict(new_vec)
print(pred)

# dùng TF-IDF + LR

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove rows lacking a text or a label, then renumber the index from zero.
df_balanced = df_balanced.dropna(subset=['clean_text', 'category']).reset_index(drop=True)

X, y = df_balanced['clean_text'], df_balanced['category']

# Stratified 90/10 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# TF-IDF features; the vocabulary is learned from the training split only.
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    min_df=2,
    stop_words='english',
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic-regression hyperparameters, gathered in one place.
lr_params = dict(
    C=5.0,
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    solver='saga',
    penalty='l2',
    tol=1e-4,
)
clf = LogisticRegression(**lr_params)
clf.fit(X_train_vec, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test_vec)
print(f"Acc: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the model and the vectorizer.
# NOTE(review): the model file is named "nb_model_v6" although it stores a
# LogisticRegression; the name is kept as-is so existing consumers still find it.
for artifact, target in ((vectorizer, "tfidf_v6.joblib"), (clf, "nb_model_v6.joblib")):
    joblib.dump(artifact, target)

# Reload both artifacts and classify one new piece of text.
vec = joblib.load("tfidf_v6.joblib")
model = joblib.load("nb_model_v6.joblib")
new_text = "Tesla shares surged more than 12% in pre-market trading on Monday after the electric vehicle maker reported record quarterly deliveries that far exceeded Wall Street expectations, despite ongoing supply chain challenges and factory shutdowns in China."
new_vec = vec.transform([new_text])
pred = model.predict(new_vec)
print(pred)

# biểu đồ

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# confusion_matrix orders its rows/columns by sorted label value, so the tick
# labels must be built in that same order. Using unique() (order of appearance)
# would attach the wrong category names to the cells.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)

plt.figure(figsize=(12,12))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix Heatmap")
plt.savefig("confmat.png")
plt.show()
