In [2]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords

In [3]:
username = os.getenv('postgreUsername')
password = os.getenv('postgrePassword')
DATABASE_URL = f"postgresql://{username}:{password}@localhost:5432/news"
engine = create_engine(DATABASE_URL)
session = sessionmaker(bind=engine)


query = "SELECT * FROM stg_document"

db_connection = psycopg2.connect(
    host='localhost',
    database="news",
    user=username,
    password=password
)
cursor = db_connection.cursor()

df = pd.read_sql("SELECT * FROM public.stg_document", con=db_connection)

cursor.close()
# db_connection.close()

  df = pd.read_sql("SELECT * FROM public.stg_document", con=db_connection)


In [4]:
df.head()

Unnamed: 0,id,data,actual_label,text_length,word_count,processed_at
0,1,$1m payoff for former Shell boss Shell is to ...,business,1424,241,2025-08-04 12:10:39.140625+00:00
1,2,$1m payoff for former Shell boss Shell is to ...,business,1424,241,2025-08-04 12:10:39.140625+00:00
2,3,$1m payoff for former Shell boss Shell is to ...,business,1424,241,2025-08-04 12:10:39.140625+00:00
3,4,&#163;1.8m indecency fine for Viacom Media gi...,entertainment,1470,238,2025-08-04 12:10:39.140625+00:00
4,5,&#163;1.8m indecency fine for Viacom Media gi...,entertainment,1470,238,2025-08-04 12:10:39.140625+00:00


In [5]:
df = df.drop_duplicates(subset="data")
df.head()

Unnamed: 0,id,data,actual_label,text_length,word_count,processed_at
0,1,$1m payoff for former Shell boss Shell is to ...,business,1424,241,2025-08-04 12:10:39.140625+00:00
3,4,&#163;1.8m indecency fine for Viacom Media gi...,entertainment,1470,238,2025-08-04 12:10:39.140625+00:00
6,7,2004: An Irish Athletics Year 2004 wont be re...,sport,4240,733,2025-08-04 12:10:39.140625+00:00
9,10,2D Metal Slug offers retro fun Like some dril...,tech,1771,319,2025-08-04 12:10:39.140625+00:00
15,16,Aaliyah claim dismissed by court Late R&B sta...,entertainment,1101,197,2025-08-04 12:10:39.140625+00:00


In [6]:
#remove punctuation: not a word or whitespace
df['data_regex'] = df['data'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
#remove email
df['data_regex'] = df['data_regex'].apply(lambda x: re.sub(r'\S+@\S+', '', x))
#remove url
df['data_regex'] = df['data_regex'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
df['data_regex'].head()

0     1m payoff for former Shell boss  Shell is to p...
3     16318m indecency fine for Viacom  Media giant ...
6     2004 An Irish Athletics Year  2004 wont be rem...
9     2D Metal Slug offers retro fun  Like some dril...
15    Aaliyah claim dismissed by court  Late RB star...
Name: data_regex, dtype: object

In [7]:
#tokenize sentence word by word
df['data_tokenize'] = df['data_regex'].apply(word_tokenize)
df['data_tokenize'].head()

0     [1m, payoff, for, former, Shell, boss, Shell, ...
3     [16318m, indecency, fine, for, Viacom, Media, ...
6     [2004, An, Irish, Athletics, Year, 2004, wont,...
9     [2D, Metal, Slug, offers, retro, fun, Like, so...
15    [Aaliyah, claim, dismissed, by, court, Late, R...
Name: data_tokenize, dtype: object

In [8]:
nltk.download('stopwords')
stopwords1 = set(stopwords.words('english'))
print(stopwords1)

{"isn't", 's', 'up', 'd', "it'd", "they'd", 'ourselves', 'being', 'about', "hasn't", 'should', 've', "i'm", 'having', 'or', 'below', 'her', 'not', 're', 'him', 'so', 'other', "that'll", 'were', 'you', 'during', 'each', "doesn't", 'be', 'are', 'its', 'them', 'doesn', 'when', "they're", 'my', 'only', "we'll", 'by', 'don', 'wouldn', 'how', 'over', 'any', 'he', 'didn', "mustn't", 'nor', "he's", 'all', 'has', 'll', 'y', "aren't", "you've", 'while', 'just', 'than', 'at', 'yours', 'out', 'itself', 'himself', "you're", 'too', 'couldn', 'your', 'once', 'these', 'yourselves', 'until', 'i', 'was', 'his', 'that', 'and', 'does', 'here', 'but', "it'll", "wouldn't", 'have', 'no', 'between', 'same', 'down', 'the', "needn't", 'yourself', 'herself', 'm', 'it', "i've", 'mustn', "you'd", 'after', "should've", 'theirs', "haven't", 'most', 'some', 'whom', 'off', "he'll", 'had', 'this', 'through', 'on', 'an', 'as', 'me', "she'd", 'ain', 'myself', "she's", 'wasn', 'with', 'in', 'because', 'they', "couldn't", 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
#remove stopwords
df['data_tokenize'] = df['data_tokenize'].apply(
    lambda tokens: [word for word in tokens if word.lower() not in stopwords1]
)

df['data_tokenize'].head()

0     [1m, payoff, former, Shell, boss, Shell, pay, ...
3     [16318m, indecency, fine, Viacom, Media, giant...
6     [2004, Irish, Athletics, Year, 2004, wont, rem...
9     [2D, Metal, Slug, offers, retro, fun, Like, dr...
15    [Aaliyah, claim, dismissed, court, Late, RB, s...
Name: data_tokenize, dtype: object

In [10]:
df['data_tokenize'] = df['data_tokenize'].apply(lambda tokens: ' '.join(tokens))

# Feature & label selection
X = df['data_tokenize']  # should be a column of strings (not lists!)
y = df['actual_label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Vectorizer
vectorizer = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 2)
)

# TF-IDF transformation
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predictions & Evaluation
y_pred = classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

     business       0.97      0.97      0.97       101
entertainment       0.99      0.95      0.97        79
     politics       0.98      0.98      0.98        87
        sport       0.99      1.00      0.99        96
         tech       0.95      0.98      0.97        63

     accuracy                           0.98       426
    macro avg       0.98      0.98      0.98       426
 weighted avg       0.98      0.98      0.98       426



In [12]:
joblib.dump(classifier, '../checkpoint/tfidf_classifier.pkl')
joblib.dump(vectorizer, '../checkpoint/tfidf_vectorizer.pkl')

['../checkpoint/tfidf_vectorizer.pkl']