In [None]:
import os
import sys
import re
import json
import glob
import datetime
from collections import Counter

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from wordcloud import WordCloud
from datetime import datetime, timedelta

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import mlflow
import mlflow.sklearn
import joblib

In [None]:
# Move to the parent directory so the `src` package imported in the next cell
# resolves from the working directory. The guard keeps the cell idempotent:
# once `src` is visible from the current directory, re-running the cell must
# not climb another level up.
if not os.path.isdir('src'):
    os.chdir('..')

In [None]:
from src.loader import NewsDataLoader
from src.config import cfg
import src.utils as utils

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Create the project's news loader, pointing it at the path taken from the
# shared config object.
data_loader = NewsDataLoader(cfg.path)

In [None]:
# Fetch the news data through the loader. The result is used as a pandas
# DataFrame by the cells below (head, column access, dropna).
news_data_df = data_loader.get_news_data()

In [None]:
# Preview the first two rows as a quick sanity check of the loaded data.
news_data_df.head(2)

In [None]:
# Download the NLTK 'punkt' tokenizer data and the 'stopwords' corpus.
# NOTE(review): presumably both are needed by utils.clean_text — confirm.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Run the project's text-cleaning helper over every value of the 'content'
# column and store the result next to the raw text as 'clean_content'.
raw_content = news_data_df['content']
news_data_df['clean_content'] = raw_content.apply(utils.clean_text)

In [None]:
# Drop rows that have no description. The name is rebound to the filtered
# frame instead of using inplace=True, so the object handed back by the loader
# is not mutated behind its back and the cell stays safe to re-run.
news_data_df = news_data_df.dropna(subset=['description'])

# Show how many missing values remain in each column.
news_data_df.isna().sum()

In [None]:
# List the available column names before picking the ones used for modelling.
print(news_data_df.columns)

In [None]:
# Narrow the frame down to the two text columns used by the modelling cells.
selected_columns = ['description', 'title']
news_data_df_dt = news_data_df.loc[:, selected_columns]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the target here is the free-text 'description' column, so the
# classifier below treats each distinct description as its own class — confirm
# this is the intended label.
titles = news_data_df_dt['title']
descriptions = news_data_df_dt['description']
X_train, X_test, y_train, y_test = train_test_split(
    titles,
    descriptions,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Turn the titles into bag-of-words count matrices. The vocabulary is learned
# from the training split only and then reused unchanged for the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [None]:
# Fit a multinomial Naive Bayes model on the training count matrix.
classifier = MultinomialNB()
classifier.fit(X=X_train_vec, y=y_train)

In [None]:
# Predict a label for every held-out title.
predictions = classifier.predict(X=X_test_vec)

In [None]:
# Score the predictions against the held-out labels: overall accuracy first,
# then the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

In [None]:
# Topic modelling: fit a 10-component LDA model on the training count matrix.
# fit_transform also returns the transformed training data, kept here as
# `message_topics`. The seed makes the fit reproducible.
lda = LatentDirichletAllocation(random_state=42, n_components=10)
message_topics = lda.fit_transform(X=X_train_vec)

In [None]:
# Record the trained models and the evaluation score in a single MLflow run.
# The pickles are first written to a local staging directory and then attached
# to the run as artifacts.
os.makedirs("mlflow_logs/artifacts", exist_ok=True)
with mlflow.start_run():
    mlflow.log_param("model_type", "NaiveBayes")
    mlflow.sklearn.log_model(classifier, "model")

    # Save the fitted vectorizer alongside the model; it is needed to turn new
    # text into the same feature space before calling the classifier.
    vectorizer_path = "mlflow_logs/artifacts/vectorizer.pkl"
    joblib.dump(vectorizer, vectorizer_path)
    mlflow.log_artifact(vectorizer_path, "artifacts")

    # Save the fitted LDA topic model.
    lda_path = "mlflow_logs/artifacts/lda_model.pkl"
    joblib.dump(lda, lda_path)
    mlflow.log_artifact(lda_path, "artifacts")

    # Accuracy is an evaluation result, not a configuration value, so it is
    # logged as a metric (numeric and comparable across runs) instead of a
    # string-valued parameter.
    mlflow.log_metric("accuracy", accuracy)