<a href="https://colab.research.google.com/github/dayemsaeed/MLNewsClassifier/blob/main/ML_News_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from google.colab import drive

# Fetch the NLTK resources that the preprocessing functions below rely on.
nltk.download('punkt')
nltk.download('stopwords')

# Mount Google Drive
drive.mount('/content/gdrive')

# Load the dataset from Google Drive
file_path = "/content/gdrive/My Drive/Colab DS/News_Category_Dataset_v3.json"
# The file is parsed as JSON Lines: one JSON object per line. The encoding is
# given explicitly so decoding does not depend on the platform default, the
# file is streamed instead of being materialized with readlines(), and
# whitespace-only lines are skipped so a trailing blank line cannot crash
# json.loads.
with open(file_path, encoding="utf-8") as file:
    data_json = [json.loads(line) for line in file if line.strip()]

# One row per JSON record; columns come from the record keys.
data = pd.DataFrame(data_json)

# Lazily populated cache of the English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize *text* for vectorization.

    The text is lowercased, every non-word character is replaced by a space,
    and the result is tokenized. Tokens that are English stop words or that
    have two characters or fewer are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = re.sub(r'\W', ' ', text.lower())
    # Look the stop words up in a set built once, rather than re-reading the
    # corpus list and scanning it linearly for every single token.
    stop_words = _english_stop_words()
    tokens = [
        token
        for token in nltk.word_tokenize(text)
        if len(token) > 2 and token not in stop_words
    ]
    return ' '.join(tokens)

# Clean the headline and the short description independently, then join the
# two cleaned strings with a single space into one text field per row.
headline_clean = data["headline"].apply(preprocess_text)
description_clean = data["short_description"].apply(preprocess_text)
data["clean_text"] = headline_clean + " " + description_clean

# Shared stemmer instance used by preprocess_with_stemming.
ps = PorterStemmer()


def preprocess_with_stemming(text):
    """Tokenize the lowercased *text* and stem its alphanumeric tokens.

    Tokens for which ``str.isalnum()`` is false (e.g. punctuation) are
    discarded; every remaining token is passed through the Porter stemmer.

    Args:
        text: The raw string to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stems = []
    for word in nltk.word_tokenize(text.lower()):
        if not word.isalnum():
            continue
        stems.append(ps.stem(word))
    return " ".join(stems)

# Stem the headline and the short description separately and store the two
# results, separated by one space, as a single column.
stemmed_parts = [
    data[column].apply(preprocess_with_stemming)
    for column in ("headline", "short_description")
]
data["stemmed_text"] = stemmed_parts[0] + " " + stemmed_parts[1]

# Raw text inputs for the two feature pipelines, plus the shared target.
# NOTE: X_old / X_new stay as raw text here; the TF-IDF matrices are only
# built for the train and test partitions below.
X_old = data["clean_text"]
X_new = data["stemmed_text"]
y = data["category"]

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would let test documents influence the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported accuracy.
# Both calls share the same y, test_size, random_state and stratification, so
# the two text variants are partitioned into the same rows.
text_train_old, text_test_old, y_train_old, y_test_old = train_test_split(
    X_old, y, test_size=0.2, random_state=42, stratify=y
)
text_train_new, text_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y, test_size=0.2, random_state=42, stratify=y
)

# One vectorizer per text variant, fitted on training rows only, so each
# fitted transform remains available for its own model.
vectorizer_old = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_old = vectorizer_old.fit_transform(text_train_old)
X_test_old = vectorizer_old.transform(text_test_old)

# `vectorizer` ends up fitted on the stemmed text, as it did previously.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_new = vectorizer.fit_transform(text_train_new)
X_test_new = vectorizer.transform(text_test_new)

# Original model
# Baseline: one-vs-rest logistic regression trained on the features derived
# from the cleaned (unstemmed) text, scored on the held-out split.
lr_clf_old = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
    max_iter=1000,
    random_state=42,
)
y_pred_old = lr_clf_old.fit(X_train_old, y_train_old).predict(X_test_old)
accuracy_old = accuracy_score(y_true=y_test_old, y_pred=y_pred_old)
print("Accuracy of original model:", accuracy_old)

# New model with hyperparameter tuning and ensemble
lr_clf = LogisticRegression(max_iter=1000, multi_class='ovr', solver='lbfgs', random_state=42)
rf_clf = RandomForestClassifier(random_state=42)

# Candidate regularization strengths for the logistic regression.
param_grid_lr = {
    'C': [0.1, 1, 10],
}

# Candidate forest sizes and depth limits for the random forest.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}

# 3-fold cross-validated searches: 3 candidates x 3 folds for the logistic
# regression and 6 candidates x 3 folds for the random forest. Run serially
# these fits are slow, so n_jobs=-1 spreads them over all available cores.
# The selected parameters are not affected by the parallelism.
grid_lr = GridSearchCV(lr_clf, param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1)
grid_rf = GridSearchCV(rf_clf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train_new, y_train_new)
grid_rf.fit(X_train_new, y_train_new)

best_lr_clf = grid_lr.best_estimator_
best_rf_clf = grid_rf.best_estimator_

print("Best Logistic Regression parameters:", grid_lr.best_params_)
print("Best Random Forest parameters:", grid_rf.best_params_)

# Soft voting averages the predicted class probabilities of both tuned
# models. The ensemble is fitted again on the training split, so its two
# members are trained in parallel as well.
ensemble_clf = VotingClassifier(
    estimators=[('lr', best_lr_clf), ('rf', best_rf_clf)],
    voting='soft',
    n_jobs=-1,
)
ensemble_clf.fit(X_train_new, y_train_new)
y_pred_new = ensemble_clf.predict(X_test_new)
accuracy_new = accuracy_score(y_test_new, y_pred_new)
print("Accuracy of new model:", accuracy_new)

# Positive values mean the tuned ensemble beats the baseline model.
print("Improvement in accuracy:", accuracy_new - accuracy_old)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/gdrive
Accuracy of original model: 0.5919438743855295


KeyboardInterrupt: ignored