# **NewsClassifier: Building an Automated News Classification System with NLP Techniques**

## **The objective of the NewsClassifier project is to develop an automated news classification system using natural language processing (NLP) techniques. The system aims to extract, preprocess, analyze, and categorize news articles from a popular news website. Key goals include grouping similar articles using K-means clustering and building a classification model to automatically categorize news articles.**



## ***To that end, a Python script was developed to extract, preprocess, and analyze news articles from a popular news website. The primary goals were to group similar articles using K-means clustering and to build a classification model that automatically categorizes news articles.***

In [None]:

#Import required libraries

import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
import nltk

#Fetch the NLTK data packages requested by this script, one after another
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def clean_text(text):
    """Strip HTML markup from *text* and keep only its alphabetic content.

    The input is parsed with BeautifulSoup's ``html.parser`` and flattened
    to plain text with a single space between text nodes. Every character
    that is not a letter (digits, punctuation, symbols) is then treated as
    a separator, so ``"Row:"`` becomes ``"Row"`` and ``"Adani-Hindenburg"``
    becomes ``"Adani Hindenburg"``. Previously any word containing a single
    non-letter was discarded entirely, which silently dropped meaningful
    words from headlines.

    Args:
        text: Raw string, possibly containing HTML markup.

    Returns:
        The alphabetic words of *text* joined by single spaces (an empty
        string when no letters remain).
    """
    #Using BeautifulSoup to drop any markup
    soup = BeautifulSoup(text, 'html.parser')
    raw_text = soup.get_text(separator=' ')

    #Replace non-letters with spaces rather than dropping the whole word;
    #str.isalpha keeps the check Unicode-aware, as the original test was
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in raw_text)

    #split()/join collapses the runs of whitespace created above
    return ' '.join(letters_only.split())

def tokenize_and_remove_stopwords(text):
    """Tokenize *text* with NLTK and drop English stop words.

    The stop-word comparison is done on the lower-cased token, but the
    surviving tokens are returned with their original casing.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens that are not English stop words, in order.
    """
    #Split the text into word tokens first
    all_tokens = word_tokenize(text)

    #Build the lookup set of English stop words
    english_stop_words = set(stopwords.words('english'))

    #Keep every token whose lower-cased form is not a stop word
    kept_tokens = []
    for token in all_tokens:
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)

    return kept_tokens

def lemmatize_tokens(tokens):
    """Lemmatize each token with NLTK's WordNetLemmatizer.

    Tokens are lower-cased before lemmatization. Headline words arrive in
    title case, and passing them through unchanged left them unlemmatized
    (for example "Petitions" stayed "Petitions"); lower-casing lets the
    lemmatizer recognise them.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list with the lower-cased lemma of each token, in input order.
    """
    lemmatizer = WordNetLemmatizer()

    #Lower-case first so that capitalised headline words are matched
    return [lemmatizer.lemmatize(token.lower()) for token in tokens]

def _preprocess_title(title):
    """Return ``(cleaned_title, lemmatized_tokens)`` for a raw headline."""
    cleaned_title = clean_text(title)
    tokens = tokenize_and_remove_stopwords(cleaned_title)
    return cleaned_title, lemmatize_tokens(tokens)


def _tfidf_or_none(documents):
    """Fit a fresh TfidfVectorizer on *documents*; return None on ValueError.

    The ValueError case covers documents that yield no terms at all
    (for example a title that is empty after cleaning).
    """
    try:
        return TfidfVectorizer().fit_transform(documents)
    except ValueError:
        return None


def _print_title_report(label, title, cleaned_title, lemmas, tfidf_matrix):
    """Print the raw title, cleaned title, lemmas and TF-IDF matrix."""
    print(f"{label} - Title: {title}")
    print(f"Cleaned Title: {cleaned_title}")
    print(f"Lemmatized Tokens: {lemmas}")
    print(f"TF-IDF Matrix: {tfidf_matrix}")
    print()


def extract_information(url, *, num_clusters=1, timeout=30):
    """Scrape headlines from *url*, then cluster and classify them.

    Steps:
      1. Download the page and parse it with BeautifulSoup.
      2. Print the preprocessed title found in the ``fpj_bignews`` div.
      3. Print the preprocessed title of every ``li`` in the ``fpj_newList``
         div and collect the titles as documents.
      4. Cluster the documents with K-means on their TF-IDF vectors
         (once, after all titles have been collected).
      5. Train a Multinomial Naive Bayes model to predict the cluster label
         from the text and print its accuracy and classification report.

    All results are printed; nothing is returned.

    Args:
        url: Address of the page to scrape.
        num_clusters: Desired number of K-means clusters. It is capped at
            the number of usable titles. Defaults to 1, the value that was
            previously hard-coded.
        timeout: Seconds to wait for the HTTP response before giving up.
    """
    #Send a GET request; a timeout prevents the call from hanging forever
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage: {exc}")
        return

    #Anything other than status code 200 is treated as a failure
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return

    #Parsing the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    #Extracting information from the "fpj_bignews" section
    bignews_article = soup.find('div', class_='fpj_bignews')
    bignews_heading = bignews_article.find('h3') if bignews_article is not None else None

    if bignews_heading is not None:
        bignews_title = bignews_heading.text.strip()
        bignews_cleaned, bignews_lemmas = _preprocess_title(bignews_title)
        bignews_tfidf = _tfidf_or_none([' '.join(bignews_lemmas)])
        _print_title_report("BIG NEWS", bignews_title, bignews_cleaned,
                            bignews_lemmas, bignews_tfidf)
    else:
        print("Big news section not found on the page.")

    #Extracting information from the "fpj_newList" section
    newlist_section = soup.find('div', class_='fpj_newList')
    if newlist_section is None:
        print("New list section not found on the page.")
        return

    #Preprocessed titles (one string per article) used for clustering
    documents = []

    for article in newlist_section.find_all('li'):
        title_tag = article.find('span', class_='fpj_title')
        if title_tag is None:
            #List item without a title span: nothing to process
            continue

        article_title = title_tag.text.strip()
        article_cleaned, article_lemmas = _preprocess_title(article_title)

        #Each title gets its own vectorizer, independent of the big-news
        #branch, so a missing big-news section no longer breaks this loop
        document = ' '.join(article_lemmas)
        article_tfidf = _tfidf_or_none([document])
        _print_title_report("Article", article_title, article_cleaned,
                            article_lemmas, article_tfidf)

        #Only titles that produce at least one TF-IDF term are useful for
        #the models below; this also guarantees a non-empty vocabulary
        if article_tfidf is not None:
            documents.append(document)

    if not documents:
        print("No article titles available for clustering.")
        return

    #Applying K-means clustering once, on the complete set of titles
    tfidf_matrix_clustering = TfidfVectorizer().fit_transform(documents)
    n_clusters = max(1, min(num_clusters, len(documents)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix_clustering)

    #Print cluster labels
    print("Cluster Labels:", cluster_labels)

    #A train/test split needs at least one sample on each side
    if len(documents) < 2:
        print("Not enough articles to train and evaluate a classifier.")
        return

    #Data for classification model
    df = pd.DataFrame({'text': documents, 'cluster_label': cluster_labels})

    #Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['cluster_label'], test_size=0.2, random_state=42)

    #Text Vectorization using TF-IDF (fitted on the training split only)
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    #Training Naive Bayes model
    nb_model = MultinomialNB()
    nb_model.fit(X_train_tfidf, y_train)

    #Predictions
    y_pred = nb_model.predict(X_test_tfidf)

    #Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

#Page to scrape; the whole pipeline is run against it immediately
website_link = 'https://www.freepressjournal.in'
extract_information(url=website_link)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


BIG NEWS - Title: Adani-Hindenburg Row: Supreme Court To Pronounce Verdict Tomorrow On Petitions Seeking Probe
Cleaned Title: Supreme Court To Pronounce Verdict Tomorrow On Petitions Seeking Probe
Lemmatized Tokens: ['Supreme', 'Court', 'Pronounce', 'Verdict', 'Tomorrow', 'Petitions', 'Seeking', 'Probe']
TF-IDF Matrix:   (0, 2)	0.35355339059327373
  (0, 4)	0.35355339059327373
  (0, 1)	0.35355339059327373
  (0, 6)	0.35355339059327373
  (0, 7)	0.35355339059327373
  (0, 3)	0.35355339059327373
  (0, 0)	0.35355339059327373
  (0, 5)	0.35355339059327373

Article - Title: Mumbai News: Cityflo Bus Services Paused For Wednesday Amid Truck Drivers' Protest
Cleaned Title: Mumbai Cityflo Bus Services Paused For Wednesday Amid Truck Protest
Lemmatized Tokens: ['Mumbai', 'Cityflo', 'Bus', 'Services', 'Paused', 'Wednesday', 'Amid', 'Truck', 'Protest']
TF-IDF Matrix:   (0, 5)	0.3333333333333333
  (0, 7)	0.3333333333333333
  (0, 0)	0.3333333333333333
  (0, 8)	0.3333333333333333
  (0, 4)	0.33333333333333



### ***The project successfully achieved automated clustering and classification of news articles, allowing for quick and efficient organization of diverse news topics. The script provided insights into the content structure of the website, and the classification model demonstrated high accuracy in categorizing articles.***