In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
file_path = "B:/_GITHUB\Machine-Learning-project-series/news classification/dataset/data.csv"
data = pd.read_csv(file_path, sep="\t")
data.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [3]:
# checking for null values
data.isnull().sum()

category    0
filename    0
title       0
content     0
dtype: int64

In [4]:
# data category value count
data['category'].value_counts()

sport            511
business         510
politics         417
entertainment    386
tech             113
Name: category, dtype: int64

In [5]:
value = data['category'].value_counts()
value_index = value.index
fig = px.pie(data, values = value, names = value_index, hole = 0.4, width = 500, height = 400)
fig.update_layout(title = 'News data category distribution', title_x = 0.5)
fig.show()

#### **Model Selection**

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# data = data[['title','category']]

x = np.array(data['title'])
y = np.array(data['category'])

cv = CountVectorizer()
X = cv.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.33, random_state = 25)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier

In [9]:
# initiatialize models
bernouilli_model = BernoulliNB()
decision_tree_model = DecisionTreeClassifier()
kneighbors_model = KNeighborsClassifier()
logistic_regression_model = LogisticRegression()

# fitting models
bernouilli_fit = bernouilli_model.fit(xtrain, ytrain)
decision_tree_fit = decision_tree_model.fit(xtrain, ytrain)
kneighbors_fit = kneighbors_model.fit(xtrain, ytrain)
logistic_regression_fit = logistic_regression_model.fit(xtrain, ytrain)

model_performance = {

                    "Models" : ['Bernouilli Classifier', 'decision_tree', 'kneighbors', 'logistic_regression'],
                    "Performance" : [bernouilli_fit.score(xtest, ytest), decision_tree_fit.score(xtest, ytest), kneighbors_fit.score(xtest, ytest), 
                                      logistic_regression_fit.score(xtest, ytest)]
                    }

models_df = pd.DataFrame(model_performance)
models_df.sort_values(by=["Performance"], ascending=True)
# print(models_df.to_markdown())
models_df

Unnamed: 0,Models,Performance
0,Bernouilli Classifier,0.7
1,decision_tree,0.684375
2,kneighbors,0.5125
3,logistic_regression,0.764062


In [11]:
# user testing
user_input = input("Enter your text: ")
user_data = cv.transform([user_input]).toarray()
models = [bernouilli_fit, decision_tree_fit, kneighbors_fit, logistic_regression_fit]
output = [model.predict(user_data) for model in models]
for idx, out in enumerate(output):
    print(f" {models[idx]} : {out}")

 BernoulliNB() : ['entertainment']
 DecisionTreeClassifier() : ['entertainment']
 KNeighborsClassifier() : ['entertainment']
 LogisticRegression() : ['entertainment']


#### **Conclusion**
Every news website classifies the news article before publishing it so that every time visitors visit their website can easily click on the type of news that interests them.