In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [8]:
# Inshorts category pages to scrape, one URL per news category.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
]

In [9]:
def build_dataset(seed_urls, timeout=10):
    """Scrape Inshorts category pages into a DataFrame of headlines and articles.

    Parameters
    ----------
    seed_urls : iterable of str
        Category page URLs. The last non-empty path segment of each URL is
        stored as that page's ``news_category``.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per scraped news card with the columns ``news_headline``,
        ``news_article`` and ``news_category`` (in that order). When nothing
        was scraped the frame is empty but still carries these columns.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status code.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so that a trailing slash does not produce an empty category.
        news_category = url.rstrip('/').split('/')[-1]
        # Without a timeout a stalled connection would hang the notebook.
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than parsing an error page into zero rows.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        headline_boxes = soup.find_all('div',
                                       class_=["news-card-title news-right-box"])
        article_boxes = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])

        # Title boxes and content boxes are paired by their position on the page.
        for headline, article in zip(headline_boxes, article_boxes):
            headline_tag = headline.find('span', attrs={"itemprop": "headline"})
            article_tag = article.find('div', attrs={"itemprop": "articleBody"})
            # get_text() yields a plain str even when the tag contains nested
            # markup, a case in which .string would be None.
            news_data.append({'news_headline': headline_tag.get_text(),
                              'news_article': article_tag.get_text(),
                              'news_category': news_category})

    # Passing columns= fixes the column order and keeps the schema intact
    # even when no rows were collected.
    return pd.DataFrame(news_data, columns=columns)

In [10]:
# Scrape every seed category page (this issues live HTTP requests) and
# preview the first 10 rows of the resulting frame.
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Boston Dynamics robots dance together to wish ...,Boston Dynamics got its robots to dance togeth...,technology
1,New Facebook scam sends users a link asking 'I...,A new Facebook Messenger scam has been identif...,technology
2,Jack Ma loses $11 billion in two months,Alibaba Co-Founder Jack Ma's net worth has plu...,technology
3,Jeff Bezos names Blue Origin's landing ship af...,Jeff Bezos on Wednesday unveiled Blue Origin's...,technology
4,Only one unit of PlayStation 5 with 20 kg gold...,Russian company Caviar will make only one unit...,technology
5,Fake news of Microsoft buying Sony for $130 bi...,Fake news about Microsoft buying Sony with all...,technology
6,Be aware of 'pay and register' coronavirus vac...,The Ministry of Home Affairs' cybersecurity aw...,technology
7,Apple removes app promoting 'secret' parties a...,"Apple has removed an iOS app that promoted ""se...",technology
8,Acts of vandalism should stop: Rajnath Singh o...,Union Defence Minister Rajnath Singh on Wednes...,technology
9,Dell's VMware sues ex-COO Rajiv Ramaswami over...,Dell's VMware has sued its former COO Rajiv Ra...,technology


In [11]:
# (rows, columns) of the scraped frame.
news_df.shape

(74, 3)

In [12]:
# Number of scraped rows per news category.
news_df['news_category'].value_counts()

technology    25
world         25
sports        24
Name: news_category, dtype: int64

In [15]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [19]:
from sklearn import preprocessing 

In [20]:
# Encoder used to turn the string category labels into integer codes.
label_encoder = preprocessing.LabelEncoder() 

In [22]:
# Add an integer-coded copy of news_category; this column is later used as the model target.
news_df['news_category1']= label_encoder.fit_transform(news_df['news_category'])

In [23]:
# Row counts per integer class code.
news_df['news_category1'].value_counts()

2    25
1    25
0    24
Name: news_category1, dtype: int64

In [25]:
# Target: the integer-coded news category.
y=news_df['news_category1']

In [26]:
# Features: the raw headline text.
x=news_df['news_headline']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [29]:
# Spot-check one raw headline.
x[6]

"Be aware of 'pay and register' coronavirus vaccine scams: Cyber Dost"

In [31]:
# Spot-check another raw headline.
x[72]

'US approves potential sale of 3,000 bombs worth $290mn to S Arabia'

In [41]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
import re
from nltk.stem import WordNetLemmatizer 
lem = WordNetLemmatizer() 
# NOTE(review): `ps` is not referenced by the cleaning loop below; kept in
# case other cells rely on it.
ps = PorterStemmer()

# Build the stop-word set once: calling stopwords.words() inside the loop
# re-evaluates it for every token, and a set makes each membership test O(1).
english_stopwords = set(stopwords.words('english'))

# Clean every headline: keep letters only, lower-case, drop stop words and
# lemmatize what remains. `corpus` ends up with one cleaned string per headline.
corpus = []
# Iterate over the values directly instead of x[i], which is a label lookup
# and only lines up with position when the index is the default 0..n-1.
for headline in x:
    tokens = re.sub('[^a-zA-Z]', ' ', headline).lower().split()
    kept = [lem.lemmatize(token) for token in tokens
            if token not in english_stopwords]
    corpus.append(' '.join(kept))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darvin.l\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darvin.l\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [44]:
# Cleaned form of headline 56 (compare with the raw x[56]).
corpus[56]

'wuhan covid case may time higher study show'

In [45]:
# Raw headline 56, for comparison with corpus[56].
x[56]

"Wuhan's Covid-19 cases may have been 10 times higher, study shows"

In [46]:
## TF-IDF features over the cleaned corpus: unigrams to trigrams, at most 5000 terms.
# NOTE(review): X is not referenced by any later visible cell; the
# classification pipeline constructs its own TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,3))
# .toarray() converts the vectorizer output into a dense array.
X=tfidf_v.fit_transform(corpus).toarray()

In [50]:
# (number of headlines, number of TF-IDF features).
X.shape

(74, 1419)

In [60]:
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): `classifier` is not referenced by any later visible cell;
# the pipeline instantiates its own MultinomialNB.
classifier=MultinomialNB()
from sklearn.pipeline import Pipeline

In [53]:
from sklearn import metrics

In [61]:
# Text classifier: TF-IDF vectorisation with default settings, followed by multinomial Naive Bayes.
clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())])

In [63]:
# Fit on the raw headlines in `x` with the integer category codes as targets.
# NOTE(review): the cleaned `corpus` built earlier is not what is fitted here — confirm this is intended.
clf.fit(x,y)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [65]:
# Predict inline: `pred` is only assigned in a later cell, so referencing it
# here raises NameError when the notebook is run top to bottom.
# NOTE: the model is scored on the very headlines it was fitted on, so a
# perfect diagonal is not evidence of how well it generalises.
metrics.confusion_matrix(y, clf.predict(x))

array([[24,  0,  0],
       [ 0, 25,  0],
       [ 0,  0, 25]], dtype=int64)

In [64]:
# Predictions for the same headlines the pipeline was fitted on.
pred=clf.predict(x)

In [67]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['tesla operate in india from early 2021'])

array([1])

In [68]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['from 2020 ipl dhoni says his retriment'])

array([0])

In [69]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['covid-19 vaccies is not working properly on infected persons'])

array([2])

In [70]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['internet is free for everyone in india '])

array([0])

In [72]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['data science is leading field in 2021'])

array([1])

In [75]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['in yesterday match india won by 7 runs'])

array([0])

In [76]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['indian space rockets '])

array([1])

In [77]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['covid virus spread faster in 2021'])

array([2])

In [78]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['nasa delcared space tech will be used by common man'])

array([1])

In [79]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['tn state grovernment cancelled the education loans'])

array([0])

In [80]:
# Look up one raw headline from the training data.
x[5]

'Fake news of Microsoft buying Sony for $130 billion spreads due to satire article'

In [83]:
# Classify one ad-hoc headline; the result is an integer class code
# (decode with label_encoder.inverse_transform).
clf.predict(['U.S. refuses Israel weapons to attack Iran'])

array([2])