# Analysing InShorts articles using NLP

In [1]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

## Data Retrieval with Web Scraping

In [2]:
# InShorts category pages to scrape. create_dataset() takes the last path
# segment of each URL ('technology', 'sports', 'world') as the category label.
start_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world'
]

def create_dataset(start_urls):
    """Print the news category encoded in each start URL.

    Exploratory draft of the scraper: it only derives the category name
    (the last path segment of the URL) and prints it, one per line.

    Parameters
    ----------
    start_urls : iterable of str
        Category page URLs, e.g. ``'https://inshorts.com/en/read/world'``.

    Returns
    -------
    None
    """
    for url in start_urls:
        # Strip trailing slashes first so '.../world/' still yields 'world'
        # rather than an empty string.
        news_category = url.rstrip('/').split('/')[-1]
        print(news_category)
create_dataset(start_urls)

technology
sports
world


In [11]:
def create_dataset(start_urls, timeout=10):
    """Scrape InShorts category pages into a DataFrame of news items.

    For every URL the page is downloaded and parsed, then each headline card
    is paired positionally with the matching content card.

    Parameters
    ----------
    start_urls : iterable of str
        Category page URLs. The last path segment of each URL is stored as
        the ``news_category`` of every article found on that page.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises a
        timeout error. Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article``, ``news_category`` (in
        that order), one row per scraped article. If nothing was scraped the
        frame is empty but still has these three columns.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in start_urls:
        # Strip trailing slashes so '.../world/' still yields 'world'.
        news_category = url.rstrip('/').split('/')[-1]
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        headline_cards = soup.find_all(
            'div', class_=["news-card-title news-right-box"])
        article_cards = soup.find_all(
            'div', class_=["news-card-content news-right-box"])

        # Headline and content cards appear in the same order on the page,
        # so they are matched by position.
        for headline, article in zip(headline_cards, article_cards):
            news_data.append({
                'news_headline': headline.find(
                    'span', attrs={"itemprop": "headline"}).text,
                'news_article': article.find(
                    'div', attrs={"itemprop": "articleBody"}).text,
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema even when
    # no rows were collected (selecting the columns afterwards would raise
    # KeyError on an empty frame).
    return pd.DataFrame(news_data, columns=columns)

In [12]:
# Scrape every category page listed in start_urls and preview the first rows.
data_df = create_dataset(start_urls) 
print(data_df.head())

                                       news_headline  \
0  Katy Perry performs at first-ever OnePlus Musi...   
1  I'm a weird case, obviously don't need to work...   
2  Apple announces special event for apps and gam...   
3  Google's room-sized 331 LED bulb system create...   
4  Realme CEO tweets update about company's phone...   

                                        news_article news_category  
0  The OnePlus Music Festival, held at Mumbai's D...    technology  
1  Microsoft Co-founder Bill Gates, at a recent e...    technology  
2  Apple is hosting a special media event that is...    technology  
3  Google AI team has built a room-sized system c...    technology  
4  Realme India CEO Madhav Sheth recently tweeted...    technology  


In [5]:
# Number of scraped articles per category.
data_df['news_category'].value_counts()

sports        25
technology    25
world         25
Name: news_category, dtype: int64

## Text Wrangling & Pre-processing

In [6]:
# NLTK provides the English stop-word list used below; if the corpus is missing, run nltk.download('stopwords') once

import nltk
#nltk.download()

In [7]:
from nltk.tokenize.toktok import ToktokTokenizer
import re
from contractions import CONTRACTION_MAP
import unicodedata
import spacy # For importing English Language Model

# Shared NLP resources for the pre-processing helpers.
# NOTE(review): the 'en' shortcut name and the parse/tag/entity keyword
# arguments look like legacy spaCy usage — confirm they are accepted by the
# installed spaCy version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
# Start from NLTK's English stop words, then take 'no' and 'not' out of the
# list (presumably so negations survive stop-word removal — confirm against
# the later pre-processing steps).
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

#### Removing HTML tags

In [8]:
def remove_html(text):
    """Return only the text content of ``text``, dropping any HTML markup.

    The input is parsed with BeautifulSoup's ``'html.parser'`` backend and
    the text of the resulting tree is returned; whitespace inside the tags
    is kept as-is.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

remove_html('<h2> HeLLo World </h2><br>')

' HeLLo World '

#### Removing accented characters

In [13]:
def remove_accent(text):
    """Return ``text`` with accented characters reduced to plain ASCII.

    The string is NFKD-normalised, which splits an accented letter into its
    base letter plus combining marks; encoding to ASCII with errors ignored
    then drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
remove_accent('Sómě těxt')

'Some text'