In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline


In [6]:
# Inshorts category pages to scrape; build_datasets uses the last path
# segment of each URL as the category label.
seed_urls = [
    "https://inshorts.com/en/read/technology",
    "https://inshorts.com/en/read/sports",
    "https://inshorts.com/en/read/world",
]

def build_datasets(seed_urls, timeout=30):
    """Scrape headline/article pairs from each seed URL into one DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Pages to scrape. The last ``/``-separated segment of each URL is
        used as the ``news_category`` value for every article on that page.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per scraped card with the columns ``news_headline``,
        ``news_article`` and ``news_category``. The frame is empty (but still
        has those columns) when nothing was scraped.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        news_category = url.split("/")[-1]
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        headline_boxes = soup.find_all('div', class_=["news-card-title news-right-box"])
        article_boxes = soup.find_all('div', class_=["news-card-content news-right-box"])
        for headline_box, article_box in zip(headline_boxes, article_boxes):
            headline_tag = headline_box.find('span', attrs={'itemprop': "headline"})
            article_tag = article_box.find('div', attrs={'itemprop': "articleBody"})
            # find() returns None when a card lacks the expected tag; record a
            # missing value rather than crashing the whole scrape.
            news_data.append({
                'news_headline': headline_tag.string if headline_tag is not None else None,
                'news_article': article_tag.string if article_tag is not None else None,
                'news_category': news_category,
            })

    # Passing ``columns`` fixes the column order and keeps the schema even
    # when no article was collected.
    return pd.DataFrame(news_data, columns=columns)

In [7]:
# Scrape every seed page, then show how many articles each category yielded.
news_df = build_datasets(seed_urls)
news_df['news_category'].value_counts()



sports        25
world         25
technology    24
Name: news_category, dtype: int64

In [11]:
import spacy
import nltk
from nltk.tokenize import ToktokTokenizer
import re
from contractions import CONTRACTION_MAP
import unicodedata

# Load the medium English spaCy pipeline. The former parse/tag/entity keyword
# flags are dropped: newer spaCy releases do not accept them, and the
# pipeline's default components are loaded without them.
nlp = spacy.load('en_core_web_md')
tokenizer = ToktokTokenizer()
# English stopwords minus "no" and "not" (presumably so negations survive
# stopword removal -- confirm against the later stopword step). Filtering
# instead of list.remove() avoids a ValueError if a word is absent.
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in ("no", "not")]

In [14]:
def strip_html_tags(text):
    """Return the text content of ``text`` with its HTML markup removed."""
    return BeautifulSoup(text, "html.parser").get_text()
# Sanity check for strip_html_tags. This is not the last expression in the
# cell, so its return value is not displayed.
strip_html_tags('<html><h2>Some important text</h2></html>')
def remove_accented_chars(text):
    """Return ``text`` with accented characters reduced to plain ASCII.

    The text is decomposed (NFKD) so each accent becomes a separate combining
    mark, then everything that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Demo: each accented letter should come back as its unaccented ASCII letter.
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [21]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : str
        Text that may contain contractions.
    contraction_mapping : dict, optional
        Maps a contraction to its expansion. Matching is case-insensitive.

    Returns
    -------
    str
        The text with every known contraction expanded (the first character
        of each match keeps its original casing) and all remaining
        apostrophes removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not pre-empt a longer key that starts with
    # it. Empty keys are skipped because they would match at every position.
    keys_longest_first = sorted((key for key in contraction_mapping if key),
                                key=len, reverse=True)
    if not keys_longest_first:
        # Nothing to expand; still strip apostrophes for a consistent result.
        return re.sub("'", "", text)

    # Keys are literal text, so escape them before building the pattern.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys_longest_first)),
        flags=re.IGNORECASE | re.DOTALL)

    # Fallback table for matches whose casing differs from the stored key.
    lowercase_mapping = {key.lower(): value
                         for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the casing of the first character as it was written.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)

# Demo: each contraction in the sentence should come back expanded.
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [22]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When True, digits are removed as well.

    Returns
    -------
    str
        The cleaned text.
    """
    # The upper bound must be ``Z``: the range ``A-z`` also covers the ASCII
    # punctuation that sits between ``Z`` and ``a`` (brackets, backslash,
    # caret, underscore, backtick), which would otherwise survive the cleanup.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Demo: punctuation and, with remove_digits=True, digits are stripped.
remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

## Stemming
Stemming reduces a word to its root stem by stripping affixes. The root stem may not be lexicographically correct: a stemmed word may not be a real, semantically meaningful word and may not be present in the dictionary.

In [29]:
# Sample sentence passed to the stemming and lemmatization demos below.
text = "Those cars are crashing each others while driving, i think?"
def basic_stemmer(text):
    """Porter-stem every whitespace-separated word of ``text``.

    Words are split on whitespace only, so punctuation stays attached to the
    word it touches. The stems are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)
# Demo: stem the sample sentence.
basic_stemmer(text)

'those car are crash each other while driving, i think?'

## Lemmatization
Lemmatization also reduces a word to its root form, but the root word (the lemma) is always a lexicographically correct word (present in the dictionary), whereas a root stem may not be.


In [32]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    Tokens whose lemma is the placeholder ``-PRON-`` keep their original
    text. The resulting tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)
# Demo: lemmatize the sample sentence.
lemmatize_text(text)

'those car be crash each other while drive , i think ?'

## TODO: Removing stopwords

##T