In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline


In [5]:
seed_urls = ["https://inshorts.com/en/read/technology",
             "https://inshorts.com/en/read/sports",
             "https://inshorts.com/en/read/world"]

def build_datasets(seed_urls):
    """Scrape headline/article pairs from each seed URL into one DataFrame.

    The category of every article is taken from the last path segment of the
    URL it was scraped from (e.g. ``.../read/sports`` -> ``"sports"``).

    Parameters
    ----------
    seed_urls : iterable of str
        Category pages to download and parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article`` and ``news_category``, in
        that order. The frame is empty (but still has these columns) when no
        article was found or ``seed_urls`` is empty.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched (timeout, connection error, or an HTTP
        error status).
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        news_category = url.split("/")[-1]
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() surfaces HTTP errors instead of silently parsing
        # an error page into zero articles.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        headline_divs = soup.find_all('div', class_=["news-card-title news-right-box"])
        article_divs = soup.find_all('div', class_=["news-card-content news-right-box"])

        # Headline and body containers are paired positionally; zip() stops
        # at the shorter of the two lists.
        news_data.extend(
            {
                'news_headline': headline.find('span', attrs={'itemprop': "headline"}).string,
                'news_article': article.find('div', attrs={'itemprop': "articleBody"}).string,
                'news_category': news_category,
            }
            for headline, article in zip(headline_divs, article_divs)
        )

    # Passing `columns` fixes the column order and also yields a well-formed
    # empty frame when nothing was scraped (column selection on an empty
    # frame would raise KeyError).
    return pd.DataFrame(news_data, columns=columns)

In [6]:
news_df = build_datasets(seed_urls)
news_df.news_category.value_counts()



world         25
sports        25
technology    24
Name: news_category, dtype: int64

In [7]:
import spacy
import nltk
from nltk.tokenize import ToktokTokenizer
import re
from contractions import CONTRACTION_MAP
import unicodedata

# Load the medium English spaCy pipeline. The former parse/tag/entity keyword
# arguments are not part of spacy.load's documented signature, so they are
# dropped here.
nlp = spacy.load('en_core_web_md')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Take the negations out of the stopword list so that stopword removal
# leaves "no" and "not" in the text.
stopword_list.remove("no")
stopword_list.remove("not")

In [8]:
def strip_html_tags(text):
    """Return the text content of `text` as extracted by BeautifulSoup.

    The input is parsed with the built-in "html.parser" backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
strip_html_tags('<html><h2>Some important text</h2></html>')
def remove_accented_chars(text):
    """Reduce `text` to plain ASCII, keeping base letters of accented chars.

    The string is NFKD-normalized (splitting accented characters into a base
    character plus combining marks), then encoded to ASCII with unencodable
    characters ignored, and finally decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [9]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in `text` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict of str -> str
        Maps a contraction to its expansion. Matching is case-insensitive;
        the lookup tries the matched text as-is, then its lowercase form.

    Returns
    -------
    str
        `text` with every known contraction replaced by its expansion. The
        first character of the original match is kept, so the original
        capitalization of that character survives. All remaining ``'``
        characters are removed.
    """
    if contraction_mapping:
        # Longest keys first: regex alternation takes the first alternative
        # that matches, so a short key listed before a longer key that
        # starts with it would shadow the longer one and leave its tail
        # behind. Keys are escaped so they are always matched literally.
        keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower()))
            if not expanded_contraction:
                # Case-insensitive hit with no usable entry (e.g. the key is
                # stored in mixed case): leave the text untouched rather than
                # failing on a None expansion.
                return match
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)
    else:
        # An empty mapping would compile to a pattern that matches the empty
        # string at every position; skip substitution entirely.
        expanded_text = text

    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [10]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When False, digits 0-9 are kept as well; when True they are removed.

    Returns
    -------
    str
        `text` with all other characters deleted (whitespace is left as-is).
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

## Stemming
With stemming, the root stems may not be lexicographically correct: a stemmed word may not be semantically correct and may not be present in the dictionary.

In [11]:
text = "Those cars are crashing each others while driving, i think?"
def basic_stemmer(text):
    """Stem every whitespace-separated word of `text` with the Porter stemmer.

    The text is split on whitespace only, so punctuation attached to a word
    is passed to the stemmer as part of that word. The stems are joined back
    with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)
basic_stemmer(text)

'those car are crash each other while driving, i think?'

## Lemmatization
With lemmatization, the root word (the lemma) is always a lexicographically correct word (present in the dictionary), whereas the root stem produced by stemming may not be.


In [12]:
def lemmatize_text(text):
    """Replace each token of `text` with its lemma from the `nlp` pipeline.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text.
    The resulting tokens are joined with single spaces, so punctuation ends
    up as separate space-delimited tokens.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)
lemmatize_text(text)

'those car be crash each other while drive , i think ?'

## Removing stopwords

In [19]:
def remove_stopwords(text, is_lower_case=False):
    """Drop tokens of `text` that appear in the module-level `stopword_list`.

    The text is tokenized with the module-level `tokenizer` and each token is
    stripped of surrounding whitespace. When `is_lower_case` is True, tokens
    are compared against the stopword list as-is; otherwise they are
    lowercased for the comparison only. Surviving tokens are returned joined
    by single spaces, in their original casing.
    """
    stripped_tokens = [tok.strip() for tok in tokenizer.tokenize(text)]

    kept = []
    for tok in stripped_tokens:
        lookup_key = tok if is_lower_case else tok.lower()
        if lookup_key not in stopword_list:
            kept.append(tok)

    return ' '.join(kept)

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

## Normalizing corpus

In [21]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                    accented_char_removal=True, text_lower_case=True,
                    text_lemmatization=True, special_char_removal=True,
                    stopword_removal=True, remove_digits=True):
    """Run the text-cleaning pipeline over every document in `corpus`.

    Steps are applied in this fixed order; each flag switches one step off:
    HTML stripping, accented-character removal, contraction expansion,
    lowercasing, newline collapsing (always), lemmatization, special
    character removal, whitespace collapsing (always), stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    html_stripping, contraction_expansion, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool
        Enable/disable the corresponding step.
    remove_digits : bool
        Passed to `remove_special_characters`; only has an effect when
        `special_char_removal` is True.

    Returns
    -------
    list of str
        One normalized document per input document, in input order.
    """
    # Compiled once: these patterns do not depend on the document.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): inside the class, "(-)" is a range from "(" to ")", so a
    # literal hyphen is NOT padded with spaces here - confirm that is intended.
    special_char_pattern = re.compile(r'([{.(-)!}])')

    normalized_corpus = []

    # normalize each document in corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)

        # remove accented char
        if accented_char_removal:
            doc = remove_accented_chars(doc)

        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # Replace runs of line breaks with one space. The class holds only
        # \r and \n; a "|" inside a character class is a literal pipe and
        # would make pipe characters be replaced too.
        doc = newline_pattern.sub(' ', doc)

        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)

        # remove special char and\or digits
        if special_char_removal:
            # Pad the listed punctuation with spaces first so that the words
            # on either side do not fuse together once it is deleted.
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus
            

In [22]:
# combining headline and article text
news_df['full_text'] = news_df["news_headline"].map(str)+'. ' + news_df["news_article"]

# pre-process text and store the same
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])

# show a sample news article
news_df.iloc[1][['full_text', 'clean_text']].to_dict()



In [23]:
news_df.to_csv('tech_news.csv', index=False, encoding='utf-8')

## Tagging parts of speech (POS)

In [35]:
# create a basic pre-processed corpus; we do not lowercase, so that casing is preserved as context for POS tagging
corpus = normalize_corpus(news_df['full_text'], text_lower_case=False,
                         text_lemmatization=False, special_char_removal=False)

# demo for POS tagging for sample news headline
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

In [36]:
# POS tagging with spacy
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,Was,VBD,VERB
1,fixed,VBN,VERB
2,in,IN,ADP
3,2017,CD,NUM
4,:,:,PUNCT
5,Google,NNP,PROPN
6,on,IN,ADP
7,',``,PUNCT
8,Maps,NNS,NOUN
9,fooled,VBD,VERB


In [37]:


# POS tagging with nltk
nltk_pos_tagged = nltk.pos_tag(sentence.split(),)
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,Was,NNP
1,fixed,VBN
2,in,IN
3,2017:,CD
4,Google,NNP
5,on,IN
6,'Maps,NNS
7,fooled,VBN
8,you',JJ
9,banner,NN


## TODO: Shallow parsing