# Prepare

In [43]:
#  data manipulation
import pandas as pd
import numpy as np

# natural language processing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import re

# split data
from sklearn.model_selection import train_test_split

# Quiet all warnings
import warnings
warnings.filterwarnings('ignore')

**Get data**

In [18]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
from pprint import pprint
from datetime import datetime as dt


def get_news_articles():
    url = "https://inshorts.com/en/read"
    # set different categories to perse through
    category = ["business", "sports", "technology", "entertainment"]

    articles = {}
    df_setup = []
    for cat in category:
        # read the url
        res = get(url + "/" + category[0])
        print(res)

        # create a beautiful soup object
        soup_parser = BeautifulSoup(res.content, 'html.parser').body

        soup = soup_parser.find_all("span", itemprop="mainEntityOfPage")
        for i in range(len(soup)):
            link = soup[i]["itemid"]

            article = get(link)
            article_soup = BeautifulSoup(article.content,"html.parser").body

            article_title = article_soup.find('span', itemprop='headline').text
            article_body = article_soup.find('div', itemprop='articleBody').text
            # articles[f"article {cat} {i}"] = [article_title, cat ,link ,article_body]

            article_instance = {
                'title': article_title,
                'content': article_body,
                'category': cat,
            }

            df_setup.append(article_instance)
    return pd.DataFrame(df_setup)

In [21]:
rating = get_news_articles()
rating.head()

Unnamed: 0,title,content,category
0,Burger King to face US lawsuit claiming its Wh...,A US judge has rejected Burger King's bid to d...,business
1,SoftBank to sell 1.17% stake in Zomato for ₹94...,SoftBank Vision Fund is likely to offload the ...,business
2,India is a 30:30:30 story: Union Minister Piyu...,Union Minister Piyush Goyal said when he think...,business
3,Sri Lanka to import 92.1 mn eggs from India to...,Sri Lanka will import 92.1 million eggs from I...,business
4,How much will LPG cylinder cost in major citie...,The government has slashed the price of domest...,business


**Clean strings**

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [30]:
def clean(string):
    """
    This function puts a string in lowercase, normalizes any unicode characters, removes anything that         
    isn't an alphanumeric symbol or single quote.
    """
    # Normalize unicode characters
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # Remove unwanted characters and put string in lowercase
    string = re.sub(r"[^\w0-9'\s]", '', string).lower()
            
    return string

**Lemmatize**
- Apply lemmatization to the data

In [6]:
def lemmatize(string):
    """
    This function takes in a string, lemmatizes each word, and returns a lemmatized version of the orignal string
    """
    # Build the lemmatizer
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    # Run the lemmatizer on each word after splitting the input string, store results in the 'results' list
    results = []
    for word in string.split():
        results.append(lemmatizer.lemmatize(word))
    
    # Convert results back into a string
    string = ' '.join(results)
    
    # Return the resulting string
    return string


**Remove stop words**
- Remove all stop words from the data

In [7]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Takes in a string, with optional arguments for words to add to stock stopwords and words to ignore in the 
    stock list removes the stopwords, and returns a stopword free version of the original string
    """
    # Get the list of stopwords from nltk
    stopword_list = stopwords.words('english')
    
    # Create a set of stopwords to exclude
    excluded_stopwords = set(exclude_words) if exclude_words else set()
    
    # Include any extra words in the stopwords to exclude
    stopwords_to_exclude = set(stopword_list) - excluded_stopwords
    
    # Add extra words to the stopwords set
    stopwords_to_exclude |= set(extra_words) if extra_words else set()
    
    # Tokenize the input string
    words = string.split()
    
    # Filter out stopwords from the tokenized words
    filtered_words = [word for word in words if word not in stopwords_to_exclude]
    
    # Convert back to string
    string = ' '.join(filtered_words)
    
    # Return the resulting string
    return string

**Split data**
- Apply a 60% training, 20% Validation, and 20% testing split to the data. (Random state 95)

In [38]:
def split_readmes(df):
    """
    Takes in a dataframe and performs a 70/15/15 split. Outputs a train, validate, and test dataframe
    """
    # Perfrom a 70/15/15 split
    train_val, test = train_test_split(df, test_size=.2, random_state=95)
    train, validate = train_test_split(train_val, test_size=.25, random_state=95)
    
    # Return the dataframe slices
    return train, validate, test

**Prepare data**

In [39]:
def prep_readmes(df, col:str="content_column"):
    """
    Takes in the dataframe and the column name that contains the corpus data, creates a column of cleaned data, then uses that 
    to create a column without stopwords that is lemmatized, performs a train-validate-test split, and returns train, validate,
    and test.
    """
    # Initialize a list to collect cleaned elements in the for-loop below
    cleaned_row = []
    
    # Iterate through the readme_content values...
    for i in df[col].values:
        
        # Clean each value in the column and append to the 'cleaned_row' list
        cleaned_row.append(clean(i))
        
    # Assign the clean row content to a new column in the dataframe named 'cleaned_content
    df = df.assign(cleaned_content=cleaned_row)
    
    # Using a lambda, lemmatize all values in the 'cleaned_content' column and assign to a new column called 'lemmatized'
    df['lemmatized'] = df['cleaned_content'].apply(lambda x: lemmatize(remove_stopwords(x)))
    
    # Split the dataframe (70/15/15)
    train, validate, test = split_readmes(df)
    
    # Return train, validate, and test dataframes
    return train, validate, test

In [40]:
train, val, test = prep_readmes(rating, "content")

In [41]:
train.shape, val.shape, test.shape

((24, 5), (8, 5), (8, 5))

In [42]:
train.head()

Unnamed: 0,title,content,category,cleaned_content,lemmatized
4,How much will LPG cylinder cost in major citie...,The government has slashed the price of domest...,business,the government has slashed the price of domest...,government slashed price domestic lpg cylinder...
11,SoftBank to sell 1.17% stake in Zomato for ₹94...,SoftBank Vision Fund is likely to offload the ...,sports,softbank vision fund is likely to offload the ...,softbank vision fund likely offload 117 stake ...
33,Sri Lanka to import 92.1 mn eggs from India to...,Sri Lanka will import 92.1 million eggs from I...,entertainment,sri lanka will import 921 million eggs from in...,sri lanka import 921 million egg india counter...
3,Sri Lanka to import 92.1 mn eggs from India to...,Sri Lanka will import 92.1 million eggs from I...,business,sri lanka will import 921 million eggs from in...,sri lanka import 921 million egg india counter...
39,Influencers mis-sell saying they can make you ...,Zerodha CEO Nithin Kamath said influencers mis...,entertainment,zerodha ceo nithin kamath said influencers mis...,zerodha ceo nithin kamath said influencers mis...


## Preparation actions taken

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.
- Lemmatize
- Remove stop words
- 60, 20, 20 split