In [1]:
# Tabular data handling.
import pandas as pd
import math  # NOTE(review): not referenced in the cells visible here

# clean-text: text normalization, called from clean_content().
from cleantext import clean

# NLTK stopword corpus, used to build the stop_words set.
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")  # fetches the corpus if it is not already present locally

# Snowball stemmer used by stem_lemmatize().
# The 'punkt' download presumably backs nltk.word_tokenize -- TODO confirm.
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
snowball_stemmer = SnowballStemmer(language="english")


# spaCy English pipeline; stem_lemmatize() reads token.lemma_ from its output.
import spacy
sp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean each news content
To clean the training data, we take the following steps with the help of the [clean-text](https://pypi.org/project/clean-text/) Python package:
- lowercase all words
- replace numbers with the `<NUMBER>` token
- replace all URLs with the `<URL>` token
- remove line breaks and fix Unicode errors, such as tabs and other unnecessary escape characters



In [2]:
# Load the raw articles. lines=True reads each file as JSON Lines
# (one JSON record per line).
bdf = pd.read_json(path_or_buf="raw/basketball-nba.jl", lines=True)
fdf = pd.read_json(path_or_buf="raw/football-talksports.jl", lines=True)

def clean_content(df, name):
    """Normalize the ``content`` column of ``df`` in place using clean-text.

    Every value of ``df["content"]`` is passed through ``cleantext.clean`` with
    the options listed below, and the column is then replaced with the cleaned
    strings. Progress is printed for the first row and every 1000th row after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column holding the article text. It is
        modified in place; nothing is returned.
    name : str
        Label used only in the progress messages.
    """
    total = len(df)
    cleaned = []
    # Walk the column by position instead of df.iterrows() + df.at[label]:
    # pandas documents that a frame should not be modified while it is being
    # iterated, the progress check must not depend on the index being a
    # 0..n-1 integer range, and a label-based write would hit every row that
    # shares a duplicated label.
    for pos, text in enumerate(df["content"]):
        if pos % 1000 == 0:
            print("[CleanContent] {} {:.2f}%".format(name, ((pos + 1) / total) * 100))
        cleaned.append(clean(text,
            fix_unicode=True,               # fix various unicode errors
            to_ascii=True,                  # transliterate to closest ASCII representation
            lower=True,                     # lowercase text
            no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
            no_urls=True,                   # replace all URLs with a special token
            no_emails=False,                # replace all email addresses with a special token
            no_phone_numbers=False,         # replace all phone numbers with a special token
            no_numbers=True,                # replace all numbers with a special token
            no_digits=True,                 # replace all digits with a special token
            no_currency_symbols=False,      # replace all currency symbols with a special token
            no_punct=False,                 # remove punctuations
            replace_with_url="<URL>",
            replace_with_number="<NUMBER>",
            lang="en"
        ))
    # Single positional column assignment; callers still see df mutated in place.
    df["content"] = cleaned

# Clean both corpora in place.
clean_content(df=bdf, name="basketball news")
clean_content(df=fdf, name="football news")

# Checkpoint the cleaned frames as JSON Lines (one record per line).
bdf.to_json(path_or_buf="clean/basketball-cleaned.jl", orient="records", lines=True)
fdf.to_json(path_or_buf="clean/football-cleaned.jl", orient="records", lines=True)


[CleanContent] basketball news 0.01%
[CleanContent] basketball news 10.01%
[CleanContent] basketball news 20.02%
[CleanContent] basketball news 30.02%
[CleanContent] basketball news 40.03%
[CleanContent] basketball news 50.03%
[CleanContent] basketball news 60.03%
[CleanContent] basketball news 70.04%
[CleanContent] basketball news 80.04%
[CleanContent] basketball news 90.05%
[CleanContent] football news 0.00%
[CleanContent] football news 1.91%
[CleanContent] football news 3.83%
[CleanContent] football news 5.74%
[CleanContent] football news 7.65%
[CleanContent] football news 9.56%
[CleanContent] football news 11.47%
[CleanContent] football news 13.38%
[CleanContent] football news 15.30%
[CleanContent] football news 17.21%
[CleanContent] football news 19.12%
[CleanContent] football news 21.03%
[CleanContent] football news 22.94%
[CleanContent] football news 24.86%
[CleanContent] football news 26.77%
[CleanContent] football news 28.68%
[CleanContent] football news 30.59%
[CleanContent] 

### Remove stopwords
Some words are used far more frequently than others in a language, yet they do not necessarily add value to a sentence, so we can safely ignore them by removing them from our text.
To remove these stop words we could use NLTK, Gensim, or spaCy, each of which has a different set of stopwords. I chose NLTK.


In [3]:
# NLTK's English stopword list, stored as a set for fast membership tests.
stop_words = {*stopwords.words("english")}

def del_stopwords(df, name):
    """Remove stopwords from the ``content`` column of ``df`` in place.

    Each text is split on whitespace, tokens found in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are re-joined
    with single spaces. Matching is exact, so a stopword with punctuation
    attached to it (for example ``"the,"``) is kept. Progress is printed for
    the first row and every 1000th row after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``content`` column. It is modified in place;
        nothing is returned.
    name : str
        Label used only in the progress messages.
    """
    total = len(df)
    filtered = []
    # Walk the column by position instead of df.iterrows() + df.at[label]:
    # the frame is not modified while being iterated, the progress check does
    # not depend on integer index labels, and duplicated labels cannot cause
    # one row's result to overwrite another's.
    for pos, text in enumerate(df["content"]):
        if pos % 1000 == 0:
            print("[DelStopwords] {} {:.2f}%".format(name, ((pos + 1) / total) * 100))
        filtered.append(" ".join(word for word in text.split() if word not in stop_words))
    # Single positional column assignment; callers still see df mutated in place.
    df["content"] = filtered

# Strip stopwords from both corpora in place.
del_stopwords(df=bdf, name="basketball news")
del_stopwords(df=fdf, name="football news")

# Checkpoint the stopword-free frames as JSON Lines (one record per line).
bdf.to_json(path_or_buf="clean/basketball-del-stopwords.jl", orient="records", lines=True)
fdf.to_json(path_or_buf="clean/football-del-stopwords.jl", orient="records", lines=True)


[DelStopwords] basketball news 0.01%
[DelStopwords] basketball news 10.01%
[DelStopwords] basketball news 20.02%
[DelStopwords] basketball news 30.02%
[DelStopwords] basketball news 40.03%
[DelStopwords] basketball news 50.03%
[DelStopwords] basketball news 60.03%
[DelStopwords] basketball news 70.04%
[DelStopwords] basketball news 80.04%
[DelStopwords] basketball news 90.05%
[DelStopwords] football news 0.00%
[DelStopwords] football news 1.91%
[DelStopwords] football news 3.83%
[DelStopwords] football news 5.74%
[DelStopwords] football news 7.65%
[DelStopwords] football news 9.56%
[DelStopwords] football news 11.47%
[DelStopwords] football news 13.38%
[DelStopwords] football news 15.30%
[DelStopwords] football news 17.21%
[DelStopwords] football news 19.12%
[DelStopwords] football news 21.03%
[DelStopwords] football news 22.94%
[DelStopwords] football news 24.86%
[DelStopwords] football news 26.77%
[DelStopwords] football news 28.68%
[DelStopwords] football news 30.59%
[DelStopwords] 

### Stemming & Lemmatization
- Stemming: In linguistic morphology and information retrieval, stemming is the process of reducing inflected words to their word stem, base, or root form — generally a written word form.
- Lemmatization: Lemmatisation in linguistics is the process of grouping together the inflected forms of a word so they can be analyzed as a single item, identified by the word’s lemma, or dictionary form.

I used the Snowball stemmer, also known as the Porter2 stemming algorithm, instead of the original Porter stemmer. It is almost universally accepted as better than the Porter stemmer, even being acknowledged as such by the individual who created the Porter stemmer.

I also used spaCy to lemmatize the corpus.


In [4]:
def stem_lemmatize(df, name):
    """Stem, then lemmatize, the ``content`` column of ``df`` in place.

    For every text: tokenize with ``nltk.word_tokenize``, stem each token with
    the module-level ``snowball_stemmer``, join the stems with spaces, run the
    result through the spaCy pipeline ``sp``, and join the ``lemma_`` of each
    spaCy token with spaces. Progress is printed for the first row and every
    1000th row after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``content`` column. It is modified in place;
        nothing is returned.
    name : str
        Label used only in the progress messages.
    """
    total = len(df)
    processed = []
    # Walk the column by position instead of df.iterrows() + df.at[label]:
    # the frame is not modified while being iterated, the progress check does
    # not depend on integer index labels, and duplicated labels cannot cause
    # one row's result to overwrite another's.
    for pos, text in enumerate(df["content"]):
        if pos % 1000 == 0:
            print("[Stem & Lemmatize] {} {:.2f}%".format(name, ((pos + 1) / total) * 100))
        stems = [snowball_stemmer.stem(token) for token in nltk.word_tokenize(text)]
        # Note: the lemmatizer sees the already-stemmed text, not the original words.
        doc = sp(" ".join(stems))
        processed.append(" ".join(token.lemma_ for token in doc))
    # Single positional column assignment; callers still see df mutated in place.
    df["content"] = processed

# Stem and lemmatize both corpora in place.
stem_lemmatize(df=bdf, name="basketball news")
stem_lemmatize(df=fdf, name="football news")



[Stem & Lemmatize] basketball news 0.01%
[Stem & Lemmatize] basketball news 10.01%
[Stem & Lemmatize] basketball news 20.02%
[Stem & Lemmatize] basketball news 30.02%
[Stem & Lemmatize] basketball news 40.03%
[Stem & Lemmatize] basketball news 50.03%
[Stem & Lemmatize] basketball news 60.03%
[Stem & Lemmatize] basketball news 70.04%
[Stem & Lemmatize] basketball news 80.04%
[Stem & Lemmatize] basketball news 90.05%
[Stem & Lemmatize] football news 0.00%
[Stem & Lemmatize] football news 1.91%
[Stem & Lemmatize] football news 3.83%
[Stem & Lemmatize] football news 5.74%
[Stem & Lemmatize] football news 7.65%
[Stem & Lemmatize] football news 9.56%
[Stem & Lemmatize] football news 11.47%
[Stem & Lemmatize] football news 13.38%
[Stem & Lemmatize] football news 15.30%
[Stem & Lemmatize] football news 17.21%
[Stem & Lemmatize] football news 19.12%
[Stem & Lemmatize] football news 21.03%
[Stem & Lemmatize] football news 22.94%
[Stem & Lemmatize] football news 24.86%
[Stem & Lemmatize] football

In [None]:
# Persist the stemmed and lemmatized frames as JSON Lines (one record per line).
bdf.to_json(path_or_buf="clean/basketball-stem-lem.jl", orient="records", lines=True)
fdf.to_json(path_or_buf="clean/football-stem-lem.jl", orient="records", lines=True)