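# Extracting Top Keywords from a News Article
In this notebook, we extract the most frequent keywords from a news article by lowercasing the text, tokenizing it, removing stop words, stemming the remaining tokens, and counting how often each stem occurs.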

In [1]:
import operator

from nltk.tokenize import WhitespaceTokenizer
from nltk import download, stem

# The statement below downloads the NLTK stop-word list into the
# 'corpora/stopwords/' folder of the nltk_data directory on your computer.
download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ankit.bhatia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_file(file_path):
    """Read a UTF-8 encoded text file and return its contents as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete text of the file, with line breaks preserved.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle as soon as reading finishes (or
    # fails) instead of leaving it open until garbage collection.
    with open(file_path, encoding='utf-8') as news_file:
        return news_file.read()

# Lowercasing helper: takes a string and hands back a copy in which every
# cased character has been converted to lowercase.
def to_lower_case(text):
    """Return `text` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Shared tokenizer instance, created once and reused by tokenize_text().
wht = WhitespaceTokenizer()
def tokenize_text(text):
    """Split `text` into a list of tokens using the shared whitespace tokenizer.

    Punctuation is not separated from words: the notebook's final output
    contains the token 'poland’' with its trailing apostrophe still attached.
    """
    tokens = wht.tokenize(text)
    return tokens

# English stop words from the NLTK corpus, kept as the list NLTK returns so
# any other code that reads `stop_words` still sees the same object type.
stop_words = stopwords.words('english')
def remove_stop_words(token_list):
    """Return the tokens of `token_list` that are not English stop words.

    Args:
        token_list: Iterable of string tokens. Matching is exact, so tokens
            must already be lowercased and free of attached punctuation to
            be recognised as stop words.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Build a set for O(1) membership tests; checking against the list scans
    # it once per token. Rebuilding on every call keeps later edits to
    # `stop_words` effective.
    stop_word_set = set(stop_words)
    return [word for word in token_list if word not in stop_word_set]


# Shared Porter stemmer instance, created once and reused by get_stems().
stemmer = stem.PorterStemmer()
def get_stems(token_list):
    """Return a list holding the Porter stem of every token in `token_list`.

    The output has one entry per input token, in the same order.
    """
    stemmed_tokens = []
    for token in token_list:
        stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens

# Builds a frequency table: maps each distinct token to the number of times
# it appears in the given list.
def get_freq(stems):
    """Count occurrences of each token in `stems`.

    Leading and trailing whitespace is stripped from every token before it
    is counted, so tokens that differ only in surrounding whitespace share
    one entry. Keys appear in order of first occurrence.
    """
    counts = {}
    for token in stems:
        key = token.strip()
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

# Ranks the dictionary's keys by their values, largest first, and returns
# the first n keys of that ranking.
def get_top_n_words(freq_dict, n):
    """Return the `n` keys of `freq_dict` that have the highest values.

    Keys with equal values keep the relative order they have in
    `freq_dict`, because the sort is stable. The ranking is cut with a
    plain slice, so `n` follows Python slice semantics.
    """
    ranked = list(freq_dict.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_words = []
    for word, _count in ranked[:n]:
        top_words.append(word)
    return top_words


In [3]:
# Location of the input article, given relative to the directory the
# notebook is run from.
path = "../data/news_article.txt"
# Read the whole article into a single string.
news_article = load_file(path)

In [4]:
# Step 1: lowercase the article so later comparisons against the lowercase
# stop-word list are not defeated by capitalisation.
lower_case_news_art = to_lower_case(text=news_article)

In [5]:
# Step 2: split the lowercased text into a list of tokens.
tokens = tokenize_text(lower_case_news_art)

In [6]:
# Step 3: drop stop words. Despite its name, `removed_tokens` holds the
# tokens that remain after the stop words have been removed.
removed_tokens = remove_stop_words(tokens)

In [7]:
# Step 4: reduce each remaining token to its Porter stem.
stems = get_stems(removed_tokens)

In [8]:
# Step 5: count how many times each stem occurs.
freq_dict = get_freq(stems)

In [9]:
# Step 6: keep the 6 most frequent stems. The bare expression on the last
# line makes the notebook display the resulting list.
top_keywords = get_top_n_words(freq_dict, 6)
top_keywords

['law', 'justic', 'european', 'parti', 'took', 'poland’']