# Extending dictionary of terms

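The following code searches the web for terms related to a given set of seed terms: it runs a Google search for each seed term, downloads the top results, and counts the stemmed words that appear on those pages.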

In [1]:
import nltk

# Fetch the NLTK data this notebook relies on: the stop-word lists read via
# `stopwords.words('english')` and the 'punkt' tokenizer data (presumably the
# models `word_tokenize` needs). Already-installed packages are not downloaded
# again, as the cell output shows.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/milos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/milos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from functools import reduce
from itertools import chain
import time

from collections import Counter
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
from googlesearch import search

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


def flatmap(func, collection):
    """Apply *func* to every element of *collection* and flatten one level.

    Args:
        func: Callable that returns an iterable for each element.
        collection: Iterable of inputs for *func*.

    Returns:
        A lazy iterator over the items of every iterable produced by *func*,
        in input order. An empty *collection* yields an empty iterator.
    """
    # chain.from_iterable copes with empty input, always hands back an
    # iterator (even for a single element), and avoids building one nested
    # chain object per element as a reduce over chain() would.
    return chain.from_iterable(map(func, collection))

def get_search_results(term, n=20):
    """Search Google for *term* and return the top *n* results.

    The query is delegated to `googlesearch.search` with the page size and
    the stop index both set to *n*, restricted to the 'com' domain and
    English, with a pause value of 2 (presumably seconds between requests;
    see the googlesearch documentation).

    Returns:
        Whatever `search` returns; it is not materialised here.
    """
    search_options = {
        'num': n,
        'stop': n,
        'tld': 'com',
        'lang': 'en',
        'pause': 2,
    }
    return search(term, **search_options)

def get_many_search_results(*terms, top_result_count=20):
    """Search Google for each of *terms* and collect all results in one list.

    Args:
        *terms: Search terms; each one is passed to `get_search_results`.
        top_result_count: Number of top results requested per term.

    Returns:
        A list with the results for the first term, then the second, and so
        on. Duplicates across terms are kept. With no terms the list is
        empty.
    """
    # Flatten lazily with chain.from_iterable so that calling this with no
    # terms returns [] instead of failing inside a reduce over nothing.
    per_term_results = (
        get_search_results(term, n=top_result_count) for term in terms
    )
    return list(chain.from_iterable(per_term_results))

def get_webpage(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the resource to request.
        timeout: Seconds `requests` waits for the server before giving up.
            Without a timeout a single unresponsive host would block the
            caller indefinitely.

    Returns:
        A BeautifulSoup object built from the response text with the
        built-in 'html.parser'.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            `requests.exceptions.Timeout` when *timeout* elapses.

    Note:
        The HTTP status code is not checked, so the body of an error page is
        parsed like any other response.
    """
    response = requests.get(
        url,
        headers={'User-agent': 'Dictionary extender spider'},
        timeout=timeout,
    )
    return BeautifulSoup(response.text, 'html.parser')

def get_wordcounts(url):
    """Fetch *url* and count the occurrences of each stemmed word on the page.

    The page text is tokenized and lower-cased; English stop words and tokens
    that are not purely alphanumeric (e.g. punctuation) are discarded; the
    remaining tokens are reduced to their Porter stem before counting.

    Returns:
        A Counter mapping each stem to the number of times it occurred.
    """
    stemmer = PorterStemmer()
    ignored_words = set(stopwords.words('english'))
    page_text = get_webpage(url).text

    stem_counts = Counter()
    for raw_token in word_tokenize(page_text):
        word = raw_token.lower()
        # Skip stop words and anything that is not a plain word or number.
        if word in ignored_words or not word.isalnum():
            continue
        stem_counts[stemmer.stem(word)] += 1
    return stem_counts
    
def extend_dictionary(*terms, top_result_count=20):
    """Count the stemmed words found on the top search results for *terms*.

    Every term is searched through `get_many_search_results`; each resulting
    URL is then handed to `get_wordcounts` in a pool of worker processes and
    the per-page counts are summed.

    Args:
        *terms: Seed terms to search for.
        top_result_count: Number of search results fetched per term.

    Returns:
        A Counter with the combined word counts of all fetched pages.
    """
    urls = get_many_search_results(*terms, top_result_count=top_result_count)
    combined_counts = Counter()
    with Pool() as workers:
        # Pages are merged in whatever order the workers finish them.
        for page_counts in workers.imap_unordered(get_wordcounts, urls):
            combined_counts.update(page_counts)
    return combined_counts

In [None]:
# Example run: gather word counts from the top 10 search results for the seed
# term 'vegan' and list every stem with its count, most frequent first.
extend_dictionary('vegan', top_result_count=10).most_common()