# Sample WordCloud project with nltk dataset

Download the nltk corpora if you need to get the dataset

In [1]:
# import nltk
# nltk.download()

In [2]:
from nltk.corpus import abc, stopwords
from collections import Counter
import re

### Limit the data to 300 most common words

And remove 'stopwords'

In [3]:
# Bind `stopwords` to the English stop-word collection used by get_300_words.
# NOTE(review): this rebinding shadows the `stopwords` object imported from
# nltk.corpus, so re-running this cell presumably fails on `.words` — confirm.
stopwords = stopwords.words('english')

def get_300_words(filename, number_of_words=300):
    """Return the most common words of an ``abc`` corpus file with counts.

    Tokens are lower-cased before counting.  A token is kept only if it
    contains at least one word character, contains no digit, and its
    lower-cased form is not in the module-level ``stopwords`` collection.

    Parameters
    ----------
    filename : str
        File id passed to ``abc.words``, e.g. ``'science.txt'``.
    number_of_words : int, optional
        How many of the most frequent words to return (default 300).

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs as produced by ``Counter.most_common``.
    """
    # Raw strings avoid the invalid-escape warning that "\w" / "\d" trigger.
    has_word_char = re.compile(r"\w").search
    has_digit = re.compile(r"\d").search
    # Build a set once so every membership test is O(1) instead of a list scan.
    stop_set = set(stopwords)

    counts = Counter()
    for word in abc.words(filename):
        if not has_word_char(word) or has_digit(word):
            continue
        lowered = word.lower()
        if lowered not in stop_set:
            counts[lowered] += 1
    return counts.most_common(number_of_words)

# Turn the (word, count) pairs into {word: count} lookup tables.
science_words = dict(get_300_words('science.txt'))
rural_words = dict(get_300_words('rural.txt'))

### Remove the 'title' words from the dictionaries, since we will add them back later with a special (extra-large) weight

In [4]:
# Drop each corpus' title word from its frequency table; the None default
# means a missing key is ignored instead of raising KeyError.
rural_words.pop('rural', None)
science_words.pop('science', None)

297

### Set the color scheme for the project

In [5]:
# Color handed to GroupedColorFunc as its default_color.
default_color = 'red'

# Five hues that share one saturation (80%) and one lightness (50%).
colors = [
    'hsl({}, 80%, 50%)'.format(hue)
    for hue in (203, 253, 103, 153, 53)
]

### Use the example class provided in the docs to organize colors by group

https://amueller.github.io/word_cloud/auto_examples/colored_by_group.html

In [6]:
from wordcloud import get_single_color_func

In [7]:
class GroupedColorFunc:
    """Color function that shades each word according to its color group.

    Every color in the mapping is turned into a color function with
    ``wordcloud.get_single_color_func``; a word is colored by the function
    of the first group that contains it.

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
        Maps a color to the words that should use it.

    default_color : str
        Color for any word that appears in none of the groups.
    """

    def __init__(self, color_to_words, default_color):
        pairs = []
        for color, words in color_to_words.items():
            # Store the words as a set for fast membership tests.
            pairs.append((get_single_color_func(color), set(words)))
        self.color_func_to_words = pairs

        self.default_color_func = get_single_color_func(default_color)

    def get_color_func(self, word):
        """Return the color function of the first group containing ``word``."""
        for color_func, words in self.color_func_to_words:
            if word in words:
                return color_func
        # No group claims the word: fall back to the default color function.
        return self.default_color_func

    def __call__(self, word, **kwargs):
        chosen = self.get_color_func(word)
        return chosen(word, **kwargs)


### Break the words into 'color groups' evenly

Based on the number of colors used and the number of words (default 300 - 1)

In [8]:
from random import shuffle
from math import ceil

In [9]:
def get_color_groups(words, colors):
    """Randomly split the words into one group per color, as evenly as possible.

    Parameters
    ----------
    words : dict
        Mapping whose keys are the words to distribute; it is not modified.
    colors : list of str
        One group is created per color, in this order.

    Returns
    -------
    dict(str -> list(str))
        Maps every color to its list of words.  Group sizes differ by at
        most one.  An empty ``colors`` yields an empty dict.
    """
    words_list = list(words.keys())
    shuffle(words_list)

    # Deal the shuffled words out round-robin.  Cutting contiguous chunks of
    # ceil(len/n) words can leave the last colors short or empty (6 words over
    # 4 colors gave 2/2/2/0) and divides by zero when there are no colors.
    step = len(colors)
    return {color: words_list[i::step] for i, color in enumerate(colors)}

# Build the color groups for both corpora (rural first, then science).
rural_color_groups, science_color_groups = (
    get_color_groups(word_counts, colors)
    for word_counts in (rural_words, science_words)
)

In [10]:
# Show how many rural words were assigned to each color
for color, group in rural_color_groups.items():
    print(color, len(group))

hsl(203, 80%, 50%) 60
hsl(253, 80%, 50%) 60
hsl(103, 80%, 50%) 60
hsl(153, 80%, 50%) 60
hsl(53, 80%, 50%) 59


In [11]:
# Show how many science words were assigned to each color
for color, group in science_color_groups.items():
    print(color, len(group))

hsl(203, 80%, 50%) 60
hsl(253, 80%, 50%) 60
hsl(103, 80%, 50%) 60
hsl(153, 80%, 50%) 60
hsl(53, 80%, 50%) 59


### Prime time! Create the WordCloud and save the images

In [12]:
from wordcloud import WordCloud

In [13]:
def create_wordcloud(name, color_groups, words):
    """Render a word cloud for ``words`` and save it under ``./imgs/``.

    The title word ``name`` is added with twice the weight of the most
    frequent other word.  The first entry of the resulting layout is printed.

    Parameters
    ----------
    name : str
        Title word; also used to build the output path
        ``./imgs/<name>_blk.png``.
    color_groups : dict(str -> list(str))
        Color-to-words mapping handed to ``GroupedColorFunc``.
    words : dict(str -> number)
        Word frequencies.  The mapping is NOT modified.

    Raises
    ------
    ValueError
        If ``words`` contains no word other than ``name``.
    """
    # Create a color function with multiple tones
    color_func = GroupedColorFunc(color_groups, default_color)

    # Work on a copy and ignore any existing entry for the title: writing the
    # title into the caller's dict made its weight double again on every
    # repeated call with the same dict.
    frequencies = {
        word: count for word, count in words.items() if word != name
    }
    frequencies[name] = 2 * max(frequencies.values())

    wordcloud = WordCloud(
        width=6 * 300,
        height=9 * 300,
        color_func=color_func,
    ).generate_from_frequencies(frequencies)
    wordcloud.to_file("./imgs/" + name + "_blk.png")

    print(wordcloud.layout_[0])
    
# Render one image per corpus, then report completion.
for title, groups, word_counts in (
        ('rural', rural_color_groups, rural_words),
        ('science', science_color_groups, science_words)):
    create_wordcloud(title, groups, word_counts)

print('Done!')

(('rural', 1.0), 513, (1095, 134), None, 'rgb(211, 0, 0)')
(('science', 1.0), 365, (1292, 8), None, 'rgb(105, 0, 0)')
Done!


### Oops! The 'default' red color is being modified by the get_single_color_func() random state

Wouldn't it be nice for the title word to be a nice consistent bright red color?

In [14]:
class GroupedColorFunc(object):
    """Color function that shades grouped words but keeps the default fixed.

    Modified version of the class from the WordCloud documentation: the
    default color function set in ``__init__`` returns ``default_color``
    unchanged instead of going through ``get_single_color_func``.

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
        A dictionary that maps a color to the list of words.

    default_color : str
        Color returned verbatim for a word that is not a member of any
        value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        self.color_func_to_words = [
            (get_single_color_func(color), set(words))
            for (color, words) in color_to_words.items()]

        # Return the default color as-is so it is always the same; honor the
        # ``default_color`` argument rather than a hard-coded "red".
        self.default_color_func = lambda *args, **kwargs: default_color

    def get_color_func(self, word):
        """Returns a single_color_func associated with the word"""
        matches = (
            color_func for (color_func, words) in self.color_func_to_words
            if word in words)
        # Fall back to the fixed default when no group contains the word.
        return next(matches, self.default_color_func)

    def __call__(self, word, **kwargs):
        return self.get_color_func(word)(word, **kwargs)


### Round 2! Create the WordCloud and save the images

In [15]:
def create_wordcloud(name, color_groups, words):
    """Render a word cloud for ``words`` and save it under ``./imgs/``.

    The title word ``name`` is added with twice the weight of the most
    frequent other word.  The first entry of the resulting layout is printed.

    Parameters
    ----------
    name : str
        Title word; also used to build the output path
        ``./imgs/<name>_blk.png``.
    color_groups : dict(str -> list(str))
        Color-to-words mapping handed to ``GroupedColorFunc``.
    words : dict(str -> number)
        Word frequencies.  The mapping is NOT modified.

    Raises
    ------
    ValueError
        If ``words`` contains no word other than ``name``.
    """
    # Create a color function with multiple tones
    color_func = GroupedColorFunc(color_groups, default_color)

    # Work on a copy and ignore any existing entry for the title: writing the
    # title into the caller's dict made its weight double again on every
    # repeated call with the same dict.
    frequencies = {
        word: count for word, count in words.items() if word != name
    }
    frequencies[name] = 2 * max(frequencies.values())

    wordcloud = WordCloud(
        width=6 * 300,
        height=9 * 300,
        color_func=color_func,
    ).generate_from_frequencies(frequencies)
    wordcloud.to_file("./imgs/" + name + "_blk.png")

    print(wordcloud.layout_[0])
    
# Render one image per corpus, then report completion.
for title, groups, word_counts in (
        ('rural', rural_color_groups, rural_words),
        ('science', science_color_groups, science_words)):
    create_wordcloud(title, groups, word_counts)

print('Done!')

(('rural', 1.0), 460, (1506, 271), None, 'red')
(('science', 1.0), 328, (1627, 108), None, 'red')
Done!


### That's better! Nice and bright!