In [2]:
import string
import nltk
import csv
from collections import Counter
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
def tokenize_text(filename):
    """
    This function generates a list of tokens with punctuation
    stopwords, and spaces removed for the whole text.
    """
    text_tokens = []

    file_path = "./corpora/" + filename
    stop = list(set(stopwords.words('english')))
    punc = [p for p in string.punctuation]
    stop_punc = stop + punc + ["''", '``']

    text = open(file_path, 'r')

    for row in text:
        tokens = pos_tag(word_tokenize(row.lower()))
        if len(tokens) is not 0:
            # puts everything in lowercase, removes punctuation and stopwords
            tokens = [token for token in tokens if token[0] not in stop_punc]
            # adds row tokens to master list
            text_tokens.extend(tokens)

    return text_tokens

In [4]:
tokenized = tokenize_text('ShelleyMary_Frankenstein_Gutenberg.txt')

In [5]:
print(tokenized[308:321])

[('visions', 'NNS'), ('faded', 'VBD'), ('perused', 'VBD'), ('first', 'JJ'), ('time', 'NN'), ('poets', 'NNS'), ('whose', 'WP$'), ('effusions', 'NNS'), ('entranced', 'VBD'), ('soul', 'NN'), ('lifted', 'VBD'), ('heaven', 'VB'), ('also', 'RB')]


In [6]:
def get_color_words():
    """
    Gets color words from the csv file and puts them into a dict where key = word
    and value = hex value.
    """
    color_word_dict = {}
    modern_color_words = []
    color_data = csv.reader(open('./color_names.csv'), delimiter=",", quotechar='"')
    next(color_data, None)

    for row in color_data:
        name = row[0].lower()
        year = int(row[1])
        if ' ' not in name:
            if year < 1914:
                family = row[2].lower()
                hex_value = row[3].lower()
                color_word_dict[name] = (hex_value, family)
            else:
                modern_color_words.append((year, name))
                   
    return color_word_dict, modern_color_words

In [7]:
color_dict, modern_color_words = get_color_words()
print("There are %d color words in our dictionary that were in use before 1914, per the OED." % len(color_dict))
print()
sample_colors = list(color_dict.keys())[400:410]
print("Here is a sample - first value is the HTML color, the second is the color family.")
for s in sample_colors:
    print(s, ":", color_dict[s])

There are 980 color words in our dictionary that were in use before 1914, per the OED.

Here is a sample - first value is the HTML color, the second is the color family.
greensome : ('green', 'greenish')
greenery : ('green', 'greenish')
smaragdine : ('mediumseagreen', 'bright green')
emerald-green : ('mediumseagreen', 'bright green')
emeraldine : ('mediumseagreen', 'bright green')
lime-green : ('mediumseagreen', 'bright green')
twig-green : ('mediumseagreen', 'bright green')
steel-green : ('darkgreen', 'dark green')
bottle-green : ('darkgreen', 'dark green')
corbeau : ('darkgreen', 'dark green')


In [8]:
print("Here are the post-1914 words and the year of first usage according to the Oxford English Dictionary.")
print("The 20th century ushered in food-related color words including avocado, citron, cocoa, mustard, pimento & toffee.")
print()
print(sorted(modern_color_words, key=lambda x: x[0]))

Here are the post-1914 words and the year of first usage according to the Oxford English Dictionary.
The 20th century ushered in food-related color words including avocado, citron, cocoa, mustard, pimento & toffee.

[(1918, 'oxblood'), (1918, 'jacobean'), (1918, 'shadow-grey'), (1919, 'mustard'), (1921, 'tony'), (1921, 'pimento'), (1922, 'nude'), (1922, 'straw-pale'), (1923, 'alizarin'), (1923, 'sahara'), (1923, 'blue-brilliant'), (1923, 'sahara'), (1924, 'citron'), (1924, 'hennaed'), (1927, 'greige'), (1928, 'mimosa'), (1931, 'off-white'), (1931, 'magnolia-pink'), (1934, 'safari'), (1935, 'salmony'), (1937, 'seaweed-green'), (1938, 'guinea-gold'), (1941, 'xanthophyllic'), (1941, 'prune-dark'), (1942, 'cocoa'), (1946, 'whale-blue'), (1947, 'avocado'), (1949, 'saffronic'), (1950, 'wine'), (1951, 'bone-coloured'), (1953, 'bible-black'), (1953, 'pinko-grey'), (1961, 'toffee-brown'), (1962, 'toffee'), (1962, 'peppery'), (1963, 'pimiento'), (1963, 'magnolia'), (1965, 'lily-green'), (1965, '

In [17]:
def find_color_words(text):
    wnl = WordNetLemmatizer()
    color_words = []
    color_names = list(color_dict.keys())
    colors = [wnl.lemmatize(c) for c in color_names]
    tags = {'NN': 'n', 
            'JJ': 'a'}
    
    
    return color_words

In [18]:
color_words = find_color_words(tokenized)
print(color_words)

['white', 'blank', 'blanch', 'paper-white', 'candid', 'whitish', 'whitelike', 'albid', 'incandent', 'all-white', 'snow-white', 'snowish', 'snowy', 'nixious', 'snow-like', 'snow', 'milk-white', 'milkish', 'milky', 'milk', 'milken', 'lacteal', 'lacteous', 'lactaceous', 'lactean', 'lactescent', 'bone', 'ivory', 'ivory-white', 'eburnean', 'eburnine', 'bone-white', 'ivorine', 'ivoried', 'lily-white', 'lily', 'lily-whited', 'lily-wristed', 'lily-like', 'lily-coloured', 'chalk-white', 'chalky', 'chalkish', 'ermine', 'ermined', 'marble', 'alabaster', 'alabastrine', 'marmorean', 'marbly', 'marmoreal', 'pearly', 'swan-white', 'ice-white', 'lint-white', 'wool-white', 'wax-white', 'cottonary', 'cottonous', 'cottony', 'sheep-hued', 'ecru', 'natural-coloured', 'oysterous', 'oyster', 'silver', 'silverish', 'argentine', 'argent', 'silver-coloured', 'silver-white', 'silver-like', 'silvery', 'argenteous', 'silvern', 'flatten', 'fleeten', 'blue-white', 'blush-white', 'cream-coloured', 'creamy', 'cream', 

In [19]:
wnl = WordNetLemmatizer()

In [26]:
print(wnl.lemmatize('cottonary'))

cottonary


In [None]:
nouns = Counter([n[0] for n in colors_in_text if n[1] == 'NN'])
adjectives = Counter([a[0] for a in colors_in_text if a[1] == 'JJ'])

In [None]:
print("*** Nouns ***")
print(nouns)
print("*** Adjectives ***")
print(adjectives)