<img src="gothic.png" />

In [None]:
import csv
import json
import matplotlib.pyplot as plt
import numpy as np
import string
from collections import Counter, defaultdict, OrderedDict
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Corpus file name (tokenize_text resolves it relative to ./corpora/) and the
# display title used in the printed summary and the plot title.
title_file = 'ShelleyMary_Frankenstein_Gutenberg.txt'
title = "Frankenstein"

In [None]:
def tokenize_text(filename):
    """
    Tokenize and POS-tag a text stored in the ./corpora/ directory.

    The file is read line by line.  Each line is lowercased, split with
    word_tokenize and tagged with pos_tag.  After tagging, tokens whose word
    is an English stopword, a single punctuation character, or one of the
    quote tokens '' and `` are dropped.

    Args:
        filename: name of the file inside ./corpora/.

    Returns:
        A list of (word, tag) tuples for the whole text, in reading order.
    """
    file_path = "./corpora/" + filename

    # A set gives constant-time membership tests in the per-token filter.
    stop_punc = set(stopwords.words('english'))
    stop_punc.update(string.punctuation)
    stop_punc.update(("''", '``'))

    text_tokens = []
    # The context manager closes the file even if tokenizing or tagging raises.
    with open(file_path, 'r') as text:
        for row in text:
            # Tagging happens before filtering, so the tagger is given every
            # token of the line, stopwords and punctuation included.
            tagged = pos_tag(word_tokenize(row.lower()))
            text_tokens.extend(
                token for token in tagged if token[0] not in stop_punc
            )

    return text_tokens

In [None]:
# Tokenize and POS-tag the whole text once; later cells reuse this list.
tokenized = tokenize_text(title_file)

In [None]:
# Spot-check a slice of up to 20 tagged tokens from the text.
print(tokenized[300:320])

In [None]:
def id_color_words():
    """
    Load single-word color names from ./color_names.csv.

    The first row of the file is skipped as a header.  Every other row is
    read as name, year, family, hex value from its first four columns.
    Names that contain a space are ignored.

    Returns:
        A tuple (color_word_dict, modern_color_words).  color_word_dict maps
        each lowercased name whose year is before 1914 to a
        (hex value, color family) tuple, both lowercased.
        modern_color_words is a list of (year, name) tuples for the
        remaining single-word names.
    """
    color_word_dict = {}
    modern_color_words = []

    # newline='' is the mode the csv module asks for, and the with-block
    # guarantees the file handle is closed once the rows are consumed.
    with open('./color_names.csv', newline='') as csv_file:
        color_data = csv.reader(csv_file, delimiter=",", quotechar='"')
        next(color_data, None)  # skip the header row, if there is one

        for row in color_data:
            name = row[0].lower()
            year = int(row[1])
            if ' ' in name:
                continue
            if year < 1914:
                family = row[2].lower()
                hex_value = row[3].lower()
                color_word_dict[name] = (hex_value, family)
            else:
                modern_color_words.append((year, name))

    return color_word_dict, modern_color_words

In [None]:
# Load both color tables once; later cells reuse these two globals.
color_dict, modern_color_words = id_color_words()

# Take up to ten names from the dictionary's key order to show as a sample.
sample_colors = list(color_dict)[230:240]

print("There are %d color words in our dictionary that were in use before 1914, per the OED." % len(color_dict))
print()
print("Here is a sample - first value is the HTML color, the second is the color family.")
for s in sample_colors:
    print(s, ":", color_dict[s])

In [None]:
def word_order(t, color_dict):
    """
    Print adjacent pairs of color words that pair an adjective with a noun.

    Walks every pair of neighbouring tokens in t.  A pair is printed with
    the label "JJ, NN" when the first token is a color word whose tag
    contains 'JJ' and the second is a color word whose tag contains 'NN',
    and with the label "NN, JJ" for the opposite order.

    Args:
        t: sequence of (word, tag) tuples.
        color_dict: mapping whose keys are the known color words.
    """
    colors = color_dict.keys()

    def is_color(token, tag_part):
        # True when the tag contains tag_part and the word is a color name.
        return tag_part in token[1] and token[0] in colors

    for first, second in zip(t, t[1:]):
        if is_color(first, 'JJ'):
            if is_color(second, 'NN'):
                print("JJ, NN", first, second)
        elif is_color(second, 'JJ'):
            if is_color(first, 'NN'):
                print("NN, JJ", first, second)
# Report adjacent adjective/noun color-word pairs found in the tokenized text.
word_order(tokenized, color_dict)

In [None]:
print("Here are the post-1914 words and the year of first usage according to the Oxford English Dictionary.")
print("The 20th century ushered in food-related color words including avocado, citron, cocoa, mustard, pimento & toffee.")
print()
# Entries are (year, name) tuples, so sorting on x[0] lists them chronologically.
print(sorted(modern_color_words, key=lambda x: x[0]))

In [None]:
def color_words(c_dict, text):
    """
    Select the tagged tokens that are color words used as a noun or adjective.

    Args:
        c_dict: mapping whose keys are the known color words.
        text: iterable of (word, tag) tuples.

    Returns:
        A list of the tokens from text, in their original order, whose word
        is a key of c_dict and whose tag is exactly 'NN' or 'JJ'.
    """
    tags = ('NN', 'JJ')
    # Test membership against the mapping itself: a hash lookup per token
    # instead of a linear scan over a list of every color name.
    return [w for w in text if w[0] in c_dict and w[1] in tags]

In [None]:
# Keep only the tagged tokens that are known color words tagged 'NN' or 'JJ'.
color_in_text = color_words(color_dict, tokenized)

In [None]:
# Preview the first 15 color-word tokens.
print(color_in_text[:15])

In [None]:
# Frequency of each color word, split by the part of speech it was tagged with.
nouns = Counter(token[0] for token in color_in_text if token[1] == 'NN')
adjectives = Counter(token[0] for token in color_in_text if token[1] == 'JJ')

In [None]:
# Show the two frequency tables, nouns first and then adjectives.
print("*** Nouns ***")
print(nouns)
print("*** Adjectives ***")
print(adjectives)

In [None]:
# Color words as a percentage of the filtered tokens (stopwords and
# punctuation were already removed by tokenize_text), rounded to two decimals.
pct_color_words = round((len(color_in_text)/len(tokenized))*100, 2)
print("%s is %s percent color words."% (title, pct_color_words))

In [None]:
def group_to_plot(color_dict, c_i_t):
    """
    Count color-word occurrences per hex value.

    Each token's word is looked up in color_dict and the first element of
    the stored tuple (the hex value) is used as the counting key.

    Args:
        color_dict: mapping from color word to a tuple whose first element
            is the hex value.
        c_i_t: iterable of tokens whose first element is a color word.

    Returns:
        Two dicts mapping hex value to count: the first ordered by ascending
        count, the second by descending count.  Hex values with equal counts
        keep their first-seen order in both.
    """
    def by_count(item):
        return item[1]

    tally = Counter(color_dict[token[0]][0] for token in c_i_t)
    ascending = dict(sorted(tally.items(), key=by_count))
    descending = dict(sorted(tally.items(), key=by_count, reverse=True))
    return ascending, descending

In [None]:
# Tally hex colors across the text and print the most frequent ones first.
html_color_counts_asc, html_color_counts_dsc  = group_to_plot(color_dict, color_in_text)
print (json.dumps(html_color_counts_dsc, indent=1))

In [None]:
# Horizontal bar chart of references per hex color, each bar drawn in its
# own color.  The ascending table is used so that, with bars stacked upward
# from y = 0, the most frequent color ends up as the top bar.
colors = list(html_color_counts_asc.keys())
counts = list(html_color_counts_asc.values())
y_pos = np.arange(len(counts))

plt.barh(y_pos, counts, align='center', alpha=0.9, color=colors)
plt.yticks(y_pos, colors)
plt.xlabel('# references')
plt.title('HTML colors from ' + title)

plt.show()

In [None]:
# Hex color for every color word, in the order the words appear in the text.
color_labels = [token[0] for token in color_in_text]
html_color_list = [color_dict[label][0] for label in color_labels]

# NOTE(review): cmap and bounds are not used by the plotting calls in this
# cell; presumably placeholders for a later colormap. Confirm before removing.
cmap = html_color_list
bounds = [0, 10, 20]

# Empty axes with a red major grid and ticks at 0..24 on both axes.
fig, ax = plt.subplots()
ax.grid(which='major', linestyle='-', color='red', linewidth=2)
ax.set_xticks(np.arange(25))
ax.set_yticks(np.arange(25))

plt.show()