In [34]:
###
#
# MiniProject1: Compare "uniqueness" of vocabulary of multiple authors, using entropic measures
#
###
import re
from collections import Counter

import nltk
import requests

# Plain-text sources to download, grouped under a short author label.
# Every URL listed for a label is fetched and counted towards that author.
urls = {
    # hosted on gutenberg.org
    "lovecraft": [
        "https://www.gutenberg.org/cache/epub/50133/pg50133.txt",
        "https://www.gutenberg.org/cache/epub/68283/pg68283.txt",
    ],
    # hosted on archive.org (djvu text dumps)
    "land": [
        "https://archive.org/stream/1993landspiritandteeth/Nick%20Land%20-%20Papers/%281988%29%20LAND%20--%20Kant%2C%20Capital%2C%20and%20the%20Prohibition%20of%20Incest_djvu.txt",
        "https://archive.org/stream/1993landspiritandteeth/Nick%20Land%20-%20Papers/%281991%29%20LAND%20--%20Art%20as%20Insurrection_djvu.txt",
    ],
    # hosted on gutenberg.org
    "dostoevsky": [
        "https://www.gutenberg.org/files/2554/2554-0.txt",
    ],
    # hosted on gutenberg.org
    "bible": [
        "https://www.gutenberg.org/cache/epub/10/pg10.txt",
    ],
}

# Word-frequency tables, keyed by author label. Each value is a list of
# (word, count) pairs ordered from most to least frequent; words with equal
# counts keep the order in which they were first encountered.
wfs = {}

for author, works in urls.items():
    # Tally words over every work of this author. Counting work by work
    # (rather than gluing the raw texts together first) keeps the last word
    # of one text from fusing with the first word of the next.
    wf = Counter()
    for work in works:
        # Bound the wait and fail loudly on HTTP errors, so that a hung
        # server or an error page is never silently counted as prose.
        resp = requests.get(work, timeout=60)
        resp.raise_for_status()

        # str.split() with no arguments splits on runs of whitespace and
        # discards the empty strings that leading/trailing whitespace would
        # otherwise yield. Tokens stay case-sensitive and keep any
        # punctuation attached to them.
        wf.update(resp.text.split())

    # sort so that most used words come first
    wfs[author] = wf.most_common()
    
    
# Score the "uniqueness" of each author's vocabulary; results land in
# `uniqscores` as {author: score}.
uniqscores = {}
for author, ranked in wfs.items():
    # `ranked` is the author's (word, count) list, most frequent word first.
    vocab_size = len(ranked)
    total = sum(count for _, count in ranked)

    # Pseudo-Zipf coefficient; a higher value suggests a more unique
    # vocabulary. Each word contributes its share of all occurrences,
    # weighted by a linear ramp over its position in the ranking:
    #
    #   weight 0 for the most frequent word, rising towards 1 for the rarest
    #
    # so text dominated by a few very common words scores low.
    score = 0.0
    for rank, (_, count) in enumerate(ranked):
        share = count / total
        weight = rank / vocab_size
        score += share * weight
        # Alternative weighting (disabled): a parabolic window that favours
        # the middle of the ranking rather than its rare tail:
        #   weight = 1 - (2 * (rank / vocab_size) - 1) ** 2

    # Scale by 100 for readability of the printed table.
    uniqscores[author] = 100 * score
    
# Report the authors as a small table, highest score first.
print("idx  score author")
ranking = sorted(uniqscores.items(), key=lambda pair: pair[1], reverse=True)
for position, (author, score) in enumerate(ranking):
    print(f"{position:3d}  {score:5.2f} {author}")

# output:
"""

idx  score author
  0  17.81 land
  1  15.28 lovecraft
  2   8.16 dostoevsky
  3   4.76 bible

"""
    
    
    

idx  score author
  0  17.81 land
  1  15.28 lovecraft
  2   8.16 dostoevsky
  3   4.76 bible
