# excerpt metadata functions

In [None]:
# Space for Imports

In [1]:
# Read the sample dataset and the full dataset from their parquet directories.
# NOTE(review): `spark` is not created in the visible cells; presumably the
# kernel provides a SparkSession under that name -- confirm.
sample_data = spark.read.parquet("data/sample_data.parquet/")
data = spark.read.parquet("data/data.parquet/")
# printSchema() writes the schema to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" to the cell output.
data.printSchema()

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)

None


In [6]:
# Narrow the full dataset to the columns used by the metadata helpers and
# persist the result, since several later cells pull rows from `df`.
df = (
    data
    .select("author", "title", "excerpt_number", "words")
    .persist()
)
df.show(n=5)

+--------------+---------------+--------------+--------------------+
|        author|          title|excerpt_number|               words|
+--------------+---------------+--------------+--------------------+
|CharlesDickens|AChristmasCarol|             0|[a, christmas, ca...|
|CharlesDickens|AChristmasCarol|             1|[mind, !, i, do, ...|
|CharlesDickens|AChristmasCarol|             2|[scrooge, never, ...|
|CharlesDickens|AChristmasCarol|             3|[nobody, ever, st...|
|CharlesDickens|AChristmasCarol|             4|[the, door, of, s...|
+--------------+---------------+--------------+--------------------+
only showing top 5 rows



In [31]:
def not_word(words):
    """Return the distinct single-character tokens that are not ASCII letters.

    Args:
        words: iterable of string tokens.

    Returns:
        A set containing every token of length exactly one whose character is
        not in a-z / A-Z (for example ".", "," or "_"). Longer tokens such as
        "||" and empty strings are never included.
    """
    letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    # Test the length first: indexing word[0] on an empty token would raise
    # IndexError, and for a one-character token `word` equals `word[0]`.
    return {word for word in words if len(word) == 1 and word not in letters}

In [40]:
# Grab the token list ("words" column) of the last of the first eight rows
# returned by take(), then print it followed by the set of single-character
# non-letter tokens that not_word() finds in it.
# NOTE(review): no ordering is applied to `df` in the visible cells, so which
# row comes back at index 7 presumably depends on how Spark reads the data.
temp = df.take(8)[7]["words"]
print(temp)
print(not_word(temp))

['the', 'clerk', 'in', 'the', 'tank', 'involuntarily', 'applauded', '.', 'becoming', 'immediately', 'sensible', 'of', 'the', 'impropriety', ',', 'he', 'poked', 'the', 'fire', ',', 'and', 'extinguished', 'the', 'last', 'frail', 'spark', 'for', 'ever', '.', '||', '"', 'let', 'me', 'hear', 'another', 'sound', 'from', '_', 'you', '_', ',', '"', 'said', 'scrooge', ',', '"', 'and', 'you', "'ll", 'keep', 'your', 'christmas', 'by', 'losing', 'your', 'situation', '!', 'you', "'re", 'quite', 'a', 'powerful', 'speaker', ',', 'sir', ',', '"', 'he', 'added', ',', 'turning', 'to', 'his', 'nephew', '.', '"', 'i', 'wonder', 'you', 'do', "n't", 'go', 'into', 'parliament', '.', '"', '||', '"', 'do', "n't", 'be', 'angry', ',', 'uncle', '.', 'come', '!', 'dine', 'with', 'us', 'to', '-', 'morrow', '.', '"', '||', 'scrooge', 'said', 'that', 'he', 'would', 'see', 'him----yes', ',', 'indeed', 'he', 'did', '.', 'he', 'went', 'the', 'whole', 'length', 'of', 'the', 'expression', ',', 'and', 'said', 'that', 'he',

In [59]:
def words_found(words):
    """Return the set of distinct word-like tokens in `words`.

    A token is treated as a word when its first character is an ASCII letter,
    or when it has at least two characters and the second one is an ASCII
    letter. The second rule keeps contraction pieces such as "'ll" or "'re".

    Args:
        words: iterable of string tokens.

    Returns:
        A set of the qualifying tokens. Tokens are kept as-is; no case folding
        is applied.
    """
    letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    # `w and ...` skips empty tokens, which would otherwise raise IndexError
    # when w[0] is evaluated.
    return {
        w for w in words
        if w and (w[0] in letters or (len(w) > 1 and w[1] in letters))
    }

def unique_words(words):
    """Return the number of distinct word-like tokens in `words`.

    This is the size of the set produced by words_found(words).
    """
    distinct = words_found(words)
    return len(distinct)

def word_count(words):
    """Count the word-like tokens in `words`, including repeats.

    A token counts as a word when its first character is an ASCII letter, or
    when it has at least two characters and the second one is an ASCII letter.

    Known flaws:
        * Possessives (e.g. "Bob's") are counted as two words, because the
          "'s" piece is a separate token whose second character is a letter.
        * Hyphenated phrases (e.g. "he--wait!") are counted as one word when
          they arrive as a single token.

    Args:
        words: iterable of string tokens.

    Returns:
        The number of qualifying tokens as an int.
    """
    letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    # `w and ...` skips empty tokens, which would otherwise raise IndexError
    # when w[0] is evaluated.
    return sum(
        1 for w in words
        if w and (w[0] in letters or (len(w) > 1 and w[1] in letters))
    )

def sent_count(words):
    """Count sentence terminators in `words`.

    Adds up how many times each of the tokens ".", "!" and "?" occurs. Only
    exact matches are counted, so a token such as "mr." contributes nothing.
    """
    terminators = (".", "!", "?")
    total = 0
    for mark in terminators:
        total += words.count(mark)
    return total

def para_count(words):
    """Return the paragraph count: one more than the number of '||' tokens."""
    breaks = words.count('||')
    return 1 + breaks


def metadata(words):
    """Print a short report about one token list.

    Prints the set of distinct word-like tokens first, then one line each for
    the unique-word count, word count, sentence count and paragraph count.
    Nothing is returned.
    """
    print(words_found(words), "\n")
    # (label, statistic) pairs, printed in this order.
    report = (
        ("unique wrd: ", unique_words),
        ('word count: ', word_count),
        ('sent count: ', sent_count),
        ('para count: ', para_count),
    )
    for label, stat in report:
        print(label, stat(words))


In [60]:
# Print the metadata report for the sample token list held in `temp`.
metadata(temp)

{'we', 'morrow', 'for', 'sound', 'want', 'as', 'you', 'sir', 'why', 'fire', 'friends', 'by', 'get', 'a', 'quite', 'merry', 'did', "'ll", 'never', 'world', 'be', 'reason', 'now', 'frail', 'that', "'s", 'scrooge', 'last', 'to', 'nothing', 'not', 'married', 'immediately', 'becoming', 'one', 'losing', 'turning', 'more', 'speaker', 'me', 'tank', 'indeed', 'growled', 'give', 'love', 'uncle', 'wonder', "n't", "'re", 'spark', 'than', 'can', 'said', 'added', 'before', 'he', 'i', 'poked', 'happened', 'parliament', 'with', 'but', 'keep', 'because', 'fell', 'the', 'another', 'it', 'extinguished', 'nephew', 'went', 'first', 'powerful', 'ridiculous', 'afternoon', 'ever', 'and', 'only', 'clerk', 'let', 'came', 'in', 'see', 'if', 'impropriety', 'his', 'situation', 'applauded', 'us', 'from', 'into', 'your', 'length', 'thing', 'dine', 'sensible', 'him----yes', 'of', 'were', 'expression', 'whole', 'him', 'do', 'hear', 'ask', 'go', 'extremity', 'coming', 'come', 'would', 'angry', 'good', 'involuntarily', 

In [61]:
# Repeat the report on a second excerpt: the token list of the row at index
# 999 of the first 1000 rows returned by take().
# NOTE(review): take(1000) builds a list of 1000 rows although only the last
# one is used; consider a cheaper way to pick a single row.
temp2 = df.take(1000)[999]["words"]
metadata(temp2)

{'out', 'morrow', 'how', 'for', 'little', 'jobbing', 'they', 'could', 'about', 'business', 'want', 'then', 'stark', 'so', 'rightly', 'advance', 'as', 'old', 'used', 'england', 'after', 'back', 'degree', 'you', 'parish', 'sir', 'cogitation', 'been', 'where', 'by', 'get', 'yourself', 'expect', 'pleaded', 'description', 'a', 'eyes', 'tellson', 'floppin’', 'wives', 'did', 'patron', 'great', 'respectable', 'secret', 'never', 'side', 'ruinating', 'mr.', 'doctors’', 'idea', 'their', 'even', 'n’t', 'might', 'ud', 'be', 'o’', 'sarse', 'till', 'account', 'that', 'what', 'my', 'to', 'not', 'quarter', 'watchmen', '’m', 'own', 'there', 'doctors', 'upon', 'one', 'took', 'wot', 'picking', 'hour', 'all', 'imposing', 'more', 'gentleman', 'shoulders', 'me', 'lorry', 'honour', 'way', 'which', 'prosper', 'nor', 'cruncher', 'wos', 'hope', 'well', 'man', 'sides', 'messenger', 'mind', 'flop', 'tradesman', 'imposed', 'at', 'gander', 'em', 'catch', '’s', 'away', 'up', 'medical', 'carriages', 'patients', 'can',