## Setup

In [1]:
import nltk
from nltk import FreqDist, sent_tokenize
import re
import pandas as pd

In [2]:
with open('data/prepared_texts/dune.txt') as file:
    dune_text = file.read()
with open('data/prepared_texts/sandworms_clean.txt', encoding="utf8", mode='r') as file:
    worm_text = file.read()

summary_df = pd.DataFrame(index=['Dune', 'Worms'])

## Tokenizing

In [3]:
dune_split = nltk.sent_tokenize(dune_text)
dune_words = [nltk.word_tokenize(sent) for sent in dune_split]

In [4]:
worm_split = nltk.sent_tokenize(worm_text)
worm_words = [nltk.word_tokenize(sent) for sent in worm_split]

In [5]:
summary_df['Num. Sentences'] = [len(dune_words), len(worm_words)]
dune_sent_len = sum([len(i) for i in dune_words])/len(dune_words)
worm_sent_len = sum([len(i) for i in worm_words])/len(worm_words)
summary_df['Avg. Sentence Len'] = [dune_sent_len, worm_sent_len]

In [6]:
summary_df

Unnamed: 0,Num. Sentences,Avg. Sentence Len
Dune,17668,13.420874
Worms,9448,17.989733


## Tagging

In [7]:
dune_tagged = [nltk.pos_tag(tokens) for tokens in dune_words]
print(dune_tagged[:2])

[[('A', 'DT'), ('beginning', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('time', 'NN'), ('for', 'IN'), ('taking', 'VBG'), ('the', 'DT'), ('most', 'RBS'), ('delicate', 'JJ'), ('care', 'NN'), ('that', 'IN'), ('the', 'DT'), ('balances', 'NNS'), ('are', 'VBP'), ('correct', 'JJ'), ('.', '.')], [('This', 'DT'), ('every', 'DT'), ('sister', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Bene', 'NNP'), ('Gesserit', 'NNP'), ('knows', 'VBZ'), ('.', '.')]]


In [8]:
worm_tagged = [nltk.pos_tag(tokens) for tokens in worm_words]
print(worm_tagged[:2])

[[('So', 'RB'), ('many', 'JJ'), ('people', 'NNS'), ('I', 'PRP'), ('knew', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('past', 'NN'), ('are', 'VBP'), ('not', 'RB'), ('yet', 'RB'), ('reborn', 'VBN'), ('.', '.')], [('I', 'PRP'), ('still', 'RB'), ('miss', 'VB'), ('them', 'PRP'), (',', ','), ('even', 'RB'), ('though', 'IN'), ('I', 'PRP'), ('do', 'VBP'), ('not', 'RB'), ('remember', 'VB'), ('them', 'PRP'), ('.', '.')]]


## Parsing - Adjectives

In [9]:
grammar_adjph = "ADJPH: {<RB.?>+<JJ.?>}"
chunk_parser_adj = nltk.RegexpParser(grammar_adjph)

In [37]:
#adjectives
dune_adjective_tokens = []
for sentence in dune_tagged:
    for word, pos in sentence:
        if pos in ['JJ', 'JJR', 'JJS']: 
            if len(word)>1:
                dune_adjective_tokens.append(word)
dune_freq_adjective = nltk.FreqDist(dune_adjective_tokens)

worm_adjective_tokens = []
for sentence in worm_tagged:
    for word, pos in sentence:
        if pos in ['JJ', 'JJR', 'JJS']: 
            if len(word)>1:
                worm_adjective_tokens.append(word)
worm_freq_adjective = nltk.FreqDist(worm_adjective_tokens)
summary_df['Num. Adjectives'] = [len(dune_adjective_tokens), len(worm_adjective_tokens)]

dune_adjs = [word for word, freq in dune_freq_adjective.most_common(50)]
worm_adjs = [word for word, freq in worm_freq_adjective.most_common(50)]
shared_adjs = [word for word in dune_adjs if word in worm_adjs]
print(f'The {len(shared_adjs)} adjectives that occur in both text\'s top 50 adjectives are: ', shared_adjs)
dune_unq_adjs = [word for word in dune_adjs if word not in worm_adjs]
worm_unq_adjs = [word for word in worm_adjs if word not in dune_adjs]
print(f'There are {len(dune_unq_adjs)} unique adjectives for Dune: ', dune_unq_adjs)
print(f'There are {len(worm_unq_adjs)} unique adjectives for Sandworms of Dune: ', worm_unq_adjs)

The 28 adjectives that occur in both text's top 50 adjectives are:  ['own', 'old', 'other', 'more', 'many', 'dead', 'such', 'new', 'little', 'open', 'first', 'good', 'much', 'young', 'deep', 'long', 'few', 'full', 'great', 'small', 'same', 'entire', 'last', 'true', 'possible', 'human', 'enough', 'real']
There are 22 unique adjectives for Dune:  ['black', 'right', 'sure', 'silent', 'left', 'low', 'sudden', 'dark', 'certain', 'terrible', 'high', 'green', 'dry', 'dangerous', 'clear', 'white', 'strange', 'wide', 'subtle', 'yellow', 'obvious', 'big']
There are 22 unique adjectives for Sandworms of Dune:  ['no-ship', 'original', 'large', 'll', 'past', 'Duncan', 'least', 'ready', 'most', 'different', 'able', 'necessary', 'final', 'thinking-machine', 'us.', 'greater', 'whole', 've', 'independent', 'only', 'huge', 'next']


In [11]:
dune_adjph_tags = []
for sent in dune_tagged:
    if len(sent) > 0:
        tree = chunk_parser_adj.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADJPH':
                dune_adjph_tags.append(subtree)     

dune_adjective_phrases = []
for sent in dune_adjph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    dune_adjective_phrases.append(temp)   

In [12]:
worm_adjph_tags = []
for sent in worm_tagged:
    if len(sent) > 0:
        tree = chunk_parser_adj.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADJPH':
                worm_adjph_tags.append(subtree)
                
worm_adjective_phrases = []
for sent in worm_adjph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    worm_adjective_phrases.append(temp)

In [13]:
#Generate Freqeuncy Distributions
dune_freq_adjph = FreqDist(dune_adjective_phrases)
worm_freq_adjph = FreqDist(worm_adjective_phrases)


#Store in DataFrame
adjs_df = pd.DataFrame({'dune_adjph': dune_freq_adjph.most_common(50), \
    'dune_adjs': dune_freq_adjective.most_common(50),\
    'worms_adjph': worm_freq_adjph.most_common(50),\
    'worms_adjs': worm_freq_adjective.most_common(50)})
summary_df['Num. AdjPh'] = [len(dune_adjph_tags), len(worm_adjph_tags)]

In [39]:

#Get set of shared top 50 adjective phrases.

dune_adjph = [word for word, freq in dune_freq_adjph.most_common(50)]
worm_adjph = [word for word, freq in worm_freq_adjph.most_common(50)]
shared_adjph = [word for word in dune_adjph if word in worm_adjph]
print(f'The {len(shared_adjph)} adjective phrases that occur in both text\'s top 50 adjective phrases are: ', shared_adjph)
dune_unq_adjphs = [word for word in dune_adjph if word not in worm_adjph]
worm_unq_adjphs = [word for word in worm_adjph if word not in dune_adjph]
print(f'There are {len(dune_unq_adjphs)} unique adjective phrases for Dune: ', dune_unq_adjphs)
print(f'There are {len(worm_unq_adjphs)} unique adjective phrases for Sandworms of Dune: ', worm_unq_adjphs)

The 11 adjective phrases that occur in both text's top 50 adjective phrases are:  ['so many ', 'too much ', 'so much ', 'So many ', 'so little ', 'too late ', 'little more ', 'not necessary ', 'no longer ', 'more important ', 'very little ']
There are 39 unique adjective phrases for Dune:  ['as much ', 'not sure ', 'most precious ', 'precisely correct ', 'most dangerous ', 'so young ', 'so few ', 'not likely ', 'so dark ', 'So much ', 'so low ', 'Very good ', 'not good ', 'so sure ', 'Too much ', 'Too bad ', 'so slow ', 'as good ', 'so small ', 'more beautiful ', 'not often wrong ', 'most efficient ', 'most interesting ', 'no more ', 'there more ', 'Once more ', 'pretty bad ', 'as bad ', 'so clear ', 'very old ', 'more sensitive ', 'as dangerous ', 'so tired ', 'not worthy ', 'completely loyal ', 'not much ', "n't likely ", 'well aware ', 'too deep ']
There are 39 unique adjective phrases for Sandworms of Dune:  ['much more ', 'too many ', 'far more ', 'even greater ', 'only thirteen '

In [15]:
adjs_df.head()

Unnamed: 0,dune_adjph,dune_adjs,worms_adjph,worms_adjs
0,"(so many , 18)","(own, 245)","(so many , 30)","(own, 220)"
1,"(too much , 18)","(old, 228)","(so much , 19)","(old, 217)"
2,"(so much , 8)","(other, 211)","(no longer , 9)","(other, 199)"
3,"(as much , 8)","(more, 143)","(too much , 9)","(new, 188)"
4,"(So many , 6)","(many, 131)","(very little , 8)","(more, 185)"


## Parsing - Adverbs

In [16]:
grammar_advph = "ADVPH: {<RB>+<RB>}"
chunk_parser_adv = nltk.RegexpParser(grammar_advph)

In [41]:
#Adverbs
dune_adverb_tokens = []
for sentence in dune_tagged:
    for word, pos in sentence:
        if pos in ['RB', 'RBR', 'RBS']: 
            if len(word)>1:
                dune_adverb_tokens.append(word)
dune_freq_adverb = nltk.FreqDist(dune_adverb_tokens)

worm_adverb_tokens = []
for sentence in worm_tagged:
    for word, pos in sentence:
        if pos in ['RB', 'RBR', 'RBS']:
            if len(word)>1:
                worm_adverb_tokens.append(word)
worm_freq_adverb = nltk.FreqDist(worm_adverb_tokens)
summary_df['Num. Adverbs'] = [len(dune_adverb_tokens), len(worm_adverb_tokens)]

dune_advs = [word for word, freq in dune_freq_adverb.most_common(50)]
worm_advs = [word for word, freq in worm_freq_adverb.most_common(50)]
shared_advs = [word for word in dune_advs if word in worm_advs]
print(f'The {len(shared_advs)} adverbs that occur in both text\'s top 50 adverbs are: ', shared_advs)
dune_unq_advs = [word for word in dune_advs if word not in worm_advs]
worm_unq_advs = [word for word in worm_advs if word not in dune_advs]
print(f'There are {len(dune_unq_advs)} unique adverbs for Dune: ', dune_unq_advs)
print(f'There are {len(worm_unq_advs)} unique adverbs for Sandworms of Dune: ', worm_unq_advs)

The 41 adverbs that occur in both text's top 50 adverbs are:  ['not', "n't", 'here', 'now', 'back', 'there', 'only', 'then', 'so', 'away', 'too', 'never', 'down', 'even', 'still', 'more', 'Now', 'Then', 'yet', 'just', 'well', 'once', 'as', 'most', 'long', 'enough', 'So', 'always', 'up', 'ever', 'Not', 'again', 'far', 'very', 'already', 'forward', 'soon', 'much', 'else', 'Even', 'perhaps']
There are 9 unique adverbs for Dune:  ['almost', 'Perhaps', 'ahead', 'Well', 'Again', 'Presently', 'Here', 'suddenly', 'Only']
There are 9 unique adverbs for Sandworms of Dune:  ['ago', 'no', 'longer', 'also', 'simply', 'together', 'easily', 'exactly', 'finally']


In [18]:
#Adverb Phrases
dune_advph_tags = []
for sent in dune_tagged:
    if len(sent) > 0:
        tree = chunk_parser_adv.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADVPH':
                dune_advph_tags.append(subtree)
                
dune_adverb_phrases = []
for sent in dune_advph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    dune_adverb_phrases.append(temp)

In [19]:
worm_advph_tags = []
for sent in worm_tagged:
    if len(sent) > 0:
        tree = chunk_parser_adv.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADVPH':
                worm_advph_tags.append(subtree)
                
worm_adverb_phrases = []
for sent in worm_advph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    worm_adverb_phrases.append(temp)

In [20]:
dune_freq_advph = nltk.FreqDist(dune_adverb_phrases)
worm_freq_advph = nltk.FreqDist(worm_adverb_phrases)

In [21]:
#Store in DataFrame
advs_df = pd.DataFrame({'dune_advph': dune_freq_advph.most_common(50), \
    'dune_advs': dune_freq_adverb.most_common(50),\
    'worms_advph': worm_freq_advph.most_common(50),\
    'worms_advs': worm_freq_adverb.most_common(50)})
summary_df['Num. AdvPh'] = [len(dune_advph_tags), len(worm_advph_tags)]

In [43]:

dune_advphs = [word for word, freq in dune_freq_advph.most_common(50)]
worm_advphs = [word for word, freq in worm_freq_advph.most_common(50)]
shared_advphs = [word for word in dune_advphs if word in worm_advphs]
print(f'The {len(shared_advphs)} adverb phrases that occur in both text\'s top 50 adverb phrases are: ', shared_advphs)
dune_unq_advphs = [word for word in dune_advphs if word not in worm_advphs]
worm_unq_advphs = [word for word in worm_advphs if word not in dune_advphs]
print(f'There are {len(dune_unq_advphs)} unique adverb phrases for Dune: ', dune_unq_advphs)
print(f'There are {len(worm_unq_advphs)} unique adverb phrases for Sandworms of Dune: ', worm_unq_advphs)

The 18 adverb phrases that occur in both text's top 50 adverb phrases are:  ['as well ', 'not yet ', 'not even ', 'so long ', 'never again ', 'as soon ', 'far away ', 'not just ', 'soon enough ', 'long ago ', 'no longer ', 'not enough ', 'so easily ', 'long enough ', 'too much ', 'perhaps even ', 'Right now ', 'as fast ']
There are 32 unique adverb phrases for Dune:  ['as long ', 'never before ', "n't even ", 'right now ', 'down here ', 'As long ', 'not so ', 'far enough ', "n't really ", 'Surely not ', 'Not even ', 'not as ', 'Very well ', 'not here ', 'here soon ', 'not very ', 'back there ', 'so well ', 'just now ', 'not quite ', 'just enough ', 'not always ', 'never once ', 'almost too ', 'not often ', 'Well then ', 'never even ', 'only now ', 'not really ', 'along presently ', 'far back ', 'here only ']
There are 32 unique adverb phrases for Sandworms of Dune:  ['So far ', '“ Even ', 'Even so ', 'As soon ', 'so long ago ', 'so much ', 'not only ', 'so far ', 'Long ago ', 'Not yet 

In [23]:
advs_df.head()

Unnamed: 0,dune_advph,dune_advs,worms_advph,worms_advs
0,"(as well , 18)","(not, 852)","(as well , 43)","(not, 718)"
1,"(as long , 12)","(n't, 557)","(not yet , 14)","(so, 274)"
2,"(never before , 11)","(here, 364)","(no longer , 14)","(now, 245)"
3,"(n't even , 10)","(now, 344)","(long ago , 14)","(even, 217)"
4,"(not yet , 10)","(back, 258)","(so long , 11)","(back, 214)"


## Parsing - Nouns

In [24]:
grammar_nounph = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser_noun = nltk.RegexpParser(grammar_nounph)

In [44]:
#Nouns
dune_noun_tokens = []
for sentence in dune_tagged:
    for word, pos in sentence:
        if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
            if len(word)>1:
                dune_noun_tokens.append(word)
dune_freq_noun = nltk.FreqDist(dune_noun_tokens)

worm_noun_tokens = []
for sentence in worm_tagged:
    for word, pos in sentence:
        if pos in ['NN', 'NNS', 'NNP', 'NNPS']: 
            if len(word)>1:
                worm_noun_tokens.append(word)
worm_freq_noun = nltk.FreqDist(worm_noun_tokens)
summary_df['Num. Nouns'] = [len(dune_noun_tokens), len(worm_noun_tokens)]

shared_nouns = [word for word, freq in dune_freq_noun.most_common(50)]
shared_nouns = [word for word, freq in worm_freq_noun.most_common(50) if word in shared_nouns]
print(f'The {len(shared_advs)} nouns that occur in both text\'s top 50 nouns are: ', shared_nouns)


dune_nouns = [word for word, freq in dune_freq_noun.most_common(50)]
worm_nouns = [word for word, freq in worm_freq_noun.most_common(50)]
shared_advs = [word for word in dune_nouns if word in worm_nouns]
print(f'The {len(shared_nouns)} nouns that occur in both text\'s top 50 nouns are: ', shared_nouns)
dune_unq_nouns = [word for word in dune_nouns if word not in worm_nouns]
worm_unq_nouns = [word for word in worm_nouns if word not in dune_nouns]
print(f'There are {len(dune_unq_nouns)} unique nouns for Dune: ', dune_unq_nouns)
print(f'There are {len(worm_unq_nouns)} unique nouns for Sandworms of Dune: ', worm_unq_nouns)


The 41 nouns that occur in both text's top 50 nouns are:  ['Paul', 'man', 'time', 'Baron', 'Leto', 'Bene', 'Yueh', 'Mother', 'Jessica', 'eyes', 'way', 'people', 'face', 'Gesserit', 'voice', 'woman', 'desert', 'spice', 'Stilgar', 'Atreides']
The 20 nouns that occur in both text's top 50 nouns are:  ['Paul', 'man', 'time', 'Baron', 'Leto', 'Bene', 'Yueh', 'Mother', 'Jessica', 'eyes', 'way', 'people', 'face', 'Gesserit', 'voice', 'woman', 'desert', 'spice', 'Stilgar', 'Atreides']
There are 30 unique nouns for Dune:  ['Duke', 'Hawat', 'Fremen', 'Gurney', 'Kynes', 'hand', 'men', 'Emperor', 'water', 'Chani', 'sand', 'Arrakis', 'mother', 'Halleck', 'father', 'thing', 'room', 'Feyd-Rautha', 'place', 'Sardaukar', 'Harkonnen', 'thought', 'door', 'mind', 'son', 'head', "Muad'Dib", 'words', 'Piter', 'rock']
There are 30 unique nouns for Sandworms of Dune:  ['Duncan', 'Sheeana', 'Murbella', 'Face', 'machines', 'Teg', 'Erasmus', 'ghola', 'Waff', 'Omnius', 'Paolo', 'Dancers', 'Enemy', 'Tleilaxu', 'me

In [26]:
dune_nounph_tags = []
for sent in dune_tagged:
    if len(sent) > 0:
        tree = chunk_parser_noun.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                dune_nounph_tags.append(subtree)
                
dune_noun_phrases = []
for sent in dune_nounph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    dune_noun_phrases.append(temp)

In [27]:
worm_nounph_tags = []
for sent in worm_tagged:
    if len(sent) > 0:
        tree = chunk_parser_noun.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                worm_nounph_tags.append(subtree)
                
worm_noun_phrases = []
for sent in worm_nounph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    worm_noun_phrases.append(temp)

In [28]:
dune_freq_nounph = nltk.FreqDist(dune_noun_phrases)
worm_freq_nounph = nltk.FreqDist(worm_noun_phrases)

nouns_df = pd.DataFrame({'dune_nounphs': dune_freq_nounph.most_common(50), \
    'dune_nouns': dune_freq_noun.most_common(50),\
    'worm_nounphs': worm_freq_nounph.most_common(50),\
    'worms_nouns': worm_freq_noun.most_common(50)})
summary_df['Num. NounPh'] = [len(dune_nounph_tags), len(worm_nounph_tags)]

In [29]:
nouns_df.head()

Unnamed: 0,dune_nounphs,dune_nouns,worm_nounphs,worms_nouns
0,"(mother , 198)","(Paul, 1690)","(“ , 941)","(Duncan, 403)"
1,"(voice , 197)","(Jessica, 894)","(” , 308)","(Sheeana, 369)"
2,"(father , 192)","(Baron, 567)","(s , 298)","(Murbella, 310)"
3,"(water , 163)","(Duke, 555)","(t , 160)","(Face, 299)"
4,"(face , 159)","(man, 481)","(’ t , 144)","(Paul, 248)"


In [45]:
dune_nounphs = [word for word, freq in dune_freq_nounph.most_common(50)]
worm_nounphs = [word for word, freq in worm_freq_nounph.most_common(50)]
shared_nounphs = [word for word in dune_nounphs if word in worm_nounphs]
print(f'The {len(shared_nounphs)} noun phrases that occur in both text\'s top 50 noun phrases are: ', shared_nounphs)
dune_unq_nounphs = [word for word in dune_nounphs if word not in worm_nounphs]
worm_unq_nounphs = [word for word in worm_nounphs if word not in dune_nounphs]
print(f'There are {len(dune_unq_nounphs)} unique noun phrases for Dune: ', dune_unq_nounphs)
print(f'There are {len(worm_unq_nounphs)} unique noun phrases for Sandworms of Dune: ', worm_unq_nounphs)

The 18 noun phrases that occur in both text's top 50 noun phrases are:  ['voice ', 'face ', 'mind ', 'hand ', 'head ', 'the way ', 'something ', 'the desert ', 'time ', 'life ', 'man ', 'nothing ', 'spice ', 'way ', 'side ', 'body ', 'anything ', 'the floor ']
There are 32 unique noun phrases for Dune:  ['mother ', 'father ', 'water ', 'the man ', 'son ', 'thought ', 'the sand ', 'attention ', 'the room ', 'sand ', 'the table ', 'the door ', 'a man ', "m'Lord ", 'the spice ', 'fear ', 'mouth ', 'arm ', 'course ', 'a thing ', 'the basin ', 'room ', 'The man ', 'death ', 'front ', 'the troop ', 'the old woman ', 'someone ', 'a hand ', 'robe ', 'rock ', 'awareness ']
There are 32 unique noun phrases for Sandworms of Dune:  ['“ ', '” ', 's ', 't ', '’ t ', '” “ ', 'everything ', 'melange ', 'space ', 'the thinking ', 'the ship ', '’ ', 'the ghola ', 'ghola ', 'the evermind ', 'humanity ', 'fleet ', 'the no-ship ', 'the old man ', 'the robot ', 'the machine ', 'the boy ', 'the air ', 'machi

In [31]:
summary_df

Unnamed: 0,Num. Sentences,Avg. Sentence Len,Num. Adjectives,Num. AdjPh,Num. Adverbs,Num. AdvPh,Num. Nouns,Num. NounPh
Dune,17668,13.420874,10575,738,8803,548,49588,29983
Worms,9448,17.989733,11819,970,8217,685,37770,21538


## Sentiment Analysis

In [48]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer


[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\lizzi\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lizzi\AppData\Roaming\nltk_data...


In [50]:
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(dune_text)

{'neg': 0.094, 'neu': 0.818, 'pos': 0.088, 'compound': -1.0}

In [51]:
sia.polarity_scores(worm_text)

{'neg': 0.119, 'neu': 0.775, 'pos': 0.106, 'compound': -1.0}