In [17]:
import nltk
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.collocations import *
from nltk.text import Text 
import re
import pandas as pd

In [2]:
# Read the cleaned Sandworms text into memory as a single string.
# The text contains typographic apostrophes (non-ASCII), so the encoding is
# pinned to UTF-8 instead of relying on the platform's default locale.
# NOTE(review): assumes the prepared file was saved as UTF-8 -- confirm.
with open("data/prepared_texts/sandworms_clean.txt", 'r',
          encoding="utf-8") as file:
    worm_file = file.read()

# Create Corpuses and Tokens

In [3]:
# Open dune.txt from the prepared-texts directory as an NLTK plaintext corpus.
corpus_root = "data/prepared_texts/"
corpus_dune = PlaintextCorpusReader(corpus_root, "dune.txt")


#### Dune Tokens

In [4]:
# Tokenize Dune. Contractions stay in one piece, and "...", "--" and the
# single marks . , ! ? ; become tokens of their own, because that odd
# punctuation is considered part of F. Herbert's style.
dune_raw = corpus_dune.raw()
dune_tokens = re.findall(r"[\w']+|\.\.\.|--|[.,!?;]", dune_raw)
# Lower-case every token before building the NLTK Text.
dune_words = list(map(str.lower, dune_tokens))
dune_text = Text(dune_words)

#### Worm Tokens

In [5]:
# Same tokenization as for Dune, applied to Sandworms.
# This text uses a different apostrophe character, so it is swapped for the
# plain ' that the token pattern recognizes before tokenizing.
worm_tokens = re.findall(
    r"[\w']+|\.\.\.|--|[.,!?;]",
    re.sub('’', '\'', worm_file),
)
# Lower-case every token before building the NLTK Text.
worm_words = list(map(str.lower, worm_tokens))
worm_text = Text(worm_words)

# Analysis

### Frequencies

In [6]:
# Frequency distributions over the full lower-cased token lists
# (punctuation tokens are still included at this point).
dune_dist, worm_dist = FreqDist(dune_words), FreqDist(worm_words)

#### See top 50 tokens of raw text

In [7]:
# Side-by-side table of the 50 most common (token, count) pairs per book.
top_words_raw = pd.DataFrame({
    'dune': dune_dist.most_common(50),
    'worm': worm_dist.most_common(50),
})
top_words_raw

Unnamed: 0,dune,worm
0,"(., 15083)","(., 10229)"
1,"(the, 13537)","(the, 9758)"
2,"(,, 11474)","(,, 8582)"
3,"(of, 4768)","(to, 3602)"
4,"(a, 4433)","(of, 3493)"
5,"(to, 3955)","(and, 3331)"
6,"(and, 3473)","(a, 2906)"
7,"(he, 2841)","(he, 1778)"
8,"(in, 2476)","(had, 1747)"
9,"(his, 2418)","(in, 1611)"


#### Top 50 tokens once stopwords and punctuation marks are removed.

In [8]:
#Filter out non-alphabetical tokens
# NOTE: this name shadows the builtin filter(); it is kept because other
# cells in this notebook call the function by this name.
def filter(w):
    """Tell whether token ``w`` contains no lowercase a-z letter at all.

    Returns True when ``w`` is non-empty and made up only of characters
    outside a-z (e.g. ".", "--", "123"), and False otherwise, including for
    the empty string and for tokens such as "don't" that mix letters with
    other characters.
    """
    return re.match('^[^a-z]+$', w) is not None

In [9]:
# Keep only the tokens that filter() does not flag as non-alphabetical.
dune_alpha = [token for token in dune_words
              if not filter(token)]
worm_alpha = [token for token in worm_words
              if not filter(token)]

In [10]:
# Recompute the distributions on the alphabetical tokens only, then tabulate
# the 50 most common (token, count) pairs per book.
dune_dist, worm_dist = FreqDist(dune_alpha), FreqDist(worm_alpha)
top_words_alpha = pd.DataFrame({
    'dune': dune_dist.most_common(50),
    'worm': worm_dist.most_common(50),
})
top_words_alpha

Unnamed: 0,dune,worm
0,"(the, 13537)","(the, 9758)"
1,"(of, 4768)","(to, 3602)"
2,"(a, 4433)","(of, 3493)"
3,"(to, 3955)","(and, 3331)"
4,"(and, 3473)","(a, 2906)"
5,"(he, 2841)","(he, 1778)"
6,"(in, 2476)","(had, 1747)"
7,"(his, 2418)","(in, 1611)"
8,"(said, 2253)","(his, 1589)"
9,"(you, 2133)","(that, 1201)"


In [15]:
#Filter out stopwords
# NLTK's English stopword list, extended with a few extra words.
stop_words = nltk.corpus.stopwords.words('english') + \
['would', 'could', 'like','us']

# The membership test below runs once per token, so look words up in a set
# rather than scanning the list each time. stop_words itself stays a list
# because other cells refer to it.
stop_word_set = set(stop_words)

dune_no_stop = [word for word in dune_alpha
                if word not in stop_word_set]
worm_no_stop = [word for word in worm_alpha
                if word not in stop_word_set]

In [13]:
# Distributions over the stopword-free tokens, tabulated as the 50 most
# common (token, count) pairs per book.
dune_dist, worm_dist = FreqDist(dune_no_stop), FreqDist(worm_no_stop)
top_words_no_stop = pd.DataFrame({
    'dune': dune_dist.most_common(50),
    'worm': worm_dist.most_common(50),
})
top_words_no_stop

Unnamed: 0,dune,worm
0,"(said, 2253)","(one, 474)"
1,"(paul, 1559)","(face, 469)"
2,"(jessica, 850)","(duncan, 431)"
3,"(one, 619)","(sheeana, 349)"
4,"(thought, 617)","(even, 330)"
5,"(baron, 547)","(said, 295)"
6,"(duke, 494)","(back, 289)"
7,"(man, 442)","(old, 280)"
8,"(fremen, 415)","(murbella, 277)"
9,"(asked, 373)","(time, 255)"


#### Normalize frequencies for easier comparison across texts of different lengths

In [14]:
# Token totals per book. These are taken from the unfiltered token lists, so
# punctuation and stopword tokens count towards the denominator.
num_words_dune, num_words_worm = len(dune_words), len(word_list := worm_words)

# Turn each (word, count) pair of the stopword-free top list into
# (word, count / total tokens in that book).
dune_normal = [
    (token, count / num_words_dune)
    for token, count in top_words_no_stop['dune']
]
worm_normal = [
    (token, count / num_words_worm)
    for token, count in top_words_no_stop['worm']
]

top_normalized = pd.DataFrame({'dune': dune_normal, 'worm': worm_normal})
top_normalized

Unnamed: 0,dune,worm
0,"(said, 0.01024607870334578)","(one, 0.002885036762915713)"
1,"(paul, 0.007089940833784319)","(face, 0.002854603885669767)"
2,"(jessica, 0.0038655867278490513)","(duncan, 0.0026233140186005745)"
3,"(one, 0.0028150566876924267)","(sheeana, 0.0021242148317670547)"
4,"(thought, 0.0028059611895092525)","(even, 0.0020085698982324584)"
5,"(baron, 0.002487618753098154)","(said, 0.001795539757510834)"
6,"(duke, 0.0022465880512440367)","(back, 0.0017590203048156985)"
7,"(man, 0.0020101050984815067)","(old, 0.001704241125772995)"
8,"(fremen, 0.0018873158730086544)","(murbella, 0.0016859813994254272)"
9,"(asked, 0.0016963104111619954)","(time, 0.0015520767395432633)"


### Bi-grams - Frequency

#### Create finders

In [47]:
# Scoring functions for bigram association.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# One collocation finder per book, built over the full lower-cased tokens.
dune_finder, worm_finder = (
    BigramCollocationFinder.from_words(dune_words),
    BigramCollocationFinder.from_words(worm_words),
)


#### Filter out bigrams including punctuation

In [48]:
# Hand the non-alphabetical token test to each finder as a word filter.
for finder in (dune_finder, worm_finder):
    finder.apply_word_filter(filter)

#### Filter out stopwords

In [49]:
# Hand a stopword membership test to each finder as a word filter.
for finder in (dune_finder, worm_finder):
    finder.apply_word_filter(lambda token: token in stop_words)

#### Score frequencies

In [50]:
# Score the bigrams each finder still holds with the raw-frequency measure.
dune_scored, worm_scored = (
    dune_finder.score_ngrams(bigram_measures.raw_freq),
    worm_finder.score_ngrams(bigram_measures.raw_freq),
)

#### Create dataframe of top 50 bigrams by frequency

In [29]:
# First 50 scored bigrams of each book, side by side.
bigram_frequencies = pd.DataFrame({
    'dune': dune_scored[:50],
    'worm': worm_scored[:50],
})
bigram_frequencies

Unnamed: 0,dune,worm
0,"((paul, said), 0.001337038232926613)","((face, dancers), 0.0009677654964210937)"
1,"((feyd, rautha), 0.0008185948364856815)","((thinking, machines), 0.0008825534401324438)"
2,"((bene, gesserit), 0.0006275893746390224)","((face, dancer), 0.0008642937137848761)"
3,"((reverend, mother), 0.000573016385539977)","((bene, gesserit), 0.0007912548083946049)"
4,"((baron, said), 0.0005684686364483899)","((kwisatz, haderach), 0.0007912548083946049)"
5,"((jessica, said), 0.0005684686364483899)","((mother, commander), 0.0007243024784535229)"
6,"((stilgar, said), 0.0005002524000745831)","((duncan, idaho), 0.00040780055509568095)"
7,"((kynes, said), 0.0004229406655176021)","((old, man), 0.0003651945269513561)"
8,"((duke, said), 0.0004047496691512536)","((leto, ii), 0.0003469348006037883)"
9,"((jessica, thought), 0.00040020192005966646)","((bene, gesserits), 0.000340848225154599)"


In [51]:
def top_shared_bigrams(reference_scored, other_scored, top_n=50):
    """Return the bigrams found in the first ``top_n`` entries of both lists.

    Parameters
    ----------
    reference_scored, other_scored : sequence of (bigram, score) pairs
        Scored bigrams; only the first ``top_n`` entries of each are used.
    top_n : int, default 50
        How many leading entries of each list to compare.

    Returns
    -------
    list
        The shared bigrams, in the order they appear in ``other_scored``.
    """
    # A set gives constant-time membership tests.
    reference_top = {bigram for bigram, _ in reference_scored[:top_n]}
    return [bigram for bigram, _ in other_scored[:top_n]
            if bigram in reference_top]


# Dune's first 50 scored bigrams, kept as a list under its original name.
dune_bigrams = [bigram for bigram, _ in dune_scored[:50]]
# Bigrams that are among the first 50 scored entries of both books.
shared_bigrams = top_shared_bigrams(dune_scored, worm_scored)
shared_bigrams

[('bene', 'gesserit'),
 ('kwisatz', 'haderach'),
 ('duncan', 'idaho'),
 ('reverend', 'mother'),
 ('thufir', 'hawat'),
 ('old', 'woman'),
 ('duke', 'leto')]

### Bi-grams - Mutual Information

In [38]:
# Build fresh finders from the full token lists, then apply a frequency
# filter of 5 to each of them.
dune_finder = BigramCollocationFinder.from_words(dune_words)
worm_finder = BigramCollocationFinder.from_words(worm_words)
for finder in (dune_finder, worm_finder):
    finder.apply_freq_filter(5)


In [39]:
# Score the bigrams each finder still holds with the PMI measure.
dune_scored, worm_scored = (
    dune_finder.score_ngrams(bigram_measures.pmi),
    worm_finder.score_ngrams(bigram_measures.pmi),
)

In [42]:
# First 50 PMI-scored bigrams of each book, side by side.
bigrams_pmi = pd.DataFrame({
    'dune': dune_scored[:50],
    'worm': worm_scored[:50],
})
bigrams_pmi

Unnamed: 0,dune,worm
0,"((bela, tegeuse), 15.424487814381784)","((van, gogh), 15.004009735976137)"
1,"((ajax, niner), 15.161453408547988)","((gold, hilted), 14.325937830863499)"
2,"((diamond, tattoo), 15.161453408547988)","((optic, threads), 13.78161731463969)"
3,"((delta, ajax), 14.93906098721154)","((shai, hulud), 13.625498112722408)"
4,"((shaddam, iv), 14.93906098721154)","((nullentropy, capsule), 13.419047235254979)"
5,"((giedi, prime), 14.746415909269146)","((crystal, sheets), 13.325937830863499)"
6,"((bi, lal), 14.57649090782683)","((shakkad, station), 13.2555485029721)"
7,"((lal, kaifa), 14.57649090782683)","((god's, messenger), 13.226402157312585)"
8,"((e, e), 14.4536341600413)","((chief, fabricator), 13.131559785696648)"
9,"((gaius, helen), 14.424487814381784)","((crystalline, teeth), 12.892978423587392)"


In [45]:
# Bigrams among the first 50 scored entries of both books, listed in the
# order they appear in the Sandworms scores.
dune_bigrams = [pair for pair, _score in dune_scored[:50]]
shared_bigrams = [
    pair
    for pair, _score in worm_scored[:50]
    if pair in dune_bigrams
]
shared_bigrams

[('shai', 'hulud'), ('de', 'vries')]