In [1]:
import nltk

In [2]:
from urllib.request import urlopen
my_url = "http://gutenberg.net.au/ebooks02/0200331.txt"
file = urlopen(my_url)
raw = file.read()
orlando = raw.decode()
o_tokens = nltk.word_tokenize(orlando)

In [3]:
o_text = o_tokens[872:-5]

In [4]:
orl = nltk.Text(o_text)
orl

<Text: He -- for there could be no doubt...>

# cleaning the text (caps, punct, stops)

In [5]:
orl[:41]

['He',
 '--',
 'for',
 'there',
 'could',
 'be',
 'no',
 'doubt',
 'of',
 'his',
 'sex',
 ',',
 'though',
 'the',
 'fashion',
 'of',
 'the',
 'time',
 'did',
 'something',
 'to',
 'disguise',
 'it',
 '--',
 'was',
 'in',
 'the',
 'act',
 'of',
 'slicing',
 'at',
 'the',
 'head',
 'of',
 'a',
 'Moor',
 'which',
 'swung',
 'from',
 'the',
 'rafters']

In [6]:
# lowercasing and removing punct
lower_no_punct = []
for word in orl:
    if word.isalpha():
        lower_no_punct.append(word.lower())

In [7]:
from nltk.corpus import stopwords
stops = stopwords.words('english')
# removing stops
no_stops = [word for word in lower_no_punct if word not in stops]

In [8]:
no_stops[:10]

['could',
 'doubt',
 'sex',
 'though',
 'fashion',
 'time',
 'something',
 'disguise',
 'act',
 'slicing']

In [9]:
# lemmatizing
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
clean_text = []
for word in no_stops:
    word_lem = wordnet_lemmatizer.lemmatize(word)
    clean_text.append(word_lem)
    
clean_text[:30]

['could',
 'doubt',
 'sex',
 'though',
 'fashion',
 'time',
 'something',
 'disguise',
 'act',
 'slicing',
 'head',
 'moor',
 'swung',
 'rafter',
 'colour',
 'old',
 'football',
 'le',
 'shape',
 'one',
 'save',
 'sunken',
 'cheek',
 'strand',
 'two',
 'coarse',
 'dry',
 'hair',
 'like',
 'hair']

In [10]:
orl = clean_text

# similar words, first level

to compute distinctive similarity, we:
- make a list of words similar to "man" or "woman" using Text.similar()
- find words similar to those words
- filter out the words that are shared among the two lists
- repeat as needed


In [11]:
# to make these lists of distictive similar words automatically, 
# use ContextIndex class
idx = nltk.text.ContextIndex(orl)
# put words similar to woman in a list
woman_sim_1 = idx.similar_words("woman")
woman_sim_1_str = " ".join(woman_sim_1)
woman_sim_1_str

'reached till friend word moment saw always could cried sailor wit scarcely petticoat go servant conclusion'

In [12]:
# put words similar to man in a list
man_sim_1 = idx.similar_words("man")
man_sim_1
man_sim_1_str = " ".join(man_sim_1)
man_sim_1_str

'hurry father window tongue carriage still even countrywoman indulged old fortune title ship writing fell become always love grown never'

In [13]:
# loops through woman similar and filters out words shared with man similar
woman_disc_1 = []
for word in woman_sim_1:
    if word not in man_sim_1:
        woman_disc_1.append(word)
woman_disc_1

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']

In [14]:
# loops through man similar and filters out words shared with woman similar
man_disc_1 = []
for word in man_sim_1:
    if word not in woman_sim_1:
        man_disc_1.append(word)
man_disc_1

['hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'love',
 'grown',
 'never']

# similar words, second level

In [15]:
# loop that computes similar words for each word from woman_sim_1
woman_sim_2_nested = []
for word in woman_disc_1:
    woman_sim_2_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
woman_sim_2 = [inner
    for outer in woman_sim_2_nested
        for inner in outer]

# create a string that joins all words
woman_sim_2_str =" ".join(set(woman_sim_2))
woman_sim_2_str

'wisdom stood roused till encumbrance another wit laughing bird woman might rust petticoat thing understood shape madam fool always way account ran sailor none used happiness toss cried make friend time known explain year first word scarcely would mixture could indeed allied met prophet reached say fly dog window leave thought come sake able london'

In [16]:
len(set(woman_sim_2))

55

In [17]:
# creating a loop to find similar words to man_sim_1
man_sim_2_nested = []
for word in man_sim_1:
    man_sim_2_nested.append(idx.similar_words(word))
man_sim_2_nested

# collapsing the nested list
man_sim_2 = [inner
    for outer in man_sim_2_nested
        for inner in outer]

man_sim_2_str =" ".join(set(man_sim_2))
man_sim_2_str

'exactly fault alone roused man almost death moment little need must title woman without soon grown always admit sens kinsman monstrously love matter hurry written strength full orlando carriage hear hungry tongue saw word sink father would perhaps boy plump become ever fortune determined morning take whose still window old two passion come gloomy beautiful'

In [18]:
# loops through woman similar and filters out words shared with man similar
woman_disc_2 = []
for word in woman_sim_2:
    if word not in man_sim_2:
        woman_disc_2.append(word)
woman_disc_2

['till',
 'reached',
 'friend',
 'time',
 'dog',
 'explain',
 'thing',
 'another',
 'mixture',
 'toss',
 'way',
 'first',
 'met',
 'madam',
 'stood',
 'might',
 'used',
 'understood',
 'prophet',
 'bird',
 'thought',
 'shape',
 'none',
 'cried',
 'allied',
 'known',
 'laughing',
 'fool',
 'london',
 'could',
 'ran',
 'wit',
 'able',
 'happiness',
 'wisdom',
 'sailor',
 'say',
 'petticoat',
 'indeed',
 'scarcely',
 'encumbrance',
 'account',
 'fly',
 'leave',
 'rust',
 'make',
 'sake',
 'year']

In [19]:
len(set(woman_disc_2))

48

In [20]:
# loops through woman similar and filters out words shared with man similar
man_disc_2 = []
for word in man_sim_2:
    if word not in woman_sim_2:
        man_disc_2.append(word)
man_disc_2

['man',
 'father',
 'tongue',
 'carriage',
 'still',
 'man',
 'written',
 'fault',
 'hurry',
 'tongue',
 'carriage',
 'still',
 'man',
 'old',
 'hurry',
 'father',
 'tongue',
 'carriage',
 'still',
 'strength',
 'moment',
 'man',
 'hurry',
 'father',
 'carriage',
 'still',
 'man',
 'hurry',
 'father',
 'tongue',
 'still',
 'man',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'hurry',
 'father',
 'tongue',
 'carriage',
 'full',
 'two',
 'determined',
 'morning',
 'man',
 'matter',
 'man',
 'man',
 'strength',
 'man',
 'must',
 'plump',
 'sens',
 'morning',
 'man',
 'title',
 'man',
 'fortune',
 'man',
 'whose',
 'passion',
 'boy',
 'little',
 'man',
 'sink',
 'man',
 'love',
 'man',
 'grown',
 'become',
 'love',
 'man',
 'grown',
 'saw',
 'take',
 'kinsman',
 'perhaps',
 'soon',
 'become',
 'man',
 'grown',
 'exactly',
 'become',
 'love',
 'man',
 'alone',
 'gloomy',
 'almost',
 'death',
 'man',
 'ever',
 'orlando',
 'admit']

In [21]:
woman_disc_2_str = ' '.join(set(woman_disc_2))
woman_disc_2_str

'wisdom stood till encumbrance another wit laughing bird might rust petticoat thing understood shape madam fool way account ran sailor none used happiness toss cried make time friend known explain year first scarcely indeed mixture could allied met prophet reached say fly dog leave thought sake able london'

In [22]:
man_disc_2_str = ' '.join(set(man_disc_2))
man_disc_2_str

'exactly fault alone man almost death moment little need must title without soon grown admit sens kinsman monstrously love matter hurry written strength full orlando carriage hear hungry tongue saw sink father perhaps boy plump become ever fortune determined morning take whose still old two passion gloomy beautiful'

# Third Level

In [23]:
# 3rd level words similar to woman_disc_2
woman_sim_3_nested = []
for word in woman_sim_2:
    woman_sim_3_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
woman_sim_3 = [inner
    for outer in woman_sim_3_nested
        for inner in outer]

woman_sim_3

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion',
 'woman',
 'reached',
 'reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion',
 'woman',
 'till',
 'friend',
 'woman',
 'time',
 'dog',
 'window',
 'roused',
 'reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion',
 'word',
 'woman',
 'reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion',
 'went',
 'top',
 'gave',
 'word',
 'green',
 'character',
 'thought',
 'ribbon',
 'would',
 'finger',
 'draw',
 'true',
 'side',
 'visit',
 'way',
 'street',
 'must',
 'sex',
 'insipid',
 'thing'

In [24]:
# 3rd level words similar to man_disc_2
man_sim_3_nested = []
for word in man_sim_2:
    man_sim_3_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
man_sim_3 = [inner
    for outer in man_sim_3_nested
        for inner in outer]

man_sim_3

['hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'always',
 'love',
 'grown',
 'never',
 'man',
 'written',
 'fault',
 'hurry',
 'window',
 'tongue',
 'carriage',
 'still',
 'man',
 'old',
 'roused',
 'word',
 'hurry',
 'father',
 'tongue',
 'carriage',
 'still',
 'strength',
 'moment',
 'man',
 'hurry',
 'father',
 'window',
 'carriage',
 'still',
 'man',
 'hurry',
 'father',
 'window',
 'tongue',
 'still',
 'man',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'always',
 'love',
 'grown',
 'never',
 'father',
 'fault',
 'remembered',
 'father',
 'written',
 'man',
 'father',
 'window',
 'tongue',
 'carriag

In [25]:
# loops through woman similar and filters out words shared with man similar
woman_disc_3 = []
for word in woman_sim_3:
    if word not in man_sim_3:
        woman_disc_3.append(word)
woman_disc_3

['went',
 'top',
 'gave',
 'green',
 'character',
 'thought',
 'ribbon',
 'finger',
 'draw',
 'true',
 'side',
 'street',
 'good',
 'since',
 'sawings',
 'among',
 'person',
 'people',
 'sag',
 'top',
 'ring',
 'knee',
 'draw',
 'true',
 'side',
 'street',
 'frequent',
 'fasten',
 'raising',
 'standing',
 'bald',
 'launched',
 'burst',
 'gently',
 'flung',
 'used',
 'let',
 'suddenly',
 'might',
 'thought',
 'spoke',
 'may',
 'bird',
 'prophet',
 'none',
 'hot',
 'looked',
 'third',
 'heard',
 'voyage',
 'omit',
 'thought',
 'try',
 'london',
 'ran',
 'prayer',
 'hid',
 'beneath',
 'new',
 'might',
 'used',
 'prophet',
 'bird',
 'thought',
 'come',
 'shape',
 'none',
 'allied',
 'known',
 'laughing',
 'fool',
 'went',
 'ala',
 'reflection',
 'catching',
 'able',
 'happiness',
 'wisdom',
 'ask',
 'bethink',
 'blackness',
 'found',
 'scramble',
 'going',
 'thought',
 'come',
 'denied',
 'leave',
 'observed',
 'groping',
 'minute',
 'day',
 'indeed',
 'case',
 'leave',
 'rust',
 'summer',

In [26]:
# loops through woman similar and filters out words shared with man similar
man_disc_3 = []
for word in man_sim_3:
    if word not in woman_sim_3:
        man_disc_3.append(word)
man_disc_3

['even',
 'countrywoman',
 'indulged',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'written',
 'fault',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'even',
 'countrywoman',
 'indulged',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'fault',
 'remembered',
 'written',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'even',
 'countrywoman',
 'indulged',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'plump',
 'written',
 'fault',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'explain',
 'mixture',
 'toss',
 'first',
 'even',
 'countrywoman',
 'indulged',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'written',
 'fault',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'even',
 'countrywoman',
 'indulged',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'written',
 'fault',
 'need',
 'hungry',
 'beautiful',
 'monstrously',
 'hear',
 'without',
 'even',
 'c

In [27]:
set(woman_disc_3)

{'able',
 'ala',
 'allied',
 'among',
 'ask',
 'bald',
 'beneath',
 'bethink',
 'bird',
 'blackness',
 'bred',
 'burst',
 'case',
 'catching',
 'character',
 'child',
 'come',
 'day',
 'denied',
 'draw',
 'fasten',
 'finger',
 'flung',
 'fly',
 'fool',
 'found',
 'frequent',
 'gave',
 'gently',
 'going',
 'good',
 'green',
 'groping',
 'happiness',
 'heard',
 'hid',
 'hot',
 'indeed',
 'knee',
 'known',
 'laughing',
 'launched',
 'leave',
 'let',
 'london',
 'looked',
 'may',
 'might',
 'minute',
 'month',
 'nail',
 'new',
 'none',
 'observed',
 'omit',
 'people',
 'person',
 'prayer',
 'profit',
 'prophet',
 'raising',
 'ran',
 'reflection',
 'ribbon',
 'ring',
 'rust',
 'sag',
 'sawings',
 'scramble',
 'shape',
 'side',
 'since',
 'slept',
 'slipping',
 'spoke',
 'standing',
 'street',
 'suddenly',
 'summer',
 'third',
 'thought',
 'thousand',
 'top',
 'true',
 'try',
 'upstairs',
 'used',
 'voyage',
 'went',
 'wisdom'}

In [28]:
woman_disc3_str = " ".join(set(woman_disc_3))
woman_disc3_str

'summer rust burst shape thousand beneath spoke let none raising catching character prophet fly person sag flung denied laughing bird might groping reflection bred ran knee used going bethink try indeed slipping bald launched found prayer gently among day sawings able standing wisdom upstairs frequent street ring month scramble people green third good draw nail hid london ala profit observed blackness went ask fool omit heard happiness top voyage hot new suddenly known may minute slept child gave allied true fasten since side case finger leave looked thought come ribbon'

In [29]:
man_disc3_str = " ".join(set(man_disc_3))
man_disc3_str

'ship turn stood pushing story admitted immovable longed filled talk written agitate princess explain insisted round hear first wicket tied needing plump inborn indulged remembered whole put beast raised set writing soon fell looking long sens monstrously toss certain even mixture fortune met profound mind whose awkwardness imagine without sleep taste madam countrywoman iron treachery made said blush perhaps fumbled sofa many longer husband deep passion rout beautiful exactly fault whatever boyish pared need title tell kinsman glance ague hungry part beard circle morning take'