In [1]:
import nltk
import matplotlib
%matplotlib inline
import pandas as pd

In [2]:
from urllib.request import urlopen
my_url = "http://gutenberg.net.au/ebooks02/0200331.txt"
file = urlopen(my_url)
raw = file.read()
orlando = raw.decode()
o_tokens = nltk.word_tokenize(orlando)

In [3]:
o_text = o_tokens[872:-5]

In [4]:
orl = nltk.Text(o_text)
orl

<Text: He -- for there could be no doubt...>

# cleaning the text (caps, punct, stops)

In [15]:
orl[:25]

['He',
 '--',
 'for',
 'there',
 'could',
 'be',
 'no',
 'doubt',
 'of',
 'his',
 'sex',
 ',',
 'though',
 'the',
 'fashion',
 'of',
 'the',
 'time',
 'did',
 'something',
 'to',
 'disguise',
 'it',
 '--',
 'was']

In [9]:
# lowercasing and removing punct
lower_no_punct = []
for word in orl:
    if word.isalpha():
        lower_no_punct.append(word.lower())

In [12]:
from nltk.corpus import stopwords
stops = stopwords.words('english')
# removing stops
no_stops = [word for word in lower_no_punct if word not in stops]

In [14]:
no_stops[:10]

['could',
 'doubt',
 'sex',
 'though',
 'fashion',
 'time',
 'something',
 'disguise',
 'act',
 'slicing']

# similar words, first level

to compute distinctive similarity, we:
- make a list of words similar to "man" or "woman" using Text.similar()
- find words similar to those words
- filter out the words that are shared among the two lists
- repeat as needed


In [6]:
# to make these lists of distictive similar words automatically, 
# use ContextIndex class
idx = nltk.text.ContextIndex(orl)
# put words similar to woman in a list
woman_sim_1 = idx.similar_words("woman")
woman_sim_1

['man',
 'moment',
 'boy',
 'little',
 'lover',
 'house',
 'girl',
 'poet',
 'child',
 'thought',
 'lady',
 'fox',
 'then',
 'Duke',
 'summer',
 'tree',
 'gipsy',
 'shadow',
 'fat',
 'trance']

In [7]:
# put words similar to man in a list
man_sim_1 = idx.similar_words("man")
man_sim_1

['woman',
 'moment',
 'boy',
 'house',
 'poet',
 'little',
 'word',
 'lover',
 'pair',
 'thought',
 'lady',
 'child',
 'score',
 'summer',
 'time',
 'night',
 'life',
 'river',
 'it',
 'girl']

In [8]:
# loops through woman similar and filters out words shared with man similar
woman_disc_1 = []
for word in woman_sim_1:
    if word not in man_sim_1:
        woman_disc_1.append(word)
woman_disc_1

['man', 'fox', 'then', 'Duke', 'tree', 'gipsy', 'shadow', 'fat', 'trance']

In [9]:
# loops through man similar and filters out words shared with woman similar
man_disc_1 = []
for word in man_sim_1:
    if word not in woman_sim_1:
        man_disc_1.append(word)
man_disc_1

['woman', 'word', 'pair', 'score', 'time', 'night', 'life', 'river', 'it']

# similar words, second level

In [10]:
# loop that computes similar words for each word from woman_sim_1
woman_sim_2_nested = []
for word in woman_sim_1:
    woman_sim_2_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
woman_sim_2 = [inner
    for outer in woman_sim_2_nested
        for inner in outer]

woman_sim_2

['woman',
 'moment',
 'boy',
 'house',
 'poet',
 'little',
 'word',
 'lover',
 'pair',
 'thought',
 'lady',
 'child',
 'score',
 'summer',
 'time',
 'night',
 'life',
 'river',
 'it',
 'girl',
 'woman',
 'window',
 'room',
 'man',
 'time',
 'ground',
 'house',
 'table',
 'world',
 'sky',
 'door',
 'age',
 'word',
 'river',
 'next',
 'truth',
 'floor',
 'ice',
 'courtyard',
 'Serpentine',
 'woman',
 'man',
 'moment',
 'window',
 'reader',
 'poet',
 'lover',
 'sooner',
 'room',
 'house',
 'ground',
 'world',
 'biographer',
 'girl',
 'time',
 'thought',
 'river',
 'child',
 'summer',
 'little',
 'woman',
 'man',
 'pair',
 'score',
 'moment',
 'toy',
 'word',
 'piece',
 'child',
 'lover',
 'house',
 'glass',
 'thought',
 'touch',
 'variety',
 'gipsy',
 'trance',
 'melon',
 'fox',
 'Duke',
 'woman',
 'man',
 'moment',
 'little',
 'thought',
 'boy',
 'house',
 'child',
 'hand',
 'standstill',
 'time',
 'word',
 'head',
 'trance',
 'melon',
 'fox',
 'gipsy',
 'Duke',
 'shadow',
 'pleasant',
 

In [11]:
# creating a loop to find similar words to man_sim_1
man_sim_2_nested = []
for word in man_sim_1:
    man_sim_2_nested.append(idx.similar_words(word))
man_sim_2_nested

# collapsing the nested list
man_sim_2 = [inner
    for outer in woman_sim_2_nested
        for inner in outer]

man_sim_2


['woman',
 'moment',
 'boy',
 'house',
 'poet',
 'little',
 'word',
 'lover',
 'pair',
 'thought',
 'lady',
 'child',
 'score',
 'summer',
 'time',
 'night',
 'life',
 'river',
 'it',
 'girl',
 'woman',
 'window',
 'room',
 'man',
 'time',
 'ground',
 'house',
 'table',
 'world',
 'sky',
 'door',
 'age',
 'word',
 'river',
 'next',
 'truth',
 'floor',
 'ice',
 'courtyard',
 'Serpentine',
 'woman',
 'man',
 'moment',
 'window',
 'reader',
 'poet',
 'lover',
 'sooner',
 'room',
 'house',
 'ground',
 'world',
 'biographer',
 'girl',
 'time',
 'thought',
 'river',
 'child',
 'summer',
 'little',
 'woman',
 'man',
 'pair',
 'score',
 'moment',
 'toy',
 'word',
 'piece',
 'child',
 'lover',
 'house',
 'glass',
 'thought',
 'touch',
 'variety',
 'gipsy',
 'trance',
 'melon',
 'fox',
 'Duke',
 'woman',
 'man',
 'moment',
 'little',
 'thought',
 'boy',
 'house',
 'child',
 'hand',
 'standstill',
 'time',
 'word',
 'head',
 'trance',
 'melon',
 'fox',
 'gipsy',
 'Duke',
 'shadow',
 'pleasant',
 

In [13]:
# loops through woman similar and filters out words shared with man similar
woman_disc_2 = []
for word in woman_sim_2:
    if word not in man_sim_2:
        woman_disc_2.append(word)
woman_disc_2

[]

In [29]:
# words similar to woman_disc
woman_disc1_str = " ".join(woman_disc_1)
#woman_similar_2 = idx.similar_words(woman_disc1_str)
#woman_similar_2
woman_disc1_str

'man fox then Duke tree gipsy shadow fat trance'

In [34]:
for word
woman_similar_2 = idx.similar_words("man, fox")
woman_similar_2

[]

In [28]:
# see the words that are similar to the distinctly masculine words above
for word in man_list:
    orl.similar(word)

'man fox then Duke tree gipsy shadow fat trance'

In [None]:
# put those similar words into a list
masc_similar_2 = set(( """park man room sky word queen time and house air lady table age other
world sight question pavement one hair
thought man word one orlando poet age company change time hair room
way sun light life lady sky nature place
night room and house way sky moment world one window hand word side
man it more cheeks orlando shoulders air
room head time hand queen night air lips window other heart man world
ice cheeks hair poet sky table book
it all death mind him that hand love was head orlando writing nature
man night time which one them light
said was had saw sight cried felt man heard that made as life word age
wrote truth company blushed time
man time word night woman river boy rest room green trees table book
other pen fortress clouds moment ship ground
window mind it light age man rest time air sun body life death sky
poets thought sea city earth door
""").replace('\n',' ').split(" "))
masc_similar_2

In [None]:
# make a list for all the words distinctly associated with women
for word in woman_list:
    similar_words = orl.similar(word)

In [None]:
fem_similar_2 = set(( """woman word hand man stone way turn little nature line moment nobleman
barrel porpoise creature rat sudden crown spirit gale
table woman head house room river matter air floor window hand legs
face trees sky other moment ship heart finger
door night woman light sky right queen man world king river rest
change spirit time moor one house room air
sky table ship woman air window light age city truth night river rest
time one moon room poet birds park
house world time head sky heart change window night air poet table age
other park river snow company streets cheeks
sun truth night company rest room window life sky age sea ship door
man gold woman court time moor colour
word moment time house man night woman sun lady table ship queen world
river snow rest change spirit one hair
ship man night woman ambassador time sun sky queen door world same
river rest change spirit one shoulders house room
""").replace('\n',' ').split(" "))
fem_similar_2

# similar words, third level

In [None]:
# now we are making a list of all words in masc_similar that are not in fem_similar
masc_similar_3 = []
for word in masc_similar_2:
    if word not in fem_similar_2:
        masc_similar_3.append(word)
masc_similar_3

In [None]:
fem_similar_3 = []
for word in fem_similar_2:
    if word not in masc_similar_2:
        fem_similar_3.append(word)
fem_similar_3

# similar words, fourth level

In [None]:
# make a list for all the words distinctly associated with women
for word in fem_similar_3:
    fem_similar_words_4 = orl.similar(word)

In [None]:
# now we are making a list of all words in masc_similar that are not in fem_similar
fem_similar_4 = []
for word in masc_similar_2:
    if word not in fem_similar_2:
        masc_similar_3.append(word)
masc_similar_3

In [None]:
# we need to rename the lists -- masc_similar for lists of similar words and
# masc_distinct for words that are similar but distinct. 

In [None]:
# now I want to think about ways of exploring this data. 
# make it into a series? a dataframe? 

https://towardsdatascience.com/from-dataframe-to-network-graph-bbb35c8ab675