In [1]:
import nltk
import matplotlib
%matplotlib inline
import networkx
# import pandas as pd

In [2]:
from urllib.request import urlopen
my_url = "http://gutenberg.net.au/ebooks02/0200331.txt"
file = urlopen(my_url)
raw = file.read()
orlando = raw.decode()
o_tokens = nltk.word_tokenize(orlando)

In [3]:
o_text = o_tokens[872:-5]

In [4]:
orl = nltk.Text(o_text)
orl

<Text: He -- for there could be no doubt...>

# cleaning the text (caps, punct, stops)

In [5]:
orl[:41]

['He',
 '--',
 'for',
 'there',
 'could',
 'be',
 'no',
 'doubt',
 'of',
 'his',
 'sex',
 ',',
 'though',
 'the',
 'fashion',
 'of',
 'the',
 'time',
 'did',
 'something',
 'to',
 'disguise',
 'it',
 '--',
 'was',
 'in',
 'the',
 'act',
 'of',
 'slicing',
 'at',
 'the',
 'head',
 'of',
 'a',
 'Moor',
 'which',
 'swung',
 'from',
 'the',
 'rafters']

In [6]:
# lowercasing and removing punct
lower_no_punct = []
for word in orl:
    if word.isalpha():
        lower_no_punct.append(word.lower())

In [7]:
from nltk.corpus import stopwords
stops = stopwords.words('english')
# removing stops
no_stops = [word for word in lower_no_punct if word not in stops]

In [8]:
no_stops[:10]

['could',
 'doubt',
 'sex',
 'though',
 'fashion',
 'time',
 'something',
 'disguise',
 'act',
 'slicing']

In [9]:
# lemmatizing
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize("slicings")

'slicing'

In [10]:
clean_text = []
for word in no_stops:
    word_lem = wordnet_lemmatizer.lemmatize(word)
    clean_text.append(word_lem)
    
clean_text[:30]

['could',
 'doubt',
 'sex',
 'though',
 'fashion',
 'time',
 'something',
 'disguise',
 'act',
 'slicing',
 'head',
 'moor',
 'swung',
 'rafter',
 'colour',
 'old',
 'football',
 'le',
 'shape',
 'one',
 'save',
 'sunken',
 'cheek',
 'strand',
 'two',
 'coarse',
 'dry',
 'hair',
 'like',
 'hair']

In [11]:
orl = clean_text

# similar words, first level

to compute distinctive similarity, we:
- make a list of words similar to "man" or "woman" using Text.similar()
- find words similar to those words
- filter out the words that are shared among the two lists
- repeat as needed


In [12]:
# to make these lists of distictive similar words automatically, 
# use ContextIndex class
idx = nltk.text.ContextIndex(orl)
# put words similar to woman in a list
woman_sim_1 = idx.similar_words("woman")
woman_sim_1
# woman_sim_1_str = " ".join(woman_sim_1)
# woman_sim_1_str

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']

In [13]:
# put words similar to man in a list
man_sim_1 = idx.similar_words("man")
man_sim_1
man_sim_1_str = " ".join(man_sim_1)
man_sim_1_str

'hurry father window tongue carriage still even countrywoman indulged old fortune title ship writing fell become always love grown never'

In [14]:
# loops through woman similar and filters out words shared with man similar
woman_disc_1 = []
for word in woman_sim_1:
    if word not in man_sim_1:
        woman_disc_1.append(word)
woman_disc_1

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']

In [15]:
# loops through man similar and filters out words shared with woman similar
man_disc_1 = []
for word in man_sim_1:
    if word not in woman_sim_1:
        man_disc_1.append(word)
man_disc_1

['hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'love',
 'grown',
 'never']

# similar words, second level

In [16]:
# loop that computes similar words for each word from woman_sim_1
woman_sim_2_nested = []
for word in woman_sim_1:
    woman_sim_2_nested.append(idx.similar_words(word)) # this returns nested lists

woman_sim_2_nested

# list comprehension that flattens nested list
# woman_sim_2 = [inner
#     for outer in woman_sim_2_nested
#         for inner in outer]

# woman_sim_2_str =" ".join(set(woman_sim_2))
# woman_sim_2_str

[['woman', 'till'],
 ['woman', 'reached'],
 ['word', 'woman'],
 ['friend', 'woman', 'time', 'dog', 'window', 'roused'],
 ['explain',
  'thing',
  'another',
  'mixture',
  'toss',
  'way',
  'window',
  'first',
  'woman'],
 ['always', 'woman', 'met', 'madam', 'stood'],
 ['become', 'love', 'man', 'grown', 'saw', 'woman'],
 ['might',
  'used',
  'understood',
  'prophet',
  'bird',
  'thought',
  'come',
  'shape',
  'none',
  'would',
  'woman',
  'cried',
  'allied',
  'known',
  'laughing',
  'fool'],
 ['london', 'woman', 'could', 'ran'],
 ['wit', 'woman'],
 ['able', 'happiness', 'wisdom', 'sailor', 'woman', 'say'],
 ['woman', 'petticoat', 'indeed'],
 ['scarcely', 'woman'],
 ['woman', 'encumbrance', 'account', 'fly', 'leave', 'rust', 'make', 'sake'],
 ['woman', 'year'],
 ['woman']]

In [17]:
import pandas as pd

In [18]:
woman = pd.Series(woman_disc_1)
woman

0        reached
1           till
2         friend
3           word
4         moment
5            saw
6          could
7          cried
8         sailor
9            wit
10      scarcely
11     petticoat
12            go
13       servant
14    conclusion
dtype: object

In [19]:
woman.describe()

count          15
unique         15
top       reached
freq            1
dtype: object

In [20]:
len(set(woman_sim_2))

NameError: name 'woman_sim_2' is not defined

In [None]:
# creating a loop to find similar words to man_sim_1
man_sim_2_nested = []
for word in man_sim_1:
    man_sim_2_nested.append(idx.similar_words(word))
man_sim_2_nested

# collapsing the nested list
man_sim_2 = [inner
    for outer in man_sim_2_nested
        for inner in outer]

man_sim_2_str =" ".join(set(man_sim_2))
man_sim_2_str

In [None]:
len(set(man_sim_2))

In [None]:
len(man_sim_2)

In [None]:
# loops through woman similar and filters out words shared with man similar
woman_disc_2 = []
for word in woman_sim_2:
    if word not in man_sim_2:
        woman_disc_2.append(word)
woman_disc_2

In [None]:
len(set(woman_disc_2))

In [None]:
# loops through woman similar and filters out words shared with man similar
man_disc_2 = []
for word in man_sim_2:
    if word not in woman_sim_2:
        man_disc_2.append(word)
man_disc_2

# Third Level

In [None]:
# 3rd level words similar to woman_disc_2
woman_sim_3_nested = []
for word in woman_sim_2:
    woman_sim_3_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
woman_sim_3 = [inner
    for outer in woman_sim_3_nested
        for inner in outer]

woman_sim_3

In [None]:
# 3rd level words similar to man_disc_2
man_sim_3_nested = []
for word in man_sim_2:
    man_sim_3_nested.append(idx.similar_words(word)) # this returns nested lists

# list comprehension that flattens nested list
man_sim_3 = [inner
    for outer in man_sim_3_nested
        for inner in outer]

man_sim_3

In [None]:
# loops through woman similar and filters out words shared with man similar
woman_disc_3 = []
for word in woman_sim_3:
    if word not in man_sim_3:
        woman_disc_3.append(word)
woman_disc_3

In [None]:
# loops through woman similar and filters out words shared with man similar
man_disc_3 = []
for word in man_sim_3:
    if word not in woman_sim_3:
        man_disc_3.append(word)
man_disc_3

In [None]:
set(woman_disc_3)

In [None]:
woman_disc3_str = " ".join(set(woman_disc_3))
woman_disc3_str

In [None]:
man_disc3_str = " ".join(set(man_disc_3))
man_disc3_str

# working with pandas

In [None]:
# now I want to think about ways of exploring this data. 
# make it into a series? a dataframe? 

https://towardsdatascience.com/from-dataframe-to-network-graph-bbb35c8ab675