In [1]:
import nltk
import matplotlib
%matplotlib inline
import networkx
import pandas as pd

In [2]:
from urllib.request import urlopen
my_url = "http://gutenberg.net.au/ebooks02/0200331.txt"
file = urlopen(my_url)
raw = file.read()
orlando = raw.decode()
o_tokens = nltk.word_tokenize(orlando)

In [3]:
o_text = o_tokens[872:-5]
orl = nltk.Text(o_text)
orl

<Text: He -- for there could be no doubt...>

In [4]:
orl.concordance("woman")

Displaying 25 of 123 matches:
charm -- all qualities which the old woman loved the more the more they failed 
yed her cheeks scarlet . For the old woman loved him . And the Queen , who knew
rladen with apples . The old bumboat woman , who was carrying her fruit to mark
a figure , which , whether boy 's or woman 's , for the loose tunic and trouser
 , for alas , a boy it must be -- no woman could skate with such speed and vigo
s not a handsbreadth off . She was a woman . Orlando stared ; trembled ; turned
mult of emotion , until now ? An old woman , he answered , all skin and bone . 
e for sea birds and some old country woman hacking at the ice in a vain attempt
h their heat , and pity the poor old woman who had no such natural means of tha
agan ; of this man 's beard and that woman 's skin ; of a rat that fed from her
of melancholy ; the sight of the old woman hobbling over the ice might be the c
en waters or night coming or the old woman or whatever it was , and would try t
anners wer

In [5]:
orl.similar("woman")

man moment night boy word world child pen ship door one room window
light little lady table book queen king


# cleaning the text (caps, punct, stops)

In [6]:
# lowercasing and removing punct
lower_no_punct = []
for word in orl:
    if word.isalpha():
        lower_no_punct.append(word.lower())

In [7]:
# removing stopwords
from nltk.corpus import stopwords
stops = stopwords.words('english')
no_stops = [word for word in lower_no_punct if word not in stops]

In [8]:
# lemmatizing
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
clean_text = []
for word in no_stops:
    word_lem = wordnet_lemmatizer.lemmatize(word)
    clean_text.append(word_lem)

In [9]:
orl = clean_text

# similar words, first level

to compute distinctive similarity, we:
- make a list of words similar to "man" or "woman" using Text.similar()
- find words similar to those words
- filter out the words that are shared among the two lists
- repeat as needed


In [12]:
# making list of similar words using ContextIndex class
idx = nltk.text.ContextIndex(orl)
# put words similar to 'woman' in a list
woman_sim_1 = idx.similar_words("woman")
woman_sim_1

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'always',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']

In [9]:
# put words similar to man in a list
man_sim_1 = idx.similar_words("man")
man_sim_1

['hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'always',
 'love',
 'grown',
 'never']

In [10]:
# loops through woman similar and filters out words shared with man similar
woman_disc_1 = []
for word in woman_sim_1:
    if word not in man_sim_1:
        woman_disc_1.append(word)
woman_disc_1

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']

In [11]:
# loops through man similar and filters out words shared with woman similar
man_disc_1 = []
for word in man_sim_1:
    if word not in woman_sim_1:
        man_disc_1.append(word)
man_disc_1

['hurry',
 'father',
 'window',
 'tongue',
 'carriage',
 'still',
 'even',
 'countrywoman',
 'indulged',
 'old',
 'fortune',
 'title',
 'ship',
 'writing',
 'fell',
 'become',
 'love',
 'grown',
 'never']

# second level

In [12]:
s = pd.Series(woman_disc_1)
s

0        reached
1           till
2         friend
3           word
4         moment
5            saw
6          could
7          cried
8         sailor
9            wit
10      scarcely
11     petticoat
12            go
13       servant
14    conclusion
dtype: object

## nodes of second level 'woman'

In [13]:
# loop that computes second level similar words for each word from woman_sim_1
woman_sim_2_nested = []
for word in woman_disc_1:
    woman_sim_2_nested.append(idx.similar_words(word)) # this returns nested lists

woman_sim_2_nested

[['woman', 'till'],
 ['woman', 'reached'],
 ['word', 'woman'],
 ['friend', 'woman', 'time', 'dog', 'window', 'roused'],
 ['explain',
  'thing',
  'another',
  'mixture',
  'toss',
  'way',
  'window',
  'first',
  'woman'],
 ['always', 'woman', 'met', 'madam', 'stood'],
 ['might',
  'used',
  'understood',
  'prophet',
  'bird',
  'thought',
  'come',
  'shape',
  'none',
  'would',
  'woman',
  'cried',
  'allied',
  'known',
  'laughing',
  'fool'],
 ['london', 'woman', 'could', 'ran'],
 ['wit', 'woman'],
 ['able', 'happiness', 'wisdom', 'sailor', 'woman', 'say'],
 ['woman', 'petticoat', 'indeed'],
 ['scarcely', 'woman'],
 ['woman', 'encumbrance', 'account', 'fly', 'leave', 'rust', 'make', 'sake'],
 ['woman', 'year'],
 ['woman']]

In [14]:
# un-nest the words
woman_sim_2 = [inner
    for outer in woman_sim_2_nested
        for inner in outer]
woman_sim_2

# filter out 'woman'
woman_sim_2_clean = []
for word in woman_sim_2:
    if word != 'woman':
        woman_sim_2_clean.append(word)

woman_sim_2_clean

['till',
 'reached',
 'word',
 'friend',
 'time',
 'dog',
 'window',
 'roused',
 'explain',
 'thing',
 'another',
 'mixture',
 'toss',
 'way',
 'window',
 'first',
 'always',
 'met',
 'madam',
 'stood',
 'might',
 'used',
 'understood',
 'prophet',
 'bird',
 'thought',
 'come',
 'shape',
 'none',
 'would',
 'cried',
 'allied',
 'known',
 'laughing',
 'fool',
 'london',
 'could',
 'ran',
 'wit',
 'able',
 'happiness',
 'wisdom',
 'sailor',
 'say',
 'petticoat',
 'indeed',
 'scarcely',
 'encumbrance',
 'account',
 'fly',
 'leave',
 'rust',
 'make',
 'sake',
 'year']

## adding edges to woman_sim_2

In [22]:
s = pd.Series(woman_sim_2_nested)
s

0                                         [woman, till]
1                                      [woman, reached]
2                                         [word, woman]
3            [friend, woman, time, dog, window, roused]
4     [explain, thing, another, mixture, toss, way, ...
5                    [always, woman, met, madam, stood]
6     [might, used, understood, prophet, bird, thoug...
7                           [london, woman, could, ran]
8                                          [wit, woman]
9         [able, happiness, wisdom, sailor, woman, say]
10                           [woman, petticoat, indeed]
11                                    [scarcely, woman]
12    [woman, encumbrance, account, fly, leave, rust...
13                                        [woman, year]
14                                              [woman]
dtype: object

In [59]:
for item in woman_sim_2_nested[0]:
    print(item)

woman
till


In [23]:
s = pd.Series(woman_sim_2_nested)
s

0                                         [woman, till]
1                                      [woman, reached]
2                                         [word, woman]
3            [friend, woman, time, dog, window, roused]
4     [explain, thing, another, mixture, toss, way, ...
5                    [always, woman, met, madam, stood]
6     [might, used, understood, prophet, bird, thoug...
7                           [london, woman, could, ran]
8                                          [wit, woman]
9         [able, happiness, wisdom, sailor, woman, say]
10                           [woman, petticoat, indeed]
11                                    [scarcely, woman]
12    [woman, encumbrance, account, fly, leave, rust...
13                                        [woman, year]
14                                              [woman]
dtype: object

In [51]:
for item in woman_sim_2_nested[0][0]:
   # if item != 'woman':
        print(item)

w
o
m
a
n


In [40]:
def remove_woman(woman_sim_2_nested[i]):
    new_list = []
    for word in woman_sim_2_nested[i]: # goes through each word in list within list
        if word != "woman":
            new_list.append(word)
    new_list

remove_woman(woman_sim_2_nested[0])

SyntaxError: invalid syntax (<ipython-input-40-486ec013c5ca>, line 1)

In [24]:
list_0 = []
for word in woman_sim_2_nested[0]:
    if word !="woman":
        list_0.append(word)
list_0

['till']

In [25]:
list_1 = []
for word in woman_sim_2_nested[1]:
    if word !="woman":
        list_1.append(word)
list_1

['reached']

In [26]:
list_2 = []
for word in woman_sim_2_nested[2]:
    if word !="woman":
        list_2.append(word)
list_2

['word']

In [27]:
list_3 = []
for word in woman_sim_2_nested[3]:
    if word !="woman":
        list_3.append(word)
list_3

['friend', 'time', 'dog', 'window', 'roused']

In [28]:
list_4 = []
for word in woman_sim_2_nested[4]:
    if word !="woman":
        list_4.append(word)
list_4

['explain', 'thing', 'another', 'mixture', 'toss', 'way', 'window', 'first']

In [29]:
list_5 = []
for word in woman_sim_2_nested[5]:
    if word !="woman":
        list_5.append(word)
list_5

['always', 'met', 'madam', 'stood']

In [103]:
df = pd.DataFrame(woman_sim_2_nested)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,woman,till,,,,,,,,,,,,,,
1,woman,reached,,,,,,,,,,,,,,
2,word,woman,,,,,,,,,,,,,,
3,friend,woman,time,dog,window,roused,,,,,,,,,,
4,explain,thing,another,mixture,toss,way,window,first,woman,,,,,,,
5,always,woman,met,madam,stood,,,,,,,,,,,
6,might,used,understood,prophet,bird,thought,come,shape,none,would,woman,cried,allied,known,laughing,fool
7,london,woman,could,ran,,,,,,,,,,,,
8,wit,woman,,,,,,,,,,,,,,
9,able,happiness,wisdom,sailor,woman,say,,,,,,,,,,


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [105]:
s = pd.Series(woman_sim_2_nested)
s

0                                         [woman, till]
1                                      [woman, reached]
2                                         [word, woman]
3            [friend, woman, time, dog, window, roused]
4     [explain, thing, another, mixture, toss, way, ...
5                    [always, woman, met, madam, stood]
6     [might, used, understood, prophet, bird, thoug...
7                           [london, woman, could, ran]
8                                          [wit, woman]
9         [able, happiness, wisdom, sailor, woman, say]
10                           [woman, petticoat, indeed]
11                                    [scarcely, woman]
12    [woman, encumbrance, account, fly, leave, rust...
13                                        [woman, year]
14                                              [woman]
dtype: object

In [44]:
the_list = [['blue'], ['blue', 'red', 'black'], ['green', 'yellow'],['orange'], ['white', 'gray']]
filters = ['blue', 'white']
final_l = [i for i in the_list if not any(word in filters for word in i)]
final_l


[['green', 'yellow'], ['orange']]

In [64]:
woman_sim_2_nested

['reached',
 'till',
 'friend',
 'word',
 'moment',
 'saw',
 'could',
 'cried',
 'sailor',
 'wit',
 'scarcely',
 'petticoat',
 'go',
 'servant',
 'conclusion']