In [1]:
import bs4 as bs
import urllib.request
import re
import nltk

In [2]:
# download punkt and stopwords

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# read the artificial intelligence page from wikipedia

html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = html.read()

parsed = bs.BeautifulSoup(article, 'lxml')

paragraphs = parsed.find_all('p')

article_text = ""
for p in paragraphs:
    article_text += p.text
    
article_text[:250]

'\n\nIn computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as th'

In [4]:
# lowercase, replace non-words and non-space whitespace

text = article_text.lower()
text = re.sub('[^a-zA-Z]', ' ', text)
text = re.sub(r'\s+', ' ', text)

text[:250]

' in computer science artificial intelligence ai sometimes called machine intelligence is intelligence demonstrated by machines in contrast to the natural intelligence displayed by humans and animals leading ai textbooks define the field as the study '

In [5]:
# tokenize the page

all_words = nltk.word_tokenize(text)

In [6]:
# remove stopwords

stopwords = nltk.corpus.stopwords.words('english')
all_words = [token for token in all_words if token not in stopwords]

In [7]:
# first 10 words 

all_words[:10]

['computer',
 'science',
 'artificial',
 'intelligence',
 'ai',
 'sometimes',
 'called',
 'machine',
 'intelligence',
 'intelligence']

In [8]:
# total # of words

len(all_words)

8482

In [9]:
# import Word2Vec

from gensim.models import Word2Vec

# create Word2Vec, no words with freq < 2
word2vec = Word2Vec([all_words], min_count=2)

In [10]:
len(word2vec.wv.vocab)

1345

In [11]:
# vocabulary maps each word to a tensor

word2vec.wv.vocab

683f9b0>,
 'cover': <gensim.models.keyedvectors.Vocab at 0x1d47683f9e8>,
 'interest': <gensim.models.keyedvectors.Vocab at 0x1d47683fa20>,
 'area': <gensim.models.keyedvectors.Vocab at 0x1d47683fa58>,
 'concern': <gensim.models.keyedvectors.Vocab at 0x1d47683fa90>,
 'representations': <gensim.models.keyedvectors.Vocab at 0x1d47683fac8>,
 'retrieval': <gensim.models.keyedvectors.Vocab at 0x1d47683fb00>,
 'interpretation': <gensim.models.keyedvectors.Vocab at 0x1d47683fb38>,
 'support': <gensim.models.keyedvectors.Vocab at 0x1d47683fb70>,
 'discovery': <gensim.models.keyedvectors.Vocab at 0x1d47683fba8>,
 'need': <gensim.models.keyedvectors.Vocab at 0x1d47683fbe0>,
 'predictions': <gensim.models.keyedvectors.Vocab at 0x1d47683fc18>,
 'change': <gensim.models.keyedvectors.Vocab at 0x1d47683fc50>,
 'choices': <gensim.models.keyedvectors.Vocab at 0x1d47683fc88>,
 'value': <gensim.models.keyedvectors.Vocab at 0x1d47683fcc0>,
 'available': <gensim.models.keyedvectors.Vocab at 0x1d47683fcf8>,


In [12]:
# get the tensor for a specific word

word2vec.wv['artificial']

array([-4.2200703e-03,  4.2646267e-03,  1.3033608e-03,  2.3771303e-04,
        7.9396332e-04,  5.3310101e-03, -1.0929332e-03,  5.5002836e-03,
        4.4052843e-03, -4.5145601e-03, -6.5544578e-03, -1.5000853e-03,
       -1.9468633e-03,  2.6910456e-03,  1.2082589e-02,  4.6160184e-03,
        6.0704385e-04,  2.5428620e-03,  4.6923752e-03, -6.9670030e-03,
       -7.8677256e-03,  1.2521048e-02, -2.7099906e-03, -2.2589155e-03,
       -3.1183278e-03, -2.2527194e-03, -5.5147214e-03,  2.3510803e-03,
        3.0719298e-03, -4.9976553e-03,  5.9758229e-03, -1.8805023e-03,
       -4.1475147e-03,  2.7430493e-03,  3.9193816e-03, -6.9486341e-03,
        3.2794962e-03,  3.0808768e-04,  1.9147416e-03, -3.0170912e-03,
        7.2778459e-03,  1.1024425e-03, -5.0140019e-03, -4.8578111e-03,
       -2.8422954e-03,  6.0934632e-04,  2.2722674e-03,  1.7705726e-03,
       -7.0803482e-03, -2.7537126e-05, -4.6176864e-03, -8.5542724e-03,
        5.1947962e-03, -6.5082838e-03, -4.9285372e-03, -8.3690666e-04,
      

In [13]:
# how many dimensions in the tensor?

len(word2vec.wv['artificial'])

100

In [14]:
# find words most similar to "intelligence"

word2vec.wv.most_similar('intelligence')

[('ai', 0.8141664266586304),
 ('human', 0.7835574150085449),
 ('many', 0.7350308895111084),
 ('artificial', 0.7282922267913818),
 ('researchers', 0.7233484387397766),
 ('research', 0.7007607817649841),
 ('learning', 0.6979109048843384),
 ('problem', 0.6950682997703552),
 ('well', 0.693013072013855),
 ('networks', 0.6913335919380188)]

In [15]:
# find words most similar to "intelligence"

word2vec.wv.most_similar('ai')

[('intelligence', 0.8141664266586304),
 ('human', 0.7721558213233948),
 ('machines', 0.7628246545791626),
 ('theory', 0.7381200194358826),
 ('use', 0.7366983890533447),
 ('problem', 0.7361205816268921),
 ('networks', 0.7299946546554565),
 ('also', 0.7205549478530884),
 ('artificial', 0.7199418544769287),
 ('humans', 0.7092365622520447)]