In [3]:
import bs4 as bs 
import urllib.request
import re
import nltk

scrapped_data =  urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In the script above, we first download the Wikipedia article using the urlopen method of the request class of the urllib library. We then read the article content and parse it using an object of the BeautifulSoup class. Wikipedia stores the text content of the article inside p tags. We use the find_all function of the BeautifulSoup object to fetch all the contents from the paragraph tags of the article.
Finally, we join all the paragraphs together and store the scraped article in article_text variable for later use.

In [10]:
# cleaning the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub(r'\s+', ' ', processed_article)

# preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# removing stop-words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In the script above, we convert all the text to lowercase and then remove all the digits, special characters, and extra spaces from the text. After preprocessing, we are only left with the words.

The Word2Vec model is trained on a collection of words. First, we need to convert our article into sentences. We use nltk.sent_tokenize utility to convert our article into sentences. To convert sentences into words, we use nltk.word_tokenize utility. As a last preprocessing step, we remove all the stop words from the text.

In [11]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words,min_count=2)

In [12]:
vocabulary = word2vec.wv.vocab
print(vocabulary)

s.Vocab object at 0x000001CBA62F9EB0>, 'individual': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9B20>, 'moravec': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9700>, 'paradox': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F94C0>, 'take': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9A30>, 'named': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F92E0>, 'hans': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9850>, 'stated': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9D00>, 'easy': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9A60>, 'exhibit': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F98E0>, 'performance': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9FD0>, 'impossible': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9160>, 'give': <gensim.models.keyedvectors.Vocab object at 0x000001CBA62F9DC0>, 'old': <gensim.models.keyedvectors.Vocab object at 0x0

In [13]:
v1 = word2vec.wv['artificial'] # The vector v1 contains the vector representation for the word "artificial". By default, a hundred dimensional vector is created by Gensim Word2Vec.

In [14]:
sim_words = word2vec.wv.most_similar('intelligence')

In [15]:
print(sim_words)

[('described', 0.46398288011550903), ('ethics', 0.45365941524505615), ('use', 0.44582992792129517), ('ai', 0.4442201554775238), ('field', 0.4385843276977539), ('symbolic', 0.4367824196815491), ('many', 0.4224812984466553), ('given', 0.4097159504890442), ('dangerous', 0.4095437228679657), ('human', 0.39762768149375916)]
