In [58]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [59]:
article_text

'\nArtificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.[1] Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".[2]\nAs machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect.[3] A quip in Tesler\'s Theorem says "AI is whatever hasn\'t been done yet."[4] For instance, optical character recognition is frequently excluded from things considered to be AI,[5] having become a routine technology.[6] Modern machi

In [60]:
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

processed_article

' artificial intelligence ai sometimes called machine intelligence is intelligence demonstrated by machines unlike the natural intelligence displayed by humans and animals leading ai textbooks define the field as the study of intelligent agents any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals colloquially the term artificial intelligence is often used to describe machines or computers that mimic cognitive functions that humans associate with the human mind such as learning and problem solving as machines become increasingly capable tasks considered to require intelligence are often removed from the definition of ai a phenomenon known as the ai effect a quip in tesler s theorem says ai is whatever hasn t been done yet for instance optical character recognition is frequently excluded from things considered to be ai having become a routine technology modern machine capabilities generally classified as ai include succe

In [61]:
# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_sentences

[' artificial intelligence ai sometimes called machine intelligence is intelligence demonstrated by machines unlike the natural intelligence displayed by humans and animals leading ai textbooks define the field as the study of intelligent agents any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals colloquially the term artificial intelligence is often used to describe machines or computers that mimic cognitive functions that humans associate with the human mind such as learning and problem solving as machines become increasingly capable tasks considered to require intelligence are often removed from the definition of ai a phenomenon known as the ai effect a quip in tesler s theorem says ai is whatever hasn t been done yet for instance optical character recognition is frequently excluded from things considered to be ai having become a routine technology modern machine capabilities generally classified as ai include succ

In [62]:
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

all_words

[['artificial',
  'intelligence',
  'ai',
  'sometimes',
  'called',
  'machine',
  'intelligence',
  'is',
  'intelligence',
  'demonstrated',
  'by',
  'machines',
  'unlike',
  'the',
  'natural',
  'intelligence',
  'displayed',
  'by',
  'humans',
  'and',
  'animals',
  'leading',
  'ai',
  'textbooks',
  'define',
  'the',
  'field',
  'as',
  'the',
  'study',
  'of',
  'intelligent',
  'agents',
  'any',
  'device',
  'that',
  'perceives',
  'its',
  'environment',
  'and',
  'takes',
  'actions',
  'that',
  'maximize',
  'its',
  'chance',
  'of',
  'successfully',
  'achieving',
  'its',
  'goals',
  'colloquially',
  'the',
  'term',
  'artificial',
  'intelligence',
  'is',
  'often',
  'used',
  'to',
  'describe',
  'machines',
  'or',
  'computers',
  'that',
  'mimic',
  'cognitive',
  'functions',
  'that',
  'humans',
  'associate',
  'with',
  'the',
  'human',
  'mind',
  'such',
  'as',
  'learning',
  'and',
  'problem',
  'solving',
  'as',
  'machines',
  'be

In [63]:
# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [64]:
len(all_words[0])

5577

In [65]:
all_words

[['artificial',
  'intelligence',
  'ai',
  'sometimes',
  'called',
  'machine',
  'intelligence',
  'intelligence',
  'demonstrated',
  'machines',
  'unlike',
  'natural',
  'intelligence',
  'displayed',
  'humans',
  'animals',
  'leading',
  'ai',
  'textbooks',
  'define',
  'field',
  'study',
  'intelligent',
  'agents',
  'device',
  'perceives',
  'environment',
  'takes',
  'actions',
  'maximize',
  'chance',
  'successfully',
  'achieving',
  'goals',
  'colloquially',
  'term',
  'artificial',
  'intelligence',
  'often',
  'used',
  'describe',
  'machines',
  'computers',
  'mimic',
  'cognitive',
  'functions',
  'humans',
  'associate',
  'human',
  'mind',
  'learning',
  'problem',
  'solving',
  'machines',
  'become',
  'increasingly',
  'capable',
  'tasks',
  'considered',
  'require',
  'intelligence',
  'often',
  'removed',
  'definition',
  'ai',
  'phenomenon',
  'known',
  'ai',
  'effect',
  'quip',
  'tesler',
  'theorem',
  'says',
  'ai',
  'whatever'

In [66]:
import collections
print(len([item for item, count in collections.Counter(all_words[0]).items() if count > 1]))

925


### Word2Vec

In [67]:
# This is essentially a neural network step
from gensim.models import Word2Vec
word2vec = Word2Vec(all_words, min_count=2)

In [87]:
vocabulary = word2vec.wv.vocab
#print(vocabulary.keys())

In [88]:
len(vocabulary)

925

In [89]:
v1 = word2vec.wv['ai']

In [90]:
v1.shape

(100,)

In [83]:
v1

array([ 3.4873465e-03, -6.2373251e-04, -2.5275813e-03, -7.3889662e-03,
       -5.7522370e-04,  2.8161628e-03, -6.2342668e-03, -3.6799246e-03,
       -4.3299128e-04, -4.7693183e-03, -2.4923435e-03,  3.7649409e-03,
       -2.1130131e-03,  5.2022147e-03,  5.0608207e-05,  3.0269888e-03,
        6.8915254e-03, -3.4287286e-03, -6.1120623e-04, -5.9978152e-04,
       -4.8128744e-03, -6.4760712e-03,  7.0837885e-03,  2.7297318e-03,
       -1.9480509e-03, -1.5556695e-03,  2.8677818e-03,  6.2769023e-03,
       -4.5190598e-03,  3.2176808e-03,  5.3597242e-03, -2.1805209e-03,
       -7.2445832e-03, -6.3092250e-04,  5.5856020e-03, -9.5955556e-04,
       -2.6261234e-03, -6.1142631e-03,  1.9068280e-03,  4.1500446e-03,
        1.1259889e-03,  1.4845583e-03, -2.4051494e-03,  7.6083187e-04,
        4.4587133e-03, -7.2197113e-03,  5.5891164e-03, -3.4054688e-03,
        1.7444246e-03, -5.2433568e-03,  3.9381543e-03, -2.7215062e-03,
        1.2919869e-03,  6.1294604e-03,  7.1436586e-03,  1.7748007e-03,
      

In [84]:
# similarity
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('processing', 0.4462946057319641),
 ('ai', 0.39518052339553833),
 ('list', 0.37809574604034424),
 ('human', 0.36974620819091797),
 ('problem', 0.3468899726867676),
 ('rights', 0.3421890139579773),
 ('induced', 0.3362908363342285),
 ('many', 0.3335104286670685),
 ('difficult', 0.3330446183681488),
 ('within', 0.328824907541275)]