In [1]:
import nltk
import urllib
import bs4 as bs
from gensim.models import Word2Vec

### Pulling in a wikipedia article to analyse

In [2]:
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming').read()

### Parsing the data/ creating BeautifulSoup object

In [3]:
soup = bs.BeautifulSoup(source,'lxml')

### Preprocessing the data

In [4]:
from functools import reduce
import re
from nltk.corpus import stopwords

In [11]:
text = reduce(lambda text, paragraph: text + paragraph.text, 
              soup.find_all('p'), 
              '')

In [12]:
text = text.lower()
text = re.sub(r'\[[0-9]*\]', ' ', text) # removes [number]
text = re.sub(r'[\(\)\[\]\{|\}\<|\>\*\+\-\'@#%&,;:!?=`"]', ' ', text) # removes puntuation except dots
text = re.sub(r'\d', ' ', text) # removes numbers
text = re.sub(r'\s+', ' ', text) # removes extra spaces
text = re.sub(r"'s", ' is', text)
text = re.sub(r"'re", ' are', text)
text = re.sub(r"won't", 'will not', text)

### Preparing the dataset and removing stopwords

In [13]:
sentences = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(text)]

for i in range(len(sentences)):
    sentences[i] = \
        [word for word in sentences[i] if word not in stopwords.words('english')]

### Training the Word2Vec model

In [14]:
model = Word2Vec(sentences, min_count=1)

### Finding Word Vectors and similars

In [15]:
model.wv['warming']

array([-7.4098203e-03, -4.1346266e-03, -1.6169812e-03,  8.9467637e-04,
        3.5840755e-03,  4.9035517e-03, -4.6747611e-03,  6.2784663e-04,
        4.1934382e-03, -4.5940764e-03, -4.3141283e-03,  4.1263504e-03,
       -6.4010252e-05, -1.9118217e-04, -3.5028839e-03,  3.3158951e-03,
       -2.3725651e-04, -5.4052059e-04,  4.5400675e-04,  4.8665907e-03,
        1.5086887e-03,  1.2235566e-03,  5.9111531e-05,  4.8548309e-03,
       -3.2006437e-03,  2.0776337e-03,  3.4812717e-03,  5.2382457e-03,
        6.8374993e-03,  3.8444886e-03,  2.1106100e-03,  3.6832080e-03,
        6.3533648e-03,  1.2515329e-05,  3.3861450e-03, -4.7877529e-03,
        1.3543242e-04,  7.5833914e-03, -5.1302514e-03,  5.7997219e-03,
       -4.8654522e-03, -2.0872043e-03,  2.1366104e-03, -6.4219086e-04,
        1.3138225e-03,  5.9242328e-03, -3.1133760e-03, -4.6584983e-03,
        4.0514546e-04, -5.4671182e-03,  6.7492807e-03, -2.4109746e-03,
        4.2333622e-03,  5.0360560e-03, -5.8572604e-03, -2.7994041e-03,
      

In [16]:
model.wv.most_similar('warming')

[('rising', 0.4035898447036743),
 ('energy', 0.38041236996650696),
 ('global', 0.37321072816848755),
 ('would', 0.36459946632385254),
 ('change', 0.3640798032283783),
 ('climate', 0.3638940453529358),
 ('chemical', 0.3402023911476135),
 ('greenhouse', 0.3310290277004242),
 ('including', 0.32724010944366455),
 ('pre', 0.3223811984062195)]