In [None]:
pip install nltk



In [None]:
para = """Some theories, most notably special and general relativity, suggest that suitable geometries of spacetime or specific types of motion in space might allow time travel into the past and future if these geometries or motions were possible.[34]: 499  In technical papers, physicists discuss the possibility of closed timelike curves, which are world lines that form closed loops in spacetime, allowing objects to return to their own past. There are known to be solutions to the equations of general relativity that describe spacetimes which contain closed timelike curves, such as Gödel spacetime, but the physical plausibility of these solutions is uncertain.
Many in the scientific community believe that backward time travel is highly unlikely to be possible. Any theory that would allow time travel would introduce potential problems of causality.[35] The classic example of a problem involving causality is the "grandfather paradox," which postulates travelling to the past and intervening in the conception of one's ancestors (causing the death of an ancestor before conception being frequently cited). Some physicists, such as Novikov and Deutsch, suggested that these sorts of temporal paradoxes can be avoided through the Novikov self-consistency principle or a variation of the many-worlds interpretation with interacting worlds."""

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
#tokenization : corpus to documents
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
documents = nltk.sent_tokenize(para)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
print(documents)

['Some theories, most notably special and general relativity, suggest that suitable geometries of spacetime or specific types of motion in space might allow time travel into the past and future if these geometries or motions were possible.', '[34]:\u200a499\u200a In technical papers, physicists discuss the possibility of closed timelike curves, which are world lines that form closed loops in spacetime, allowing objects to return to their own past.', 'There are known to be solutions to the equations of general relativity that describe spacetimes which contain closed timelike curves, such as Gödel spacetime, but the physical plausibility of these solutions is uncertain.', 'Many in the scientific community believe that backward time travel is highly unlikely to be possible.', 'Any theory that would allow time travel would introduce potential problems of causality.', '[35] The classic example of a problem involving causality is the "grandfather paradox," which postulates travelling to the 

In [None]:
stemmer = PorterStemmer()

In [None]:
stemmer.stem('spacetime')

'spacetim'

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
#Text-cleaning
import re
corpus = []
for i in range(len(documents)):
  review = re.sub('[^a-zA-Z]',' ', documents[i])
  review = review.lower()
  corpus.append(review)

In [None]:
corpus

['some theories  most notably special and general relativity  suggest that suitable geometries of spacetime or specific types of motion in space might allow time travel into the past and future if these geometries or motions were possible ',
 '           in technical papers  physicists discuss the possibility of closed timelike curves  which are world lines that form closed loops in spacetime  allowing objects to return to their own past ',
 'there are known to be solutions to the equations of general relativity that describe spacetimes which contain closed timelike curves  such as g del spacetime  but the physical plausibility of these solutions is uncertain ',
 'many in the scientific community believe that backward time travel is highly unlikely to be possible ',
 'any theory that would allow time travel would introduce potential problems of causality ',
 '     the classic example of a problem involving causality is the  grandfather paradox   which postulates travelling to the past 

In [None]:
#stemming
for i in corpus:
  words = nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words('english')):
      print(stemmer.stem(word))

theori
notabl
special
gener
rel
suggest
suitabl
geometri
spacetim
specif
type
motion
space
might
allow
time
travel
past
futur
geometri
motion
possibl
technic
paper
physicist
discuss
possibl
close
timelik
curv
world
line
form
close
loop
spacetim
allow
object
return
past
known
solut
equat
gener
rel
describ
spacetim
contain
close
timelik
curv
g
del
spacetim
physic
plausibl
solut
uncertain
mani
scientif
commun
believ
backward
time
travel
highli
unlik
possibl
theori
would
allow
time
travel
would
introduc
potenti
problem
causal
classic
exampl
problem
involv
causal
grandfath
paradox
postul
travel
past
interven
concept
one
ancestor
caus
death
ancestor
concept
frequent
cite
physicist
novikov
deutsch
suggest
sort
tempor
paradox
avoid
novikov
self
consist
principl
variat
mani
world
interpret
interact
world


In [None]:
# Lemmetization
for i in range(len(documents)):
  r = re.sub('[^a-zA-Z]',' ', documents[i])
  r = r.lower()
  r = r.split()
  r = [lemmatizer.lemmatize(word) for word in r if not word in set(stopwords.words('english'))]
  r = ' '.join(r)
  corpus.append(r)

In [None]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True,ngram_range=(2,3))

In [None]:
x = cv.fit_transform(corpus)

In [None]:
cv.vocabulary_

{'some theories': 338,
 'theories most': 428,
 'most notably': 206,
 'notably special': 215,
 'special and': 361,
 'and general': 20,
 'general relativity': 132,
 'relativity suggest': 317,
 'suggest that': 374,
 'that suitable': 398,
 'suitable geometries': 380,
 'geometries of': 136,
 'of spacetime': 241,
 'spacetime or': 351,
 'or specific': 255,
 'specific types': 367,
 'types of': 484,
 'of motion': 235,
 'motion in': 208,
 'in space': 152,
 'space might': 344,
 'might allow': 204,
 'allow time': 0,
 'time travel': 446,
 'travel into': 470,
 'into the': 171,
 'the past': 418,
 'past and': 272,
 'and future': 18,
 'future if': 130,
 'if these': 150,
 'these geometries': 438,
 'geometries or': 138,
 'or motions': 253,
 'motions were': 213,
 'were possible': 493,
 'some theories most': 339,
 'theories most notably': 429,
 'most notably special': 207,
 'notably special and': 216,
 'special and general': 362,
 'and general relativity': 21,
 'general relativity suggest': 134,
 'relativi

In [None]:
x[0].toarray()

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(ngram_range=(3,3),max_features=2)
X = cv.fit_transform(corpus)

In [None]:
corpus

['some theories  most notably special and general relativity  suggest that suitable geometries of spacetime or specific types of motion in space might allow time travel into the past and future if these geometries or motions were possible ',
 '           in technical papers  physicists discuss the possibility of closed timelike curves  which are world lines that form closed loops in spacetime  allowing objects to return to their own past ',
 'there are known to be solutions to the equations of general relativity that describe spacetimes which contain closed timelike curves  such as g del spacetime  but the physical plausibility of these solutions is uncertain ',
 'many in the scientific community believe that backward time travel is highly unlikely to be possible ',
 'any theory that would allow time travel would introduce potential problems of causality ',
 '     the classic example of a problem involving causality is the  grandfather paradox   which postulates travelling to the past 

In [1]:
pip install gensim



In [2]:
#Word2Vec
import gensim

In [3]:
from gensim.models import Word2Vec, KeyedVectors

In [4]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')
vec_king = wv['king']



In [5]:
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [18]:
#cosine-similarity
wv.most_similar('cricket')

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745735168457),
 ('Test_cricket', 0.8094819188117981),
 ('Twenty##_cricket', 0.8068488240242004),
 ('Twenty##', 0.7624265551567078),
 ('Cricket', 0.75413978099823),
 ('cricketer', 0.7372578382492065),
 ('twenty##', 0.7316356897354126),
 ('T##_cricket', 0.7304614186286926),
 ('West_Indies_cricket', 0.6987985968589783)]

In [20]:
wv.most_similar('corn')

[('soybean', 0.7784005403518677),
 ('soybeans', 0.7631983757019043),
 ('wheat', 0.7077561616897583),
 ('corn_crop', 0.699353814125061),
 ('Corn', 0.6878867149353027),
 ('soy_bean', 0.6768984794616699),
 ('corn_soybean', 0.6687990427017212),
 ('sweet_corn', 0.6664301753044128),
 ('soyabeans', 0.6518656611442566),
 ('grain', 0.6460866928100586)]

In [9]:
wv.most_similar('rupee')

[('Rupee', 0.8221039772033691),
 ('Indian_Rupee', 0.7288974523544312),
 ('rupee_appreciation', 0.6984492540359497),
 ('rupees', 0.6948850750923157),
 ('depreciating_rupee', 0.6792574524879456),
 ('rupee_depreciation', 0.6582969427108765),
 ('appreciating_rupee', 0.650733470916748),
 ('Philippine_peso', 0.6499502658843994),
 ('rupee_appreciating', 0.6421096324920654),
 ('Sensex', 0.6407861113548279)]

In [17]:
wv.similarity("wheels","car")

0.38558435

In [21]:
vec = wv['king'] - wv['man'] + wv['woman']

In [23]:
wv.most_similar([vec])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]