In [1]:
# Import NLTK, fetch the WordNet corpus data, then import the WordNet reader.
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /Users/dbevz/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


DOCUMENTATION FOR WORDNET
https://www.nltk.org/howto/wordnet.html

# Какие синонимы соответствуют слову "data" из wordnet?

https://en.wikipedia.org/wiki/Synonym_ring

In [28]:
# A synset (synonym ring) is a group of data elements that are considered
# semantically equivalent for the purposes of information retrieval.
# Look up the synset named "data.n.01" and display it.
data = wordnet.synset("data.n.01")
data

Synset('data.n.01')

In [17]:
# Hypernyms are words with a broader meaning than the given word
# (https://en.wikipedia.org/wiki/Hyponymy_and_hypernymy).
data.hypernyms()

[Synset('collection.n.01')]

In [18]:
# A hyponym is a word or phrase whose semantic field is included within that of another word.
data.hyponyms()

[Synset('accounting_data.n.01'),
 Synset('metadata.n.01'),
 Synset('raw_data.n.01')]

In [19]:
# Helpers for walking the hypernym/hyponym relations; the following cells pass
# them to Synset.closure(). Named functions are used instead of lambdas bound
# to names so they carry docstrings and readable names in tracebacks.
def hyper(x):
    """Return the hypernyms of ``x`` (the result of ``x.hypernyms()``)."""
    return x.hypernyms()


def hypo(x):
    """Return the hyponyms of ``x`` (the result of ``x.hyponyms()``)."""
    return x.hyponyms()

In [29]:
# Follow the hypernym relation transitively, starting from `data`.
[*data.closure(hyper)]

[Synset('collection.n.01'),
 Synset('group.n.01'),
 Synset('abstraction.n.06'),
 Synset('entity.n.01')]

In [30]:
# Follow the hyponym relation transitively, starting from `data`.
[*data.closure(hypo)]

[Synset('accounting_data.n.01'),
 Synset('metadata.n.01'),
 Synset('raw_data.n.01')]

#### Hypernyms and hyponyms are not synonyms! Use lemmas instead.
https://en.wikipedia.org/wiki/Lemma_(morphology)

In [44]:
# A lemma (plural lemmas or lemmata) is the canonical, dictionary, or citation
# form of a set of words (headword).
# wordnet.synsets() is like wordnet.synset() but returns every synset for the word.
# The result gets its own name: reusing `data` would replace the Synset that the
# earlier cells call .hypernyms()/.hyponyms()/.closure() on with a list, so
# re-running those cells would fail.
data_synsets = wordnet.synsets("data")
for synset in data_synsets:
    for lemma in synset.lemmas():
        print(lemma.name())

data
information
datum
data_point






# Выберите ближайшее слово к "run" из wordnet

In [77]:
# "sit" has no noun sense in WordNet, so only the first verb sense of each word is compared.
run, drive, jump, sit, climb = map(
    wordnet.synset,
    ('run.v.01', 'drive.v.01', 'jump.v.01', 'sit.v.01', 'climb.v.01'),
)

In [78]:
# A taxonomy is a categorization or classification based on discrete sets.
# path_similarity returns a score denoting how similar two word senses are, based on the
# shortest path that connects the senses in the is-a (hypernym/hyponym) taxonomy.
for candidate in (run, drive, jump, sit, climb):
    print(candidate, run.path_similarity(candidate))

Synset('run.v.01') 1.0
Synset('drive.v.01') 0.1111111111111111
Synset('jump.v.01') 0.16666666666666666
Synset('sit.v.01') 0.2
Synset('climb.v.01') 0.2


Sit or Climb?

In [79]:
# Leacock-Chodorow similarity: a score denoting how similar two word senses are, based on
# the shortest path that connects the senses and the maximum depth of the taxonomy in
# which the senses occur.
for candidate in (run, drive, jump, sit, climb):
    print(candidate, run.lch_similarity(candidate))

Synset('run.v.01') 3.258096538021482
Synset('drive.v.01') 1.0608719606852628
Synset('jump.v.01') 1.466337068793427
Synset('sit.v.01') 1.6486586255873816
Synset('climb.v.01') 1.6486586255873816


Sit or Climb?

In [80]:
# Wu-Palmer similarity: a score denoting how similar two word senses are, based on the
# depth of the two senses in the taxonomy and that of their Least Common Subsumer
# (most specific ancestor node).
for candidate in (run, drive, jump, sit, climb):
    print(candidate, run.wup_similarity(candidate))

Synset('run.v.01') 1.0
Synset('drive.v.01') 0.2
Synset('jump.v.01') 0.2857142857142857
Synset('sit.v.01') 0.3333333333333333
Synset('climb.v.01') 0.25


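### "Sit" is the closest word to "run" (path and Leacock-Chodorow similarity tie "sit" with "climb"; Wu-Palmer breaks the tie in favour of "sit")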

In [3]:
# Sample passage analysed by the tokenization, frequency, NER and dependency cells below.
TEXT = """Cristiano Ronaldo dos Santos Aveiro GOIH ComM (born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Serie A club Juventus and captains the Portugal national team. Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballons d'Or[note 3] and four European Golden Shoes, both of which are records for a European player. He has won 30 major trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship, and one UEFA Nations League title. Ronaldo holds the records for the most goals (130) and assists (41) in the history of the UEFA Champions League. He is one of the few recorded players to have made over 1,000 professional career appearances and has scored over 700 senior career goals for club and country. He is also the second player to score 100 international goals, and the first European to achieve the feat."""

# Какое самое популярное слово в приведенном тексте

In [84]:
# Fetch the stop-word lists and the Punkt tokenizer data used by the cells below.
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab' package — confirm
# against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/dbevz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dbevz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
# English stop words; a later cell appends punctuation marks to this collection.
stop_words = stopwords.words('english')

In [109]:
# Treat common punctuation marks as stop words as well.
# Only marks that are not already present are added, so re-running this cell
# does not keep growing `stop_words` with duplicate entries.
punctiations = [',', '.', '(', ')']
for mark in punctiations:
    if mark not in stop_words:
        stop_words.append(mark)

In [111]:
# Lower-case the text, split it into word tokens, then drop every token found in stop_words.
text_tokens = word_tokenize(TEXT.lower())
text_tokens_cleaned = list(filter(lambda tok: tok not in stop_words, text_tokens))
print("Length of all tokens: ", len(text_tokens))
print("Length of tokens w/o stop words: ", len(text_tokens_cleaned))

Length of all tokens:  189
Length of tokens w/o stop words:  107


In [112]:
# Single most frequent token in the unfiltered token list.
nltk.FreqDist(text_tokens).most_common(1)

[('the', 12)]

# Какие самые популярные слова после правильной токенизации и приведения к нижнему регистру

In [113]:
# Ten most frequent tokens once stop words have been removed.
nltk.FreqDist(text_tokens_cleaned).most_common(10)

[('one', 4),
 ('european', 4),
 ('uefa', 4),
 ('ronaldo', 3),
 ('player', 3),
 ('career', 3),
 ('league', 3),
 ('goals', 3),
 ('professional', 2),
 ('club', 2)]

# К какой форме будет приведено слово "spinning" после стемминга, если указать, что это adjective

In [17]:
# Stemming strips affixes from a word and returns its stem, e.g. "working" -> "work".
# Affix kinds: prefix, suffix, circumfix, duplifix, infix, interfix, transfix, simulfix.
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem('spinning'))
# The question treats "spinning" as an adjective, so the part of speech is
# passed explicitly; without `pos` the lemmatizer treats the word as a noun.
print(lemmatizer.lemmatize('spinning', pos='a'))

spin
spinning


# Укажите в правильном порядке персон, которые были в тексте

In [117]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 430 kB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047106 sha256=89c4ac87a437a6ff46dcd91286a6a62991a59354b5c4cbae42419e41136d16ea
  Stored in directory: /private/var/folders/sc/ww936dxd6tb4cl55pkz06tys5llj4c/T/pip-ephem-wheel-cache-45pt_mb2/wheels/ee/4d/f7/563214122be1540b5f9197b52cb3ddb9c4a8070808b22d5a84
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [1]:
import spacy
# Load the small English pipeline installed by the download step above.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline over the whole text and print every named entity with its
# character offsets and label.
doc = nlp(TEXT)
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Cristiano Ronaldo 0 17 PERSON
dos Santos 18 28 PERSON
ComM 41 45 GPE
5 February 1985 52 67 DATE
Portuguese 74 84 NORP
Serie A 136 143 ORG
Juventus 149 157 ORG
Portugal 175 183 GPE
one 268 271 CARDINAL
Ronaldo 309 316 PERSON
five 325 329 CARDINAL
3 348 349 CARDINAL
four 355 359 CARDINAL
European 360 368 NORP
European 415 423 NORP
30 443 445 CARDINAL
seven 486 491 CARDINAL
five 507 511 CARDINAL
UEFA Champions Leagues 512 534 ORG
one 536 539 CARDINAL
UEFA European Championship 540 566 ORG
one 572 575 CARDINAL
UEFA Nations League 576 595 ORG
Ronaldo 603 610 PERSON
130 649 652 CARDINAL
41 667 669 CARDINAL
the UEFA Champions League 689 714 ORG
one 722 725 CARDINAL
over 1,000 767 777 CARDINAL
over 700 825 833 CARDINAL
second 891 897 ORDINAL
100 914 917 CARDINAL
first 947 952 ORDINAL
European 953 961 NORP


# Какие прилагательные-модификаторы связаны со словом «футболист» в первом предложении?

In [15]:
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# Parse only the first sentence (split off with NLTK's sent_tokenize) and draw
# its dependency tree inline in the notebook.
first_sentence = sent_tokenize(TEXT)[0]
doc = nlp(first_sentence)
render_options = {'distance': 140}
displacy.render(doc, style="dep", jupyter=True, options=render_options)