<a href="https://www.kaggle.com/code/faressayah/spacy-chapter-2-large-scale-data-analysis?scriptVersionId=117768781" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 1. Strings to Hashes

In [1]:
import spacy

# Show which spaCy release this notebook runs against.
spacy_version = spacy.__version__
print(spacy_version)

3.3.2


In [2]:
from spacy.lang.en import English

# Run a short text through the English language class.
nlp = English()
doc = nlp("I have a cat")

# String -> hash: indexing the string store with text yields an integer ID.
string_store = nlp.vocab.strings
cat_hash = string_store['cat']
print(cat_hash)

# Hash -> string: indexing the same store with that ID yields the text back.
# NOTE(review): this reverse lookup presumably relies on "cat" already being
# in the store because the text above was processed -- confirm before
# removing the `doc = nlp(...)` line.
cat_string = string_store[cat_hash]
print(cat_string)

5439657043933447811
cat


In [3]:
# Round-trip the label 'PERSON' through the string store: text -> ID -> text.
label_store = nlp.vocab.strings

person_hash = label_store['PERSON']
print(person_hash)

person_string = label_store[person_hash]
print(person_string)

380
PERSON


# 2. Creating a Doc

In [4]:
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# Each pair is (token text, whether a space follows that token).
word_space_pairs = [('spaCy', True), ('is', True), ('cool', False), ('!', False)]
words = [text for text, _ in word_space_pairs]
spaces = [followed_by_space for _, followed_by_space in word_space_pairs]

# Build the Doc directly from the token texts and spacing flags.
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


In [5]:
# Each pair is (token text, whether a space follows that token); the comma
# and the final "!" attach directly to the preceding word.
word_space_pairs = [
    ('Go', False),
    (',', True),
    ('get', True),
    ('started', False),
    ('!', False),
]
words = [text for text, _ in word_space_pairs]
spaces = [followed_by_space for _, followed_by_space in word_space_pairs]

doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


# 3. Docs, Spans, and Entities from Scratch

In [6]:
from spacy.lang.en import English
from spacy.tokens import Doc, Span

nlp = English()

# Assemble the Doc by hand from token texts and trailing-space flags.
words = ['I', 'like', 'David', 'Bowie']
spaces = [True, True, True, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Tokens 2 and 3 ("David", "Bowie") make up the span; `end` is exclusive.
span = Span(doc, start=2, end=4, label='PERSON')
print(span.text, span.label_)

# Make the hand-built span the document's only entity.
doc.ents = [span]

# List every entity as a (text, label) pair.
entity_pairs = []
for ent in doc.ents:
    entity_pairs.append((ent.text, ent.label_))
print(entity_pairs)

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Report every proper noun that is immediately followed by a verb.
for token in doc:
    if token.pos_ != "PROPN":
        continue
    next_index = token.i + 1
    # A proper noun in final position has no successor; without this bounds
    # check, doc[next_index] would raise IndexError on texts such as
    # "I like Berlin".
    if next_index < len(doc) and doc[next_index].pos_ == "VERB":
        print("Found proper noun before a verb: ", token.text)

Found proper noun before a verb:  Berlin


# 4. Inspecting Word Vectors

In [8]:
# Shell escape: download and install the `en_core_web_md` pipeline package
# through spaCy's CLI so it can be imported in the following cell.
# NOTE(review): `python3` is resolved from PATH -- confirm it is the same
# interpreter the notebook kernel runs on, otherwise the package lands in a
# different environment.
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [9]:
import en_core_web_md

# Load the medium pipeline through its installed package module.
nlp = en_core_web_md.load()

doc = nlp("Two bananas in pyjamas")

# Index 1 is the second token ("bananas"); print its vector.
bananas_token = doc[1]
bananas_vector = bananas_token.vector
print(bananas_vector)

[-0.6334     0.18981   -0.53544   -0.52658   -0.30001    0.30559
 -0.49303    0.14636    0.012273   0.96802    0.0040354  0.25234
 -0.29864   -0.014646  -0.24905   -0.67125   -0.053366   0.59426
 -0.068034   0.10315    0.66759    0.024617  -0.37548    0.52557
  0.054449  -0.36748   -0.28013    0.090898  -0.025687  -0.5947
 -0.24269    0.28603    0.686      0.29737    0.30422    0.69032
  0.042784   0.023701  -0.57165    0.70581   -0.20813   -0.03204
 -0.12494   -0.42933    0.31271    0.30352    0.09421   -0.15493
  0.071356   0.15022   -0.41792    0.066394  -0.034546  -0.45772
  0.57177   -0.82755   -0.27885    0.71801   -0.12425    0.18551
  0.41342   -0.53997    0.55864   -0.015805  -0.1074    -0.29981
 -0.17271    0.27066    0.043996   0.60107   -0.353      0.6831
  0.20703    0.12068    0.24852   -0.15605    0.25812    0.007004
 -0.10741   -0.097053   0.085628   0.096307   0.20857   -0.23338
 -0.077905  -0.030906   1.0494     0.55368   -0.10703    0.052234
  0.43407   -0.13926    0…   (output truncated)

# 5. Comparing Similarities

In [10]:
# Process both sentences, then score how similar the two documents are.
doc_1, doc_2 = nlp("It's a warm summer day"), nlp("It's sunny outside")

similarity = doc_1.similarity(doc_2)
print(similarity)

0.845685397409251


In [11]:
doc = nlp("TV and Books")

# Compare the first and third tokens ("TV" and "Books"), skipping "and".
token_1 = doc[0]
token_2 = doc[2]

similarity = token_1.similarity(token_2)
print(similarity)

0.18317238986492157


In [12]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Slice two phrases out of the same Doc by token index.
span_1, span_2 = doc[3:5], doc[12:15]

# Show both spans, one per line, before comparing them.
print(span_1, span_2, sep="\n")

similarity = span_1.similarity(span_2)
print(similarity)

great restaurant
really nice bar
0.7541285157203674


# 6. Debugging Patterns

In [13]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

text = (
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)
doc = nlp(text)

# "amazon" (any casing) followed by a title-cased proper noun.
pattern_1 = [
    {"LOWER": "amazon"},
    {"IS_TITLE": True, "POS": "PROPN"},
]
# "ad", a punctuation token, "free", then a noun -- e.g. "ad-free viewing".
pattern_2 = [
    {"LOWER": "ad"},
    {"IS_PUNCT": True},
    {"LOWER": "free"},
    {"POS": "NOUN"},
]

# Register each pattern under its own rule name.
matcher = Matcher(nlp.vocab)
for rule_name, rule_pattern in (("PATTERN_1", pattern_1), ("PATTERN_2", pattern_2)):
    matcher.add(rule_name, [rule_pattern])

# The match ID is a hash; look it up in the string store to get the rule name.
for match_id, start, end in matcher(doc):
    rule_label = doc.vocab.strings[match_id]
    matched_span = doc[start:end]
    print(rule_label, matched_span.text)

PATTERN_1 Amazon Prime
PATTERN_2 ad-free viewing
PATTERN_1 Amazon Prime
PATTERN_2 ad-free viewing
PATTERN_2 ad-free viewing
PATTERN_2 ad-free viewing
