In [3]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [28]:
import spacy

# Sample passage that the cells below analyse
text = "Albert Einstein, a theoretical physicist, was born in Germany. He developed the theory of relativity, which is one of the two pillars of modern physics."

# Small English pipeline; loaded once here and applied to the passage
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Sentence segmentation: print each detected sentence on its own line
print("Segmented Sentences:")
for sentence in doc.sents:
    print(sentence.text)

Segmented Sentences:
Albert Einstein, a theoretical physicist, was born in Germany.
He developed the theory of relativity, which is one of the two pillars of modern physics.


In [29]:
# Tokenization: gather the text of every token in `doc`, then count them
tokens = [tok.text for tok in doc]
token_count = len(tokens)

print("Tokens:", tokens)
print(f"Number of Tokens: {token_count}")

Tokens: ['Albert', 'Einstein', ',', 'a', 'theoretical', 'physicist', ',', 'was', 'born', 'in', 'Germany', '.', 'He', 'developed', 'the', 'theory', 'of', 'relativity', ',', 'which', 'is', 'one', 'of', 'the', 'two', 'pillars', 'of', 'modern', 'physics', '.']
Number of Tokens: 30


In [41]:
# Stop-word removal: keep the text of tokens not flagged as stop words.
# Only `is_stop` is checked, so punctuation tokens remain in the result.
filtered_tokens = [tok.text for tok in doc if not tok.is_stop]

separator = "-" * 30
print("Original Text:", text)
print(separator)
print("Filtered Tokens:", filtered_tokens)

Original Text: Albert Einstein, a theoretical physicist, was born in Germany. He developed the theory of relativity, which is one of the two pillars of modern physics.
------------------------------
Filtered Tokens: ['Albert', 'Einstein', ',', 'theoretical', 'physicist', ',', 'born', 'Germany', '.', 'developed', 'theory', 'relativity', ',', 'pillars', 'modern', 'physics', '.']


In [42]:
# Lemmatization: print each token next to the base form spaCy assigns it
for token in doc:
    surface, lemma = token.text, token.lemma_
    print(f"{surface} -> {lemma}")

Albert -> Albert
Einstein -> Einstein
, -> ,
a -> a
theoretical -> theoretical
physicist -> physicist
, -> ,
was -> be
born -> bear
in -> in
Germany -> Germany
. -> .
He -> he
developed -> develop
the -> the
theory -> theory
of -> of
relativity -> relativity
, -> ,
which -> which
is -> be
one -> one
of -> of
the -> the
two -> two
pillars -> pillar
of -> of
modern -> modern
physics -> physic
. -> .


In [43]:
# Part-of-speech tagging: aligned table of token, coarse POS tag, and the
# human-readable description returned by spacy.explain for that tag
header = f"{'Token':<15} {'POS':<10} {'Explanation'}"
print(header)
print("-" * 40)
for token in doc:
    pos = token.pos_
    print(f"{token.text:<15} {pos:<10} {spacy.explain(pos)}")

Token           POS        Explanation
----------------------------------------
Albert          PROPN      proper noun
Einstein        PROPN      proper noun
,               PUNCT      punctuation
a               DET        determiner
theoretical     ADJ        adjective
physicist       NOUN       noun
,               PUNCT      punctuation
was             AUX        auxiliary
born            VERB       verb
in              ADP        adposition
Germany         PROPN      proper noun
.               PUNCT      punctuation
He              PRON       pronoun
developed       VERB       verb
the             DET        determiner
theory          NOUN       noun
of              ADP        adposition
relativity      NOUN       noun
,               PUNCT      punctuation
which           PRON       pronoun
is              AUX        auxiliary
one             NUM        numeral
of              ADP        adposition
the             DET        determiner
two             NUM        numeral
pillars         NOUN       noun
of              ADP        adposition
modern          ADJ        adjective
physics         NOUN       noun
.               PUNCT      punctuation

In [44]:
# Named-entity recognition: print each entity span found in `doc` with its label
for ent in doc.ents:
    span_text, label = ent.text, ent.label_
    print(span_text, label)

Albert Einstein PERSON
Germany GPE
one CARDINAL
two CARDINAL
