# Stemming

In [1]:
import nltk

from nltk.stem.porter import PorterStemmer

In [2]:
porter_stemmer = PorterStemmer()

In [3]:
porter_stemmer.stem('caresses')

'caress'

In [4]:
sample_words = ['caresses', 'ponies', 'pony', 'cats', 'running', 'runner', 'climber', 'easily', 'quickly']

for word in sample_words:
    print(word, '-->', porter_stemmer.stem(word))

caresses --> caress
ponies --> poni
pony --> poni
cats --> cat
running --> run
runner --> runner
climber --> climber
easily --> easili
quickly --> quickli


In [5]:
# Porter2 Stemmer
from nltk.stem.snowball import SnowballStemmer

In [6]:
snowball_stemmer = SnowballStemmer(language='english')

In [7]:
for word in sample_words:
    print(word, '-->', snowball_stemmer.stem(word))

caresses --> caress
ponies --> poni
pony --> poni
cats --> cat
running --> run
runner --> runner
climber --> climber
easily --> easili
quickly --> quick


# Lemmatization

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [9]:
doc_object = nlp("I like to jog. I hated it in my childhood though.")

In [10]:
for token in doc_object:
    print(token, '-->', spacy.explain(token.pos_))

I --> pronoun
like --> verb
to --> particle
jog --> verb
. --> punctuation
I --> pronoun
hated --> verb
it --> pronoun
in --> adposition
my --> adjective
childhood --> noun
though --> adverb
. --> punctuation


In [11]:
# examine root words
for word in doc_object:
    print(word.text, '-->', word.lemma_, '-->', spacy.explain(word.lemma_))

I --> -PRON- --> None
like --> like --> None
to --> to --> None
jog --> jog --> None
. --> . --> punctuation mark, sentence closer
I --> -PRON- --> None
hated --> hat --> None
it --> -PRON- --> None
in --> in --> None
my --> -PRON- --> None
childhood --> childhood --> None
though --> though --> None
. --> . --> punctuation mark, sentence closer


In [12]:
for word in sample_words:
    obj = nlp(word)
    for token in obj:
        print(token.text, '-->', token.lemma_, '-->', token.pos_, '-->', spacy.explain(token.lemma_))

caresses --> caress --> NOUN --> None
ponies --> pony --> NOUN --> None
pony --> pony --> NOUN --> None
cats --> cat --> NOUN --> None
running --> run --> VERB --> None
runner --> runner --> NOUN --> None
climber --> climber --> NOUN --> None
easily --> easily --> ADV --> None
quickly --> quickly --> ADV --> None


In [13]:
!python -m spacy info


    [93mInfo about spaCy[0m

    spaCy version      2.0.16         
    Location           C:\Users\L00151142\.conda\envs\AI2_course\lib\site-packages\spacy
    Platform           Windows-10-10.0.18362-SP0
    Python version     3.7.6          
    Models                            



In [16]:
# create a function that allows you to input some text and then output
# the txt with its lemma, structure the output

# The brown fox is quick and he is jumping over the lazy dog

def lemmatize():
    text = input()
    nlp_text = nlp(text)
    for word in nlp_text:
        print(word.text, '-->', word.lemma_, '-->', word.pos_, '-->', spacy.explain(word.lemma_))
        
lemmatize()

The brown fox is quick and he is jumping over the lazy dog
The --> the --> DET --> None
brown --> brown --> ADJ --> None
fox --> fox --> NOUN --> None
is --> be --> VERB --> None
quick --> quick --> ADJ --> None
and --> and --> CCONJ --> None
he --> -PRON- --> PRON --> None
is --> be --> VERB --> None
jumping --> jump --> VERB --> None
over --> over --> ADP --> None
the --> the --> DET --> None
lazy --> lazy --> ADJ --> None
dog --> dog --> NOUN --> None


# Parts-of-seech (POS Tag)

In [21]:
for token in doc_object:
    print(f"{token.text:{20}} {token.pos_:{20}} {token.tag_:{20}} {spacy.explain(token.tag_):{20}}")

I                    PRON                 PRP                  pronoun, personal   
like                 VERB                 VBP                  verb, non-3rd person singular present
to                   PART                 TO                   infinitival to      
jog                  VERB                 VB                   verb, base form     
.                    PUNCT                .                    punctuation mark, sentence closer
I                    PRON                 PRP                  pronoun, personal   
hated                VERB                 VBD                  verb, past tense    
it                   PRON                 PRP                  pronoun, personal   
in                   ADP                  IN                   conjunction, subordinating or preposition
my                   ADJ                  PRP$                 pronoun, possessive 
childhood            NOUN                 NN                   noun, singular or mass
though               AD

In [32]:
doc_obj = nlp("Can you Google it?")

In [33]:
for token in doc_obj:
    print(f"{token.text:{20}} {token.pos_:{20}} {token.tag_:{20}} {spacy.explain(token.tag_):{20}}")

Can                  VERB                 MD                   verb, modal auxiliary
you                  PRON                 PRP                  pronoun, personal   
Google               VERB                 VB                   verb, base form     
it                   PRON                 PRP                  pronoun, personal   
?                    PUNCT                .                    punctuation mark, sentence closer


In [34]:
doc_obj = nlp("Google")

In [35]:
for token in doc_obj:
    print(f"{token.text:{20}} {token.pos_:{20}} {token.tag_:{20}} {spacy.explain(token.tag_):{20}}")

Google               PROPN                NNP                  noun, proper singular


In [36]:
POS_frequency = doc_object.count_by(spacy.attrs.POS)
POS_frequency

{96: 2, 99: 3, 84: 1, 83: 1, 85: 1, 91: 1, 93: 1, 94: 3}

In [38]:
doc_object.vocab[94].text

'PRON'

In [42]:
for tag_id, occurrences in POS_frequency.items():
    print(f"{tag_id:{10}} {occurrences:{10}} {doc_object.vocab[tag_id].text:{10}} {spacy.explain(doc_object.vocab[tag_id].text):{10}}")

        96          2 PUNCT      punctuation
        99          3 VERB       verb      
        84          1 ADP        adposition
        83          1 ADJ        adjective 
        85          1 ADV        adverb    
        91          1 NOUN       noun      
        93          1 PART       particle  
        94          3 PRON       pronoun   


# Rules-based matching

In [43]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [44]:
file_name = open("stop-words.txt")
sentence = file_name.read()
doc_object = nlp(sentence)

In [48]:
print(doc_object)

Words like "a" and "the" are called stop---words.
Sometimes this can be written as stop-words or stopwords.
Each stop word can be filtered from the text to be processed.
spaCy holds a built-in list of some 305 English stop--words.


In [None]:
token_match1 = [{"LOWER": "stopword"}]
token_match2 = [{"LOWER": "stopwords"}]
token_match3 = [{"LOWER": "stop"}, {"IS_PUNCT": True}, {"LOWER": "word"}]
token_match4 = [{"LOWER": "stop"}, {"IS_PUNCT": True}, {"LOWER": "word"}]
token_match5 = [{"LOWER": "stop"}, {"IS_PUNCT": True}, {"LOWER": "word"}]

matcher.add()

In [50]:
token_matches = matcher(doc_object)
print(token_matches)
for token in token_matches:
    print(f"{token}")

[]
