Models that enable spaCy to predict linguistic attributes in context

* Part-of-speech tags
* Syntactic dependencies
* Named entities

In [None]:
!python -m spacy download en_core_web_sm

spaCy English models: https://spacy.io/models/en

In [None]:
import spacy

# Load the small English pipeline; `nlp` must be a callable pipeline object,
# not a string, for `nlp(text)` below to work.
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_)
    # Uncomment to also show the lemma, dependency label and syntactic head:
    # print(token.text, token.lemma_, token.pos_, token.dep_,  token.head.text)

#### POS Values

* ADJ: adjective
* ADP: adposition
* ADV: adverb
* AUX: auxiliary
* CCONJ: coordinating conjunction
* DET: determiner
* INTJ: interjection
* NOUN: noun
* NUM: numeral
* PART: particle
* PRON: pronoun
* PROPN: proper noun
* PUNCT: punctuation
* SCONJ: subordinating conjunction
* SYM: symbol
* VERB: verb
* X: other

#### Use Cases of POS Tagging

* Text Cleaning
* Feature Engineering tasks
* Word sense disambiguation

https://spacy.io/api/doc#init

In [None]:
# IPython help lookup for the `doc` object created in the cell above.
?doc

In [None]:
import spacy

# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("Eskwelabs is an edtech startup based in Manila that provides online courses on data science and analytics to help people in underserved communities in the Philippines, such as stay-at-home moms and young people without university degrees, find better work in the digital age. Since its launch in 2019, Eskwelabs taught more than 3,000 people, about 90% of whom found better-paying work within 90 days of completing its course.")

# Iterate over the tokens (iterating a Doc directly yields its tokens).
# NOTE(review): swap `doc` for `doc.sents` or `doc.ents` to step through
# sentences or named entities instead — both also expose `.text`.
for token in doc:
    print('---------------------------')
    print(token.text)

In [None]:
from spacy import displacy

# Draw the dependency parse of `doc` (processed in the previous cell).
displacy.render(doc, style="dep")

### Text Generation: Markov Chain

In [None]:
!pip install markovify

In [None]:
# Dependencies for the Markov-chain section: regex cleaning (re), the
# text generator (markovify) and the NLTK Gutenberg corpus reader.
import re
import markovify
import nltk
from nltk.corpus import gutenberg
# NOTE(review): `warnings` is not used in the cells visible here — confirm whether it is still needed.
import warnings
# Fetch the 'gutenberg' corpus data that the `gutenberg` reader above reads from.
nltk.download('gutenberg')

#### Inspect Gutenberg Corpus

In [None]:
# Show the file identifiers available in the Gutenberg corpus reader.
print(gutenberg.fileids())

#### Import plays as raw text

In [None]:
# Read each play as one raw string via the corpus reader's `raw()` method.
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')

# Preview the first 100 characters of each play.
print('\nRaw:\n', hamlet[:100])
print('\nRaw:\n', macbeth[:100])
print('\nRaw:\n', caesar[:100])

#### Data Cleaning

In [None]:
def text_cleaner(text):
    """Strip markup noise from a raw text and normalize its whitespace.

    Steps, applied in order:
      1. Replace every double dash ``--`` with a single space.
      2. Remove bracketed spans ``[...]`` (non-greedy; ``.`` does not match
         newlines, so only brackets opened and closed on one line are removed).
      3. Remove numbers delimited by word boundaries (integers and decimals),
         together with any whitespace / minus sign directly in front of them.
      4. Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string: the pattern contains `\[` and `\]`, which are invalid escape
    # sequences in a normal string literal and trigger a SyntaxWarning on
    # recent Python versions.
    text = re.sub(r'[\[].*?[\]]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b', '', text)
    return ' '.join(text.split())

In [None]:
# For each play: drop any "Chapter <n>" headings first, then run the shared
# cleaner over the result.
hamlet = text_cleaner(re.sub(r'Chapter \d+', '', hamlet))
macbeth = text_cleaner(re.sub(r'Chapter \d+', '', macbeth))
caesar = text_cleaner(re.sub(r'Chapter \d+', '', caesar))

#### Initialize spaCy

In [None]:
# Load the small English pipeline and parse each cleaned play into a Doc,
# so that sentence boundaries (`.sents`) are available in the next cell.
nlp = spacy.load("en_core_web_sm")
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)
caesar_doc = nlp(caesar)

In [None]:
# Rebuild each play as one space-separated string of its sentences,
# skipping sentences whose text is a single character or shorter.
hamlet_sents = ' '.join([sent.text for sent in hamlet_doc.sents if len(sent.text) > 1])
macbeth_sents = ' '.join([sent.text for sent in macbeth_doc.sents if len(sent.text) > 1])
caesar_sents = ' '.join([sent.text for sent in caesar_doc.sents if len(sent.text) > 1])

# Join the plays with a space as well; plain `+` would fuse the last sentence
# of one play onto the first word of the next.
shakespeare_sents = ' '.join([hamlet_sents, macbeth_sents, caesar_sents])

# Preview only: the combined corpus is far too long to print in full.
print(shakespeare_sents[:500])

#### Create text generator using markovify

In [None]:
# Build a Markov-chain model over the combined corpus; state_size=2 means
# each next word is chosen from the two preceding words.
generator_1 = markovify.Text(shakespeare_sents, state_size=2)

#### We will randomly generate five sentences

In [None]:
# Header, then five generated sentences from the plain model, one per line.
print("Longer sentences:")
print(*(generator_1.make_sentence() for _ in range(5)), sep="\n")

In [None]:
# Header, then five generated sentences capped at 100 characters, one per line.
print("Shorter sentences:")

print(*(generator_1.make_short_sentence(max_chars=100) for _ in range(5)), sep="\n")

#### Improving the results with a POS-aware model (POSifiedText) built on spaCy

In [None]:
class POSifiedText(markovify.Text):
    """markovify text model whose chain items are ``text::POS`` strings."""

    def word_split(self, sentence):
        """Run ``sentence`` through ``nlp`` and return one ``text::POS`` string per token."""
        tagged = []
        for token in nlp(sentence):
            tagged.append(f"{token.text}::{token.pos_}")
        return tagged

    def word_join(self, words):
        """Keep the part of each item before its first ``::`` and join those parts with spaces."""
        return ' '.join(tagged.split('::')[0] for tagged in words)


# Same corpus and state size as the plain model, but with POS-tagged items.
generator_2 = POSifiedText(shakespeare_sents, state_size=2)

In [None]:
# Header, then five generated sentences from the POS-aware model, one per line.
print("Longer sentences:")

print(*(generator_2.make_sentence() for _ in range(5)), sep="\n")

In [None]:
# Header, then five sentences of at most 100 characters from the POS-aware
# model, one per line.
print("Shorter sentences:")
print(*(generator_2.make_short_sentence(max_chars=100) for _ in range(5)), sep="\n")

#### Exercise

#### Number 1

In [None]:
# Exercise 1 — TODO: replace every ____ placeholder below with your own code;
# the cell will not run until all placeholders are filled in.
import spacy

# Load the "en_core_web_sm" pipeline
nlp = ____

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = ____

# Print the document text
print(____.____)

#### Number 2

In [None]:
# Exercise 2 — TODO: replace every ____ placeholder below with your own code;
# the cell will not run until all placeholders are filled in.
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = ____

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = ____.____
    token_pos = ____.____
    token_dep = ____.____
    # This is for formatting only: left-align the three values in
    # columns of width 12, 10 and 10.
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")