In [1]:
# python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")

text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'be', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun GPE
earlier this week DATE


In [2]:
## disable part-of-speech tagging and lemmatization

## to disable part of speech tagging and lemmatization
# Note: English doesn't include a morphologizer
# nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
# nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"])

In [3]:
## use the senter and disable parser
# nlp = spacy.load("en_core_web_sm")
# nlp.disable_pipe("parser")
# nlp.enable_pipe("senter")

In [4]:
## switch from rule-based to lookup lemmatization
## For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish pipelines, 
## you can switch from the default rule-based lemmatizer to a lookup lemmatizer:
## Requirements: pip install spacy-lookups-data
# nlp = spacy.load("en_core_web_sm")
# nlp.remove_pipe("lemmatizer")
# nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()

In [5]:
## Disable everything except NER
## For the non-transformer models, the ner component is independent, so you can disable everything else
# nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

In [6]:
## in the transformer models, ner listens to the transformer component, 
## so you can disable all components related tagging, parsing, and lemmatization.
# nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

In [7]:
## Move NER to the end of the pipeline
##For access to POS and LEMMA features in an entity_ruler, 
##move ner to the end of the pipeline after attribute_ruler and lemmatizer:

## load without NER
# nlp = spacy.load("en_core_web_sm", exclude=["ner"])

## source NER from the same pipeline package as the last component
# nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))

## insert the entity ruler
# nlp.add_pipe("entity_ruler", before="ner")

In [8]:
# from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
# # from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

# config = {
#     'threshold': 0.5,
#     'model': DEFAULT_SINGLE_TEXTCAT_MODEL,
# }

# model = nlp.add_pipe('textcat', config=config)

In [9]:
#doc = nlp("this is sentence.")
# predicted = model(doc)
# print(predicted)
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [10]:
# model.add_label('pos')
# model.add_label('neg')

In [11]:
# from spacy.lang.en import English


# nlp = English()
# docs = nlp(text)
# docs

In [12]:
# tokens = [w.text for w in docs]
# print(tokens)

In [13]:
# # sent = nlp.create_pipe('sentencizer')
# nlp.add_pipe('sentencizer')
# docs = nlp(text)

In [14]:
# docs = nlp(text)
# print(nlp.pipe_names)

# sentences = [s.text for s in docs.sents]
# print(len(sentences), sentences)

In [15]:
from spacy.lang.en.stop_words import STOP_WORDS

## remove stop_words
clean = [w for w in doc if not w.is_stop]
print(clean)

[Sebastian, Thrun, started, working, self, -, driving, cars, Google, 2007, ,, people, outside, company, took, seriously, ., “, tell, senior, CEOs, major, American, car, companies, shake, hand, turn, away, worth, talking, ,, ”, said, Thrun, ,, interview, Recode, earlier, week, .]


In [16]:
# nlp.add_pipe('lemmatizer')
# lem = nlp("run runs ran running runners")
# _lem = [w.lemma_ for w in lem]
# print(_lem)

In [17]:
## Par-of-speech (pos) tagging

## noun, adj identified as an object, verb as action

from spacy import displacy
# import en_core_web_sm

# nlp = en_core_web_sm.load()

docs = nlp("This is really helpful for quickly extracting information from text," 
           "since you cannot can't quickly pick out important topics or indentify key sections of text")

# entities = [(i, i.label_, i.label) for i in docs.ents]
entities = [(w.text, w.pos_) for w in docs]
entities

[('This', 'DET'),
 ('is', 'AUX'),
 ('really', 'ADV'),
 ('helpful', 'ADJ'),
 ('for', 'ADP'),
 ('quickly', 'ADV'),
 ('extracting', 'VERB'),
 ('information', 'NOUN'),
 ('from', 'ADP'),
 ('text', 'NOUN'),
 (',', 'PUNCT'),
 ('since', 'SCONJ'),
 ('you', 'PRON'),
 ('can', 'AUX'),
 ('not', 'PART'),
 ('ca', 'AUX'),
 ("n't", 'PART'),
 ('quickly', 'ADV'),
 ('pick', 'VERB'),
 ('out', 'ADP'),
 ('important', 'ADJ'),
 ('topics', 'NOUN'),
 ('or', 'CCONJ'),
 ('indentify', 'VERB'),
 ('key', 'ADJ'),
 ('sections', 'NOUN'),
 ('of', 'ADP'),
 ('text', 'NOUN')]

In [18]:
lemm = [(w.text, w.lemma_) for w in docs if w.pos_.startswith('V') or w.pos_ == 'AUX']
lemm

[('is', 'be'),
 ('extracting', 'extract'),
 ('can', 'can'),
 ('ca', 'ca'),
 ('pick', 'pick'),
 ('indentify', 'indentify')]

In [19]:
## entity Detection, tagging words or n-gram of words with entity recognition 
## such as places, people, organizations,...
## language input string text, 

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

## GPE: specific location, CARDINAL: important number

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Zip, 'PERSON', 380),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox, 'NORP', 381),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [20]:
## display visualize output text with colored entities (style='ent')
displacy.render(nytimes, style='ent', jupyter=True)

In [21]:
## Dependency Parsing, 
## determine the meaning of a sentence by how constructed itself based on individual words
docp = nlp (" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
   print(f'raw: {chunk.text}, root: {chunk.root.text}, dependency: {chunk.root.dep_}, header: {chunk.root.head.text}')


raw: pursuit, root: pursuit, dependency: pobj, header: In
raw: a wall, root: wall, dependency: pobj, header: of
raw: President Trump, root: Trump, dependency: nsubj, header: ran


In [22]:
displacy.render(docp, style='dep', jupyter=True)

In [23]:
## word vector space representation

## A word vector is a numeric representation of a word that commuicates its relationship to other words.

mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[-0.462305   -0.97013503 -0.3536405   0.28740364 -0.01573728 -0.24513936
 -1.215326   -0.8796606  -0.33882028 -0.85366464  1.1009694  -0.40891293
  0.22952707  0.32104927 -0.16520308  0.19346984  0.18104246 -0.25050682
 -0.86570626 -0.5158702   0.13842583 -1.1441295  -1.2371405  -0.31056306
 -0.77198493 -0.7328714   0.821449    0.46671125  0.46151486 -0.3285221
  0.5737759   0.5633069   0.81746995 -0.1666174  -0.31984073  0.10492463
 -1.0577446   0.35842416  0.47972912 -0.29047596 -0.07571032  1.112559
 -0.21457072  1.0962675  -0.1150732   0.00683655  0.3471359   0.7762994
 -0.18421805 -0.4036425   0.42345917  0.25998825  0.43403518 -0.3259907
  1.1417992  -0.21782616  0.6406765   0.25259757 -0.17306823 -0.4783872
  0.89445263  0.16820912 -0.01807833  0.81608206 -0.42206132 -0.67849445
  0.26927558 -0.5631349   0.6785864   1.0213488   1.2156711   0.1348517
 -0.53032076 -0.13513318  0.05589192 -0.2488123   0.3784064  -0.9752467
 -0.5351178   0.12411818  0.12952325  0.01232275 -0.5

## Text classification
Combination models, sklearn, spacy, supervised classification, prediction label (feedback)
![text classification](data/text_classification.jpg)

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [25]:
df_amazon = pd.read_csv('data/amazon_alexa.tsv', sep='\t')
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [26]:
display(df_amazon.shape, df_amazon.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


(3150, 5)

None

In [27]:
from string import punctuation
from sklearn.model_selection import train_test_split


stop_words = STOP_WORDS

def clean_tokens(topic):
    
    table = str.maketrans('', '', punctuation)
    tokens = nlp(topic)
    tokens = [w.lemma_ for w in tokens]
    #tokens = [w.text for w in tokens]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if len(w) > 1]
        
    return tokens


## customize transformer using Spacy
class predictors(TransformerMixin):
    
    def transform(self, X, **transform_params):
        tokens = [topic.strip().lower() for topic in X]
        return tokens
    
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    
    def get_params(self, deep=True):
        return {}
    
#generate BoW matrix (word to matrix vector space) with spacy_tokenizer (clean_tokens) as tokenizer    
bow_vector = CountVectorizer(tokenizer=clean_tokens, ngram_range=(1,1))

#normalize word vector, and representation of word importance in the corpus.
tfidf_vector = TfidfVectorizer(tokenizer=clean_tokens)

# split train and test set
X = df_amazon['verified_reviews']
y = df_amazon['feedback']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1)
display(Xtrain.shape, Xtest.shape)

(2835,)

(315,)

In [28]:
from sklearn.linear_model import LogisticRegression

## training with spacy 
classifier = LogisticRegression()
pipe = Pipeline([('cleaner', predictors()),
                ('vectorizer', bow_vector),
                ('classifier', classifier)])

pipe.fit(Xtrain, ytrain)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f87a08c4d10>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function clean_tokens at 0x7f87a08b8c20>)),
                ('classifier', LogisticRegression())])

In [29]:
from sklearn import metrics as m


yhat = pipe.predict(Xtest)

print(f'accuracy= {m.accuracy_score(ytest, yhat)}, precision= {m.precision_score(ytest, yhat)}, recall= {m.recall_score(ytest, yhat)}')


accuracy= 0.9301587301587302, precision= 0.9342105263157895, recall= 0.993006993006993
