# Tutorial: Text Classification in Python Using spaCy
[source](https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/)<br>
[Github sobre receitas](https://github.com/gdmarmerola/whats-cooking)

In [1]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [2]:
# sentence tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


In [3]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 305
First ten stop words: ['else', 'whole', 'very', 'does', 'amount', 'but', 'had', 'can', 'of', 'everyone', 'thus', 'between', 'to', 'will', 'bottom', 'cannot', 'at', 'somehow', 'both', 'whether']


In [4]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)
print()
print(doc)

Filtered Sentence: [When, learning, data, science, ,, n't, discouraged, !, 
, Challenges, setbacks, n't, failures, ,, 're, journey, ., You, 've, got, !]

When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!


In [5]:
# Implementing lemmatization
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs run
running run
runner runner


In [6]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is VERB
well ADV
that ADJ
ends VERB
well ADV
. PUNCT


In [7]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(New York City, 'GPE', 382),
 (Tuesday, 'DATE', 388),
 (At least 285, 'CARDINAL', 394),
 (September, 'DATE', 388),
 (Brooklyn, 'GPE', 382),
 (Williamsburg, 'GPE', 382),
 (four, 'CARDINAL', 394),
 (Zip, 'PERSON', 378),
 (Bill de Blasio, 'PERSON', 378),
 (Tuesday, 'DATE', 388),
 (Orthodox, 'NORP', 379),
 (Jews, 'NORP', 379),
 (6 months old, 'DATE', 388),
 (1,000, 'MONEY', 391)]

In [8]:
displacy.render(nytimes, style = "ent",jupyter = True)

In [9]:
docp = nlp (" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [10]:
displacy.render(docp, style="dep", jupyter= True)


In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(384,)
[ 1.81898966e-01 -5.30110955e-01  2.66826534e+00  6.92421913e-01
 -1.97660923e+00  3.68705654e+00 -4.39795113e+00 -9.98801529e-01
  4.40463841e-01  2.16392413e-01 -3.65440428e-01 -7.81078488e-02
 -2.61334181e-02 -2.29889107e+00 -4.02843088e-01  2.03411436e+00
 -1.13863659e+00 -2.47938895e+00 -6.85229659e-01  2.18901825e+00
  2.21150208e+00  1.11644948e+00  1.71971321e-01  4.38696891e-01
 -1.64694738e+00 -4.35405135e-01 -3.02480370e-01  8.34271789e-01
 -1.12027764e+00  7.75548279e-01 -5.96542239e-01 -1.65593290e+00
  5.41058362e-01 -3.40727329e-01 -3.47570002e-01  5.06469607e-01
  3.71737838e-01 -9.64704275e-01 -8.57091308e-01  8.52468789e-01
 -3.29184246e+00  4.53453016e+00  2.02872857e-01 -1.16222143e-01
 -1.18046355e+00  4.02978033e-01 -5.31236649e-01 -9.04555857e-01
  1.07802987e+00  3.54202747e-01 -1.02040005e+00 -1.33428836e+00
 -3.28955436e+00  6.58582449e-01 -4.01282102e-01  3.08273196e-01
  4.82804203e+00 -1.29300404e+00 -2.84544325e+00 -1.12305379e+00
 -5.03153086e-01  

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [13]:
# Loading TSV file
df_amazon = pd.read_csv ("../input/amazon_alexa.tsv", sep="\t")

In [14]:
df_amazon.head()


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [15]:
df_amazon.shape

(3150, 5)

In [16]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB


In [17]:
df_amazon['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [18]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [19]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [20]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [21]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [22]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [23]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x12630e208>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 toke...?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x1161f5b70>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
            

In [24]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))


Logistic Regression Accuracy: 0.9312169312169312
Logistic Regression Precision: 0.9336235038084875
Logistic Regression Recall: 0.9953596287703016
