### TEXT CLASSIFICATION USING SPACY PYTHON

#### Installing Spacy

In [4]:
!pip install spacy
!python -m spacy download en



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-cp37-none-any.whl size=12011744 sha256=b5968992c7c8bccb5ea2646de0f17b3ddfb7193f41e9ead32937859efd666f55
  Stored in directory: C:\Users\akshe\AppData\Local\Temp\pip-ephem-wheel-cache-3qqbkbr3\wheels\6a\47\fb\6b5a0b8906d8e8779246c67d4658fd8a544d4a03a75520197a
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.5
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[x] Couldn't link model to 'en'
Creating a symlink in spacy/data failed. Make sure you have the re

You should consider upgrading via the 'python -m pip install --upgrade pip' command.
You do not have sufficient privilege to perform this operation.


## Tokenization
**Tokenization is the process of splitting the text into pieces called tokens**

In [1]:
# Word tokenization : breaking up the text into individual words
from spacy.lang.en import English
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """Over the past decade, more people have begun to openly acknowledge that their identities don’t fit in with existing conceptions of gender, race, and ethnicity. 
The way we see ourselves has evolved to better reflect the nuances and complexities of being human. 
“He” and “she” are no longer the only acceptable pronouns. """

#  "nlp" is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['Over', 'the', 'past', 'decade', ',', 'more', 'people', 'have', 'begun', 'to', 'openly', 'acknowledge', 'that', 'their', 'identities', 'do', 'n’t', 'fit', 'in', 'with', 'existing', 'conceptions', 'of', 'gender', ',', 'race', ',', 'and', 'ethnicity', '.', '\n', 'The', 'way', 'we', 'see', 'ourselves', 'has', 'evolved', 'to', 'better', 'reflect', 'the', 'nuances', 'and', 'complexities', 'of', 'being', 'human', '.', '\n', '“', 'He', '”', 'and', '“', 'she', '”', 'are', 'no', 'longer', 'the', 'only', 'acceptable', 'pronouns', '.']


**When performing sentence tokenization, the tokenizer looks for specific characters that fall between sentences, 
like periods, exclaimation points, and newline characters**

In [3]:
# sentence tokenization : break the text into sentences rather than words

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """Over the past decade, more people have begun to openly acknowledge that their identities don’t fit in with existing conceptions of gender, race, and ethnicity. 
The way we see ourselves has evolved to better reflect the nuances and complexities of being human. 
“He” and “she” are no longer the only acceptable pronouns.It is becoming more widely understood that racial and ethnic identities can change across time and place. """

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['Over the past decade, more people have begun to openly acknowledge that their identities don’t fit in with existing conceptions of gender, race, and ethnicity.', '\nThe way we see ourselves has evolved to better reflect the nuances and complexities of being human.', '\n“He” and “she” are no longer the only acceptable pronouns.', 'It is becoming more widely understood that racial and ethnic identities can change across time and place.']


## Cleaning Text Data : Removing Stop Words
**Text data contain lots of words that we are not useful for us. These words are called stop words, removing these words
helps us to eliminate the noise and better efficiency in data analysis**


In [4]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

#Customizing Stop Words List
STOP_WORDS = set("""a about above """.split()) #add words that need to be eliminated in the quotes

Number of stop words: 326
First ten stop words: ['when', 'could', 'n‘t', 'never', '’ll', 'does', 'yourselves', 'your', 'can', 'thru', 'am', 'often', 'under', '’re', 'various', 'really', "'ll", 'same', 'next', 'more']


In [5]:
#Removing Stop Words from Data
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [past, decade, ,, people, begun, openly, acknowledge, identities, fit, existing, conceptions, gender, ,, race, ,, ethnicity, ., 
, way, evolved, better, reflect, nuances, complexities, human, ., 
, “, ”, “, ”, longer, acceptable, pronouns, ., widely, understood, racial, ethnic, identities, change, time, place, .]


## Lexicon Normalization
**Stemming which involved identifying prefixes and suffixes and removing it to get the original word**

In [6]:
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)
lemmas = lemmatizer("ducks","NOUN")

In [7]:
lemmas

['duck']

### Lemmatization using Nltk

In [42]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akshe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [49]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
  
print(lemmatizer.lemmatize("connecting")) 
print(lemmatizer.lemmatize("corpora")) 
print(lemmatizer.lemmatize("better", pos="a"))

connecting
corpus
good


## Part of Speech (POS) Tagging
**A word’s part of speech defines its function within a sentence**
**For Eg: Noun identifies an object.Adjective describes an object. Verb describes action.Identifying and tagging each word’s part of speech in the context of a sentence is called Part-of-Speech Tagging**

In [52]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"Over the past decade, more people have begun to openly acknowledge that their identities don’t fit in with existing conceptions of gender, race.")

for word in docs:
    print(word.text,word.pos_)

Over ADP
the DET
past ADJ
decade NOUN
, PUNCT
more ADJ
people NOUN
have AUX
begun VERB
to PART
openly ADV
acknowledge VERB
that SCONJ
their DET
identities NOUN
do AUX
n’t PART
fit VERB
in ADP
with ADP
existing VERB
conceptions NOUN
of ADP
gender NOUN
, PUNCT
race NOUN
. PUNCT


## Entity Detection
**Entity detection also called entity recognition or NER, is a more advanced form of language processing that identifies important elements like places, people, organizations, and languages within an input string of text.**

In [56]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox Jews, 'PERSON', 380),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

### Visualizing the text

In [57]:
displacy.render(nytimes, style = "ent",jupyter = True)

## Dependency Parsing

**Dependency parsing is a language processing technique that allows us to better determine the meaning of a sentence by analyzing how it’s constructed to determine how the individual words relate to each other.**

In [19]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text)


Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability]
insurance compound liability NOUN []
liability dobj shift VERB [insurance, toward]
toward prep liability NOUN [manufacturers]
manufacturers pobj toward ADP []


In [22]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
displacy.render(doc, style='dep')
#for token in doc:
 #   print(token.text, token.dep_, token.head.text, token.head.pos_,
  #          [child for child in token.children])

## Word Vector Representation
**A word vector is a numeric representation of a word that commuicates its relationship to other words.**

In [26]:
import en_core_web_sm
nlp = en_core_web_sm.load()
ball = nlp(u'ball')
print(ball.vector.shape)
print(ball.vector)

(96,)
[ 2.1699312   1.2381297   1.2922517  -0.77823     2.4184482   2.895599
  0.38871086  0.9228186  -1.4188715   1.3812609   4.6242867  -2.0462914
  1.4690908  -1.9076229   1.1835039  -0.31850654  2.2159605   2.973522
 -1.1628941  -0.02781215 -0.22490394  0.3083955  -0.75508803 -2.7778203
 -0.55281657 -1.5979923   1.0033127  -2.1035383   1.6843514  -2.545497
  1.6911331  -0.21506464 -1.8032296   1.7690172   1.5559021  -1.8992751
  3.0936809   1.7872239  -1.734004    2.763948    1.5628004   5.0861325
 -1.8975734  -1.2712145   0.38718024 -2.6765368   1.8389429  -0.8248207
 -0.44743764  1.8109179   0.77916026 -2.7958894  -1.0457646  -1.8186121
 -2.4687874  -0.2998603  -0.318413   -0.62184393  0.8306081   1.5282382
 -0.8732164   1.7262902   0.5704607  -2.1326153  -3.2951672  -2.5835054
  2.5733938  -3.537287   -1.6539438  -0.13548037 -3.2372298   1.0134547
  0.9677227   1.0165101  -1.8342416   0.1394816   3.6130774  -1.0283897
 -1.9866921   0.5463333  -1.8661555  -2.749812    1.1914829  