<a href="https://colab.research.google.com/github/codemel33/17_Classification/blob/master/SpacyTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
###Topic Classifier Test with Spacy 

In [0]:
##Section A : Tokenizing 

In [0]:
#word tekenizer 

In [0]:
###Note - In order to install spacy open your navigator as Admin

In [0]:
!python -m spacy download en


[93m    Linking successful[0m
    C:\Users\melco\Anaconda3\envs\MLPython\lib\site-packages\en_core_web_sm
    -->
    C:\Users\melco\Anaconda3\envs\MLPython\lib\site-packages\spacy\data\en

    You can now load the model via spacy.load('en')



In [0]:
from spacy.lang.en import English 

In [0]:
#Load English Tokenizer , Tagger, Parser , NER and word vectors

In [0]:
nlp = English()

In [0]:
text = """When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

In [0]:
###Use an "nlp" object to create documents with linguistic annotations.

In [0]:
my_doc=nlp(text)

In [0]:
##create a list of word tokens 

In [0]:
token_list=[]
for token in my_doc:
    token_list.append(token.text)
print(token_list)    

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [0]:
##sentence tokenization 

In [0]:
#Load english tokenizer, tagger, parser, NER and word vectors

In [0]:
nlp=English()

In [0]:
#create the pipeline sentencizer component

In [0]:
sbd=nlp.create_pipe('sentencizer')

In [0]:
#Add component to the pipeline

In [0]:
nlp.add_pipe(sbd)

In [0]:
text="""When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

In [0]:
#nlp object is used to create documents with linguistic annotations

In [0]:
doc=nlp(text)

In [0]:
#create a list of sentence tokens 

In [0]:
sents_list=[]
for sent in doc.sents:
    sents_list.append(sent.text)
    print(sents_list)

["When learning data science, you shouldn't get discouraged!"]
["When learning data science, you shouldn't get discouraged!", "Challenges and setbacks aren't failures, they're just part of the journey."]
["When learning data science, you shouldn't get discouraged!", "Challenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


In [0]:
###Section B : Cleaning Text Data 

In [0]:
## Removing Stop Words 

In [0]:
##Importing sstop woords from English Language

In [0]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [0]:
###Printing the total number of stop words

In [0]:
print('Number of stop words: %d' %len(spacy_stopwords))

Number of stop words: 305


In [0]:
###Printing first top ten stop words

In [0]:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

First ten stop words: ['such', 'whatever', 'should', 'therefore', 'have', 'get', 'thru', 'various', 'whence', 'used', 'per', 'due', 'thence', 'hereafter', 'which', 'is', 'just', 'mostly', 'nine', 'anyone']


In [0]:
###Removing Stop Words from our data 

In [0]:
from spacy.lang.en.stop_words import STOP_WORDS

In [0]:
##Implementation of stop words 

In [0]:
filtered_sent=[]

In [0]:
#use 'nlp' object to create documents with linguistic annotations

In [0]:
doc=nlp(text)

In [0]:
##Filtering stop words

In [0]:
for word in doc:
   if word.is_stop==False:
    filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [When, learning, data, science, ,, n't, discouraged, !, Challenges, setbacks, n't, failures, ,, 're, journey, ., You, 've, got, !]


In [0]:
###Lexicom Normalization 

In [0]:
###Processing words in order to reduce them to their roots

In [0]:
#Lemmatization 

In [0]:
###Stemming - looping off easily identified prefixes and suffixes to produce the simplest version of a word 

In [0]:
## Implementing Lemmatization 

In [0]:
lem = nlp("run runs running runner")

In [0]:
#finding lemma for each word 

In [0]:
for word in lem:
    print(word.text,word.lemma_)

run run
runs run
running run
runner runner


In [0]:
###Part of Speech (POS) Tagging 

In [0]:
# importing the model en_core_web_sm of English for vocabulary, syntax & entities

In [0]:
import en_core_web_sm

In [0]:
#load en_core_web_sm of English for vocabulary , syntax & entities 

In [0]:
nlp = en_core_web_sm.load()

In [0]:
#"nlp" object is used to create documents with linguistic annotations

In [0]:
docs = nlp(u"All is well that ends well.")

In [0]:
for word in docs:
    print(word.text,word.pos_)

All DET
is VERB
well ADV
that ADJ
ends VERB
well ADV
. PUNCT


In [0]:
#### Entity Detection - identifies  important elements like places , people, organizations and languages

In [0]:
from spacy import displacy 

In [0]:
nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases. At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday. The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000""")

In [0]:
entities=[(i, i.label_, i.label)for i in nytimes.ents]
entities
       

[(New York City, 'GPE', 382),
 (Tuesday, 'DATE', 388),
 (At least 285, 'CARDINAL', 394),
 (September, 'DATE', 388),
 (Brooklyn, 'GPE', 382),
 (Williamsburg, 'GPE', 382),
 (four, 'CARDINAL', 394),
 (Zip, 'PERSON', 378),
 (Bill de Blasio, 'PERSON', 378),
 (Tuesday, 'DATE', 388),
 (Orthodox Jews, 'NORP', 379),
 (6 months old, 'DATE', 388),
 (1,000, 'MONEY', 391)]

In [0]:
displacy.render(nytimes, style = "ent", jupyter = True)

In [0]:
###Dependancy Parsing 

In [0]:
###- language processing technique allowing us to determine the meaning of a sentence  by analyzing how its constructed to determine how the individual

In [0]:
docp = nlp ("In pursuit of a wall, President Trump ran into one.")
for chunk in docp.noun_chunks:
        print(chunk.text, chunk.root.text, chunk.root.dep_,
             chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [0]:
##word vector representation 

In [0]:
import en_core_web_sm

In [0]:
nlp = en_core_web_sm.load()

In [0]:
mango = nlp(u'mango')

print(mango.vector.shape)

In [0]:
print(mango.vector)

[ 1.81897774e-01 -5.30110955e-01  2.66826391e+00  6.92421973e-01
 -1.97661090e+00  3.68705726e+00 -4.39795399e+00 -9.98800993e-01
  4.40461993e-01  2.16392577e-01 -3.65440488e-01 -7.81092346e-02
 -2.61331797e-02 -2.29889154e+00 -4.02843028e-01  2.03411436e+00
 -1.13863659e+00 -2.47938967e+00 -6.85229659e-01  2.18901682e+00
  2.21150327e+00  1.11644936e+00  1.71971530e-01  4.38695967e-01
 -1.64694953e+00 -4.35404658e-01 -3.02480996e-01  8.34270179e-01
 -1.12027848e+00  7.75547445e-01 -5.96542180e-01 -1.65593314e+00
  5.41057348e-01 -3.40727836e-01 -3.47570151e-01  5.06468892e-01
  3.71737957e-01 -9.64704514e-01 -8.57092083e-01  8.52468848e-01
 -3.29184222e+00  4.53452921e+00  2.02872336e-01 -1.16222560e-01
 -1.18046391e+00  4.02978599e-01 -5.31236649e-01 -9.04555917e-01
  1.07802987e+00  3.54201555e-01 -1.02039969e+00 -1.33428931e+00
 -3.28955460e+00  6.58582807e-01 -4.01281625e-01  3.08272541e-01
  4.82804203e+00 -1.29300475e+00 -2.84544349e+00 -1.12305391e+00
 -5.03154039e-01  1.29261

In [0]:
####Section D : Text Classification

In [0]:
###Libraries for use on top of Spacy

In [0]:
import pandas as pd

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
from sklearn.base import TransformerMixin

In [0]:
from sklearn.pipeline import Pipeline

In [0]:
###Loading Data from sklearn.base import TransformerMixin

In [0]:
##Loading TSV File

In [0]:
df_amazon =pd.read_csv("C:/Users/melco/Desktop/ML/amazon_alexa.tsv", sep="\t")

In [0]:
#Top 5 records 

In [0]:
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [0]:
#shape of dataframe

In [0]:
df_amazon.shape

(3150, 5)

In [0]:
#View data information 

In [0]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [0]:
##Feedback Value Count 

In [0]:
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [0]:
###Tokening  the data with SpaCy

In [0]:
import string 
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [0]:
# Create our list of punctuation marks 

In [0]:
punctuations = string.punctuation

In [0]:
#create our list of stopwords

In [0]:
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [0]:
# Load English tokenizer , tagger , parser , NER and word vectors

In [0]:
parser = English()

In [0]:
# Creating our tekenizer function
#creating our token object which is used to create documents with linguistic annotations 
#Lemmatizing each token and converting each token to lowercase 
#Removoing Stop Words
#return preprocessed list of tokens

In [0]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [word.lemma_.lower().strip()if word.lemma_!="-PRON" else word.lower_ for word in mytokens]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens

In [0]:
###Defining a custom Transformer

In [0]:
#custom transformer using spaCy
#Cleaning Text

In [0]:
class predictors(TransformerMixin):
    def transform(self, X , **transform_params):
        
        return[clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
         return{}

In [0]:
##Basic function to clean text 
#Removing spaces and converting text into lowercase

In [0]:
def clean_text(text):
    return text.strip().lower()

In [0]:
####Vectotization Feature Engineering(TF-IDF)

In [0]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [0]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [0]:
###Splitting Data ingto Training and Test Sets 

In [0]:
from sklearn.model_selection import train_test_split
X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels =df_amazon['feedback']# the labels, or answers , we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [0]:
###Creating a Pipeline and Generating the Model 

In [0]:
#Logistic Regression Classifier

In [0]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [0]:
###Create pipeline using Bag of Words

In [0]:
pipe = Pipeline([("cleaner",predictors()),
                ('vectorizer', bow_vector),
                ('classifier', classifier)])

In [0]:
##model generation

In [0]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x0000026B33530E08>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x0000026B2AA99678>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_

In [0]:
from sklearn import metrics

In [0]:
#predicting with a test data set

In [0]:
predicted = pipe.predict(X_test)

In [0]:
#Model Accuracy

In [0]:
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9375661375661376
Logistic Regression Precision: 0.9442013129102844
Logistic Regression Recall: 0.9908151549942594
