## Tokenization

In [85]:
from spacy.lang.en import English

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# For Words

nlp = English()

my_doc = nlp(text)

token_list = []
for token in my_doc:
    token_list.append(token.text)

print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [86]:
# For Sentences

nlp = English()

sbd = nlp.create_pipe('sentencizer')

nlp.add_pipe(sbd)

doc = nlp(text)

sents_list = []
for sent in doc.sents:
    sents_list.append(sent)

print(sents_list)

[When learning data science, you shouldn't get discouraged!, 
Challenges and setbacks aren't failures, they're just part of the journey., You've got this!]


## Removing Stopwords

In [87]:
from spacy.lang.en.stop_words import STOP_WORDS

filtered_sent = []

doc = nlp(text)

for word in doc:
    if word.is_stop == False:
        filtered_sent.append(word)

print("Filtered Sentence:", filtered_sent)

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


## Lemmatization

In [88]:
lem = nlp("run runs running runner")
for word in lem:
    print(word.text, word.lemma_)

run run
runs runs
running running
runner runner


## POS Tagging

In [89]:
import en_core_web_sm

nlp = en_core_web_sm.load()

docs = nlp(u"All is well that ends well.")
for word in docs:
    print(word.text, word.pos_)

All DET
is AUX
well ADJ
that DET
ends VERB
well ADV
. PUNCT


## Entity Detection

In [90]:
from spacy import displacy

nytimes = nlp(u"""
New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities = [(i, i.label_, i.label) for i in nytimes.ents]
print(entities)

[(New York City, 'GPE', 384), (Tuesday, 'DATE', 391), (At least 285, 'CARDINAL', 397), (September, 'DATE', 391), (Brooklyn, 'GPE', 384), (Williamsburg, 'GPE', 384), (four, 'CARDINAL', 397), (Bill de Blasio, 'PERSON', 380), (Tuesday, 'DATE', 391), (Orthodox Jews, 'PERSON', 380), (6 months old, 'DATE', 391), (up to $1,000, 'MONEY', 394)]


In [91]:
displacy.render(nytimes, style="ent", jupyter=True)

## Dependency Parsing

In [92]:
docp = nlp(" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [93]:
displacy.render(docp, style="ent", jupyter=True)

## Word Vector Representation

In [94]:
nlp = en_core_web_sm.load()

mango = nlp(u"mango")

print(mango.vector.shape)
print(mango.vector)

(96,)
[ 0.20538223 -1.6033714   0.27122334  0.4102599   3.2985601   3.4889512
  1.8090308  -2.1398475   2.31565     1.5809067   4.1519527  -1.0185633
 -0.0325011  -2.7471437  -0.4177467  -2.4292274  -0.6153387   2.4422317
  0.8078671  -2.4846377   2.0988142   1.4448209  -0.552992   -1.3411183
 -0.69847786 -0.45548356  3.8267968  -4.0225782   0.81215733  0.3766132
  0.15751392 -1.1428392  -1.3328214   0.7187766   2.1567593  -3.018766
  3.4919028   0.6938907  -1.1943094  -0.10796624  4.7029977   3.551554
 -0.71505725 -4.4580555  -0.26480573  0.6314918  -0.538128   -1.1131921
 -1.1251849   0.5740081  -1.1976193  -3.5157654   0.425157   -1.7545594
 -3.058784    0.01680815  0.97784567  1.7633746   0.4561966   2.5090182
  0.35267782  0.8351371  -1.394351    0.5082075   0.75960976 -3.3654122
  2.3440146  -2.4311178   1.2401564  -1.4498216  -2.3708577   1.274456
  2.6584334   2.505236    0.24999112  0.45838034  0.7396465  -3.0134087
 -1.1449497   2.441533    0.58746856 -0.47240722 -0.99527466 

## Text Classification

In [95]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [96]:
df_amazon = pd.read_csv("datasets/amazon_alexa.tsv", sep="\t")
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [97]:
df_amazon.shape

(3150, 5)

In [98]:
df_amazon.info

<bound method DataFrame.info of       rating       date         variation  \
0          5  31-Jul-18  Charcoal Fabric    
1          5  31-Jul-18  Charcoal Fabric    
2          4  31-Jul-18    Walnut Finish    
3          5  31-Jul-18  Charcoal Fabric    
4          5  31-Jul-18  Charcoal Fabric    
...      ...        ...               ...   
3145       5  30-Jul-18        Black  Dot   
3146       5  30-Jul-18        Black  Dot   
3147       5  30-Jul-18        Black  Dot   
3148       5  30-Jul-18        White  Dot   
3149       4  29-Jul-18        Black  Dot   

                                       verified_reviews  feedback  
0                                         Love my Echo!         1  
1                                             Loved it!         1  
2     Sometimes while playing a game, you can answer...         1  
3     I have had a lot of fun with this thing. My 4 ...         1  
4                                                 Music         1  
...                

In [99]:
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [100]:
import string

punctuations = string.punctuation

nlp = en_core_web_sm.load()

parser = English()

def spacy_tokenizer(sentence):
    my_tokens = parser(sentence)
    my_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in my_tokens]
    my_tokens = [word for word in my_tokens if word not in STOP_WORDS and word not in punctuations]
    return my_tokens

def clean_text(text):
    return text.strip().lower()

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

In [101]:
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

In [102]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews']
y_labels = df_amazon['feedback']
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.3)

In [105]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

pipeline = Pipeline([("cleaner", predictors()),
                    ("vectorizer", bow_vector),
                    ("classifier", classifier)])

pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x1181f3dd8>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 toke...?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x1187f8f28>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
            

In [106]:
from sklearn import metrics

predicted = pipeline.predict(X_test)

print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9333333333333333
Logistic Regression Precision: 0.937636761487965
Logistic Regression Recall: 0.9930475086906141
