***Word tokenization***

In [1]:
from spacy.lang.en import English

nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

my_doc = nlp(text)


token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


***Sentence Tokenization***

In [2]:
nlp = English()

sbd = nlp.create_pipe('sentencizer')

nlp.add_pipe(sbd)

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

doc = nlp(text)

sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


***Cleaning Text Data: Removing Stopwords***

In [3]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_stopwords))

print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['call', 'that', 'himself', 'anywhere', 'eleven', 'own', 'among', 'about', 'other', 'somewhere', 'six', '’re', '’s', 'through', 'formerly', 'into', 'move', 'made', 'us', 'done']


In [4]:
from spacy.lang.en.stop_words import STOP_WORDS

filtered_sent = []

doc = nlp(text)

for word in doc:
    if word.is_stop == False:
        filtered_sent.append(word)
print("Filtered Sentence: ", filtered_sent)

Filtered Sentence:  [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


***Lexicon Normalization***
- Lemmatization (stemming): reduces words into its roots

In [5]:
lem = nlp("run runs running runner")

for word in lem:
    print(word.text, word.lemma_)

run run
runs run
running run
runner runner


***Part of Speech (POS) Tagging***

In [6]:
# model: contains dictionary and grammatical information
import en_core_web_sm 
# if it fails from anaconda, you can installed directly:
# python -m spacy download en_core_web_sm

nlp = en_core_web_sm.load()

docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text, word.pos_)

All DET
is VERB
well ADV
that DET
ends VERB
well ADV
. PUNCT


**Entity Detection**
A.K.A Entity recognition

In [7]:
from spacy import displacy

nytimes = nlp(u"""New York City on Tuesday declared a public health
emergency and ordered mandatory measles vaccinations amid an 
outbreak, becoming the latest national flash point over refusals to
inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since 
September, mostly in Brooklyn’s Williamsburg neighborhood. The
order covers four Zip codes there, Mayor Bill de Blasio (D) said
Tuesday.

The mandate orders all unvaccinated people in the area, including a
concentration of Orthodox Jews, to receive inoculations, including
for children as young as 6 months old. Anyone who resists could be 
fined up to $1,000.""")

entities = [(i, i.label_, i.label) for i in nytimes.ents]
entities

displacy.render(nytimes, style = "ent", jupyter = True)

***Dependency Parsing***
Meaning of a sentence by analyzing how is constructed

In [8]:
docp = nlp ("In pursuit of a wall, President Trum ran into one.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)
    
displacy.render(docp, style="dep", jupyter=True)

pursuit pursuit pobj In
a wall wall pobj of
President Trum Trum nsubj ran


***Word Vector Representation*** Represent words to capture intrinsic connections

In [9]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 1.0466383  -1.5323697  -0.72177905 -2.4700649  -0.2715162   1.1589639
  1.7113379  -0.31615403 -2.0978343   1.837553    1.4681302   2.728043
 -2.3457408  -5.17184    -4.6110015  -0.21236466 -0.3029521   4.220028
 -0.6813917   2.4016762  -1.9546705  -0.85086954  1.2456163   1.5107994
  0.4684736   3.1612053   0.15542296  2.0598564   3.780035    4.6110964
  0.6375268  -1.078107   -0.96647096 -1.3939928  -0.56914186  0.51434743
  2.3150034  -0.93199825 -2.7970662  -0.8540115  -3.4250052   4.2857723
  2.5058174  -2.2150877   0.7860181   3.496335   -0.62606215 -2.0213525
 -4.47421     1.6821622  -6.0789204   0.22800982 -0.36950028 -4.5340714
 -1.7978683  -2.080299    4.125556    3.1852438  -3.286446    1.0892276
  1.017115    1.2736416  -0.10613725  3.5102775   1.1902348   0.05483437
 -0.06298041  0.8280688   0.05514218  0.94817173 -0.49377063  1.1512338
 -0.81374085 -1.6104267   1.8233354  -2.278403   -2.1321895   0.3029334
 -1.4510616  -1.0584296  -3.5698352  -0.13046083 -0.266833

**Text Classification**

1. Importing Libraries

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

2. Loading Data (Amazon Alexa product reviews)

In [11]:
df_amazon = pd.read_csv ("datasets/amazon_alexa.tsv", sep="\t")

In [12]:
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [13]:
df_amazon.shape

(3150, 5)

In [14]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB


In [15]:
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

***Tokening the Data with spaCy***

In [16]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

punctuations = string.punctuation

nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    return mytokens


***Defining a Custom Transformer***

In [17]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    
def clean_text(text):
    return text.strip().lower()

***Vectorization Feature Engineering (TF-IDF)***

scikit-learn CountVectorizer is useful for Bag of Words (BoW)

Term Frequency - Inverse Document Frequency --- way of normalizing our BoW.
How important is a particular term in the context of a given document, based on how many times the term appears and how many other documents that same term appears in. The higher the TF-IDF, the more important that term is to that document.

Its mathematical equation is:
idf(W) = log(#(documents)/#(documents containing word W))

In [18]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

***Spliting data into Training and Test Sets***

In [19]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] 
ylabels = df_amazon['feedback'] 

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

***Creating a Pipeline and Generating the Model***

- Cleaner: uses our predictors
- Vectorizer: counvector object to create a TfIdf vector
- Classifier: Logistic regression for sentiment analysis

In [20]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

pipe = Pipeline([("cleaner", predictors()),
                ('vectorizer', tfidf_vector),
                ('classifier', classifier)])

pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x7f812c48c940>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

***Evaluating the Model***

We'll put our test data through the pipeline to come up with predictions.
Functions of the metrics of the sklearn:
- Accuracy: refers to the percentage of the total predictions our model makes that are completely correct.
- Precision: describes the ratio of true positives to true positives plus false positivies in our predictions.
- Recall: describes the ratio of true positives to true postives plus false negatives in our predictions.

In [21]:
from sklearn import metrics

predicted = pipe.predict(X_test)

print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision: ", metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall: ", metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy:  0.9238095238095239
Logistic Regression Precision:  0.923728813559322
Logistic Regression Recall:  1.0


***Questions***:

1. How this can be executed in HPC, e.g. training a new model for the corpus?
2. Is there a maximum lenght for the spacy object nlp?
3. Can we use the Noun Phrase to clean the text?