## Preprocess with spaCy

In [4]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Loading file from path
def loading_file():
    file_dir = '/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'        
    file_list = glob.glob(file_dir + '/*.csv')
    csv_file = file_list[0]
    return csv_file

# Import file imto Pandas DataFrame
def importing_file(csv_file):
    df = pd.read_csv(csv_file, sep=",")
    return df

# Saving path
def saving_file(file, file_name, save_dir):
    file.to_csv(os.path.join(save_dir,file_name))


In [6]:
# Importing file + Loading  file
news_df = importing_file(loading_file())

# Top 5 records
news_df.head()

Unnamed: 0,file_name,title,news_text,category
0,348.txt,Berlin celebrates European cinema,Organisers say this year's Berlin Film Festiva...,entertainment
1,139.txt,U2 to play at Grammy awards show,Irish rock band U2 are to play live at the Gra...,entertainment
2,125.txt,Snow Patrol feted at Irish awards,Snow Patrol were the big winners in Ireland's ...,entertainment
3,267.txt,T in the Park sells out in days,Tickets for Scotland's biggest music festival ...,entertainment
4,311.txt,Corbett attacks 'dumbed-down TV',Ronnie Corbett has joined fellow comedy stars ...,entertainment


In [3]:
import spacy

In [12]:
!spacy download en

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K     |████████████████████████████████| 11.1MB 9.9MB/s eta 0:00:01    |██████████                      | 3.5MB 1.1MB/s eta 0:00:08
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.1.0-cp36-none-any.whl size=11075237 sha256=519bd820dba9dd424a048baa76bdec13ba0e857f3048020355ccb2bda98aca08
  Stored in directory: /tmp/pip-ephem-wheel-cache-rxfzkbxt/wheels/39/ea/3b/507f7df78be8631a7a3d7090962194cf55bc1158572c0be77f
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.1.0
You sh

#### Tokenizing the Text


In [15]:
# Load English tokenizer, tagger, parser, NER and word vectors
spacy.load('en')
from spacy.lang.en import English

nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = news_df.loc[15, 'news_text']

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['The man who said he got Oscar-nominated movie The Aviator off the ground and signed up Leonardo DiCaprio has been shut out of the Academy Awards race.', "Charles Evans Jr battled over his role with the people who eventually made the film, and won a producer's credit.", 'But he is not on the list of producers who can win a best film Oscar due to a limit on the number of nominees.', "The Oscars organisers have picked two of The Aviator's four producers to be nominated for best film.", 'Up to three producers can be named per film but the studios behind The Aviator and Million Dollar Baby failed to trim their credits - so the Academy of Motion Pictures Arts and Sciences (Ampas) has done it for them.', "The Aviator's nominated producers are Michael Mann and Graham King - with Mr Evans and Sandy Climan, Mr Mann's former deputy, left off.", 'Mr Evans sued Mr Mann in 2001, claiming he came up with the idea, spent years developing it and persuaded DiCaprio to play Hughes - but said he was lat

#### Cleaning Text Data: Removing Stopwords

In [16]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['sixty', 'already', 'whence', 'many', 'seem', 'being', 'towards', 'out', 'sometimes', 'formerly', 'them', '’re', 'by', 'namely', 'get', 'every', 'otherwise', 'rather', 'as', 'an']


#### Removing Stopwords from Our Data

In [17]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [man, said, got, Oscar, -, nominated, movie, Aviator, ground, signed, Leonardo, DiCaprio, shut, Academy, Awards, race, ., Charles, Evans, Jr, battled, role, people, eventually, film, ,, won, producer, credit, ., list, producers, win, best, film, Oscar, limit, number, nominees, ., Oscars, organisers, picked, Aviator, producers, nominated, best, film, ., producers, named, film, studios, Aviator, Million, Dollar, Baby, failed, trim, credits, -, Academy, Motion, Pictures, Arts, Sciences, (, Ampas, ), ., Aviator, nominated, producers, Michael, Mann, Graham, King, -, Mr, Evans, Sandy, Climan, ,, Mr, Mann, deputy, ,, left, ., Mr, Evans, sued, Mr, Mann, 2001, ,, claiming, came, idea, ,, spent, years, developing, persuaded, DiCaprio, play, Hughes, -, said, later, excluded, project, ., sides, settled, court, deal, remained, secret, apart, fact, Mr, Evans, ', appeared, producer, film, credits, roll, ., Golden, Globes, ,, Mr, Evans, -, named, winners, film, won, best, drama, fil

### Lexicon Normalization

#### Lemmatization

In [19]:
# Implementing lemmatization
lem = nlp(text)
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

The The
man man
who who
said say
he he
got get
Oscar Oscar
- -
nominated nominate
movie movie
The The
Aviator Aviator
off off
the the
ground grind
and and
signed sign
up up
Leonardo Leonardo
DiCaprio DiCaprio
has have
been be
shut shut
out out
of of
the the
Academy Academy
Awards Awards
race race
. .
Charles Charles
Evans Evans
Jr Jr
battled battle
over over
his his
role role
with with
the the
people people
who who
eventually eventually
made make
the the
film film
, ,
and and
won win
a a
producer producer
's have
credit credit
. .
But But
he he
is be
not not
on on
the the
list list
of of
producers producer
who who
can can
win win
a a
best well
film film
Oscar Oscar
due due
to to
a a
limit limit
on on
the the
number numb
of of
nominees nominee
. .
The The
Oscars Oscars
organisers organiser
have have
picked pick
two two
of of
The The
Aviator Aviator
's have
four four
producers producer
to to
be be
nominated nominate
for for
best well
film film
. .
Up Up
to to
three three
producers produc

#### Part of Speech (POS) Tagging

In [20]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(text)

for word in docs:
    print(word.text,word.pos_)

The DET
man NOUN
who PRON
said VERB
he PRON
got VERB
Oscar PROPN
- PUNCT
nominated VERB
movie NOUN
The DET
Aviator NOUN
off ADP
the DET
ground NOUN
and CCONJ
signed VERB
up PART
Leonardo PROPN
DiCaprio PROPN
has VERB
been VERB
shut VERB
out ADP
of ADP
the DET
Academy PROPN
Awards PROPN
race NOUN
. PUNCT
Charles PROPN
Evans PROPN
Jr PROPN
battled VERB
over ADP
his DET
role NOUN
with ADP
the DET
people NOUN
who PRON
eventually ADV
made VERB
the DET
film NOUN
, PUNCT
and CCONJ
won VERB
a DET
producer NOUN
's PART
credit NOUN
. PUNCT
But CCONJ
he PRON
is VERB
not ADV
on ADP
the DET
list NOUN
of ADP
producers NOUN
who PRON
can VERB
win VERB
a DET
best ADJ
film NOUN
Oscar PROPN
due ADP
to ADP
a DET
limit NOUN
on ADP
the DET
number NOUN
of ADP
nominees NOUN
. PUNCT
The DET
Oscars PROPN
organisers NOUN
have VERB
picked VERB
two NUM
of ADP
The DET
Aviator PROPN
's PART
four NUM
producers NOUN
to PART
be VERB
nominated VERB
for ADP
best ADJ
film NOUN
. PUNCT
Up ADP
to PART
three NUM
producers NO

#### Entity Detection

In [21]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

nytimes= nlp(text)

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(Oscar, 'GPE', 384),
 (The Aviator, 'ORG', 383),
 (Leonardo DiCaprio, 'PERSON', 380),
 (Academy Awards, 'ORG', 383),
 (Charles Evans Jr, 'PERSON', 380),
 (Oscar, 'ORG', 383),
 (Oscars, 'GPE', 384),
 (two, 'CARDINAL', 397),
 (Aviator, 'ORG', 383),
 (four, 'CARDINAL', 397),
 (three, 'CARDINAL', 397),
 (the Academy of Motion Pictures Arts and Sciences (Ampas, 'ORG', 383),
 (Aviator, 'ORG', 383),
 (Michael Mann, 'PERSON', 380),
 (Graham King - with Mr Evans, 'PERSON', 380),
 (Sandy Climan, 'PERSON', 380),
 (Mr Mann's, 'ORG', 383),
 (Mr Evans, 'PERSON', 380),
 (Mr Mann, 'PERSON', 380),
 (2001, 'DATE', 391),
 (years, 'DATE', 391),
 (DiCaprio, 'PERSON', 380),
 (Hughes, 'ORG', 383),
 (two, 'CARDINAL', 397),
 (Mr Evans', 'PERSON', 380),
 (the Golden Globes, 'FAC', 9191306739292312949),
 (Mr Evans - who, 'PERSON', 380),
 (DiCaprio, 'PERSON', 380),
 (Martin Scorsese, 'PERSON', 380),
 (Mr Mann, 'PERSON', 380),
 (Mr King, 'PERSON', 380),
 (Ampas, 'PERSON', 380),
 (Shakespeare, 'PERSON', 380),
 (Lo

In [22]:
displacy.render(nytimes, style = "ent",jupyter = True)

#### Dependency Parsing

In [23]:
docp = nlp (text)

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

The man man ROOT man
who who nsubj said
he he nsubj got
Oscar-nominated movie movie dobj got
The Aviator Aviator nsubjpass shut
the ground ground pobj off
Leonardo DiCaprio DiCaprio dobj signed
the Academy Awards race race pobj of
Charles Evans Jr Jr nsubj battled
his role role pobj over
the people people pobj with
who who nsubj made
the film film dobj made
a producer's credit credit dobj won
he he nsubj is
the list list pobj on
producers producers pobj of
who who nsubj win
a best film film dobj win
Oscar Oscar appos film
a limit limit pobj due
the number number pobj on
nominees nominees pobj of
The Oscars organisers organisers nsubj picked
The Aviator's four producers producers pobj of
best film film pobj for
Up to three producers producers nsubjpass named
film film pobj per
the studios studios nsubj failed
The Aviator and Million Dollar Baby Baby pobj behind
their credits credits dobj trim
Motion Motion pobj of
it it dobj done
them them pobj for
The Aviator's nominated producers prod

#### Word Vector Representation

In [24]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 1.0466383  -1.5323697  -0.72177905 -2.4700649  -0.2715162   1.1589639
  1.7113379  -0.31615403 -2.0978343   1.837553    1.4681302   2.728043
 -2.3457408  -5.17184    -4.6110015  -0.21236466 -0.3029521   4.220028
 -0.6813917   2.4016762  -1.9546705  -0.85086954  1.2456163   1.5107994
  0.4684736   3.1612053   0.15542296  2.0598564   3.780035    4.6110964
  0.6375268  -1.078107   -0.96647096 -1.3939928  -0.56914186  0.51434743
  2.3150034  -0.93199825 -2.7970662  -0.8540115  -3.4250052   4.2857723
  2.5058174  -2.2150877   0.7860181   3.496335   -0.62606215 -2.0213525
 -4.47421     1.6821622  -6.0789204   0.22800982 -0.36950028 -4.5340714
 -1.7978683  -2.080299    4.125556    3.1852438  -3.286446    1.0892276
  1.017115    1.2736416  -0.10613725  3.5102775   1.1902348   0.05483437
 -0.06298041  0.8280688   0.05514218  0.94817173 -0.49377063  1.1512338
 -0.81374085 -1.6104267   1.8233354  -2.278403   -2.1321895   0.3029334
 -1.4510616  -1.0584296  -3.5698352  -0.13046083 -0.266833

### SpaCy Text Classification

#### Importing Libraries

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

#### Tokening the Data With spaCy

In [29]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

#### Defining a Custom Transformer

In [30]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

#### Vectorization Feature Engineering (TF-IDF)

In [36]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1), binary=False)

In [37]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

#### Splitting The Data into Training and Test Sets

In [38]:
from sklearn.model_selection import train_test_split

X = news_df['news_text'] # the features we want to analyze
ylabels = news_df['category'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

#### Creating a Pipeline and Generating the Model

In [39]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x7f5a632f99b0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

#### Evaluating the Model

In [40]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)


# Model Accuracy

# Accuracy refers to the percentage of the total predictions our model makes that are completely correct.
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))

# Precision describes the ratio of true positives to true positives plus false positives in our predictions.
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))

# Recall describes the ratio of true positives to true positives plus false negatives in our predictions.
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.961730449251248


ValueError: Target is multiclass but average='binary'. Please choose another average setting.