## Preprocess with spaCy

In [1]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Loading file from path
def loading_file():
    file_dir = '/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'        
    file_list = glob.glob(file_dir + '/*.csv')
    csv_file = file_list[0]
    return csv_file

# Import file imto Pandas DataFrame
def importing_file(csv_file):
    df = pd.read_csv(csv_file, sep=",")
    return df

# Saving path
def saving_file(file, file_name, save_dir):
    file.to_csv(os.path.join(save_dir,file_name))


In [3]:
# Importing file + Loading  file
news_df = importing_file(loading_file())

# Top 5 records
news_df.head()

Unnamed: 0,file_name,title,news_text,category
0,348.txt,Berlin celebrates European cinema,Organisers say this year's Berlin Film Festiva...,entertainment
1,139.txt,U2 to play at Grammy awards show,Irish rock band U2 are to play live at the Gra...,entertainment
2,125.txt,Snow Patrol feted at Irish awards,Snow Patrol were the big winners in Ireland's ...,entertainment
3,267.txt,T in the Park sells out in days,Tickets for Scotland's biggest music festival ...,entertainment
4,311.txt,Corbett attacks 'dumbed-down TV',Ronnie Corbett has joined fellow comedy stars ...,entertainment


In [6]:
!pip install spacy
!spacy download en

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/95/9c/afd55bb35cc03e4b3dadc41dd48bc26e0678b08d59f32411735c35bda550/spacy-2.1.8-cp36-cp36m-manylinux1_x86_64.whl (30.8MB)
[K     |████████████████████████████████| 30.9MB 22kB/s  eta 0:00:01    |▌                               | 440kB 2.7MB/s eta 0:00:12
[?25hCollecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading https://files.pythonhosted.org/packages/3d/61/9b0520c28eb199a4b1ca667d96dd625bba003c14c75230195f9691975f85/cymem-2.0.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting wasabi<1.1.0,>=0.2.0 (from spacy)
  Downloading https://files.pythonhosted.org/packages/f4/c1/d76ccdd12c716be79162d934fe7de4ac8a318b9302864716dde940641a79/wasabi-0.2.2-py3-none-any.whl
Collecting blis<0.3.0,>=0.2.2 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/34/46/b1d0bb71d308e820ed30316c5f0a017cb5ef5f4324bcbc7da3cf9d3b075c/blis-0.2.4-cp36-cp36m-manylinux1_x86_64.whl (3.2MB)
[K     |█████████████████████████

In [8]:
import spacy

#### Tokenizing the Text


In [11]:
# Load English tokenizer, tagger, parser, NER and word vectors
spacy.load('en')
from spacy.lang.en import English

nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = news_df.loc[2, 'news_text']

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["Snow Patrol were the big winners in Ireland's top music honours, the Meteor Awards, picking up accolades for best Irish band and album on Thursday.", "The Belfast-born, Glasgow-based band collected the prizes at the ceremony at Dublin's Point Theatre.", 'Westlife won the award for best Irish pop act, voted for by the public, beating former member Brian McFadden.', 'Franz Ferdinand picked up best international band and album while Paddy Casey collected best Irish male.', 'Singer-songwriter Casey beat Brian McFadden and Damien Rice.', 'Juliette Turner was named best Irish female.', 'In the international categories, Morrissey beat Eminem, Usher and Robbie Williams to best male while PJ Harvey pipped Kylie Minogue, Joss Stone, Anastacia and Natasha Bedingfield to the female crown.', 'The 8,000 fans at the ceremony were treated to performances from US rapper Snoop Dogg, Brian McFadden with Delta Goodrem and The Thrills featuring Rolling Stones star Ronnie Wood.', 'Snow Patrol\'s success c

#### Cleaning Text Data: Removing Stopwords

In [12]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['are', 'became', 'many', 'down', 'whatever', 'side', 'via', 'enough', 'give', 'something', 'another', 'then', 'hereafter', 'than', 'some', 'towards', 'top', 'from', 'onto', 'could']


#### Removing Stopwords from Our Data

In [13]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [Snow, Patrol, big, winners, Ireland, music, honours, ,, Meteor, Awards, ,, picking, accolades, best, Irish, band, album, Thursday, ., Belfast, -, born, ,, Glasgow, -, based, band, collected, prizes, ceremony, Dublin, Point, Theatre, ., Westlife, won, award, best, Irish, pop, act, ,, voted, public, ,, beating, member, Brian, McFadden, ., Franz, Ferdinand, picked, best, international, band, album, Paddy, Casey, collected, best, Irish, male, ., Singer, -, songwriter, Casey, beat, Brian, McFadden, Damien, Rice, ., Juliette, Turner, named, best, Irish, female, ., international, categories, ,, Morrissey, beat, Eminem, ,, Usher, Robbie, Williams, best, male, PJ, Harvey, pipped, Kylie, Minogue, ,, Joss, Stone, ,, Anastacia, Natasha, Bedingfield, female, crown, ., 8,000, fans, ceremony, treated, performances, rapper, Snoop, Dogg, ,, Brian, McFadden, Delta, Goodrem, Thrills, featuring, Rolling, Stones, star, Ronnie, Wood, ., Snow, Patrol, success, came, year, chart, breakthro

### Lexicon Normalization

#### Lemmatization

In [14]:
# Implementing lemmatization
lem = nlp(text)
# finding lemma for each word
for word in lem:
    print(word.text, word.lemma_)

Snow Snow
Patrol Patrol
were be
the the
big big
winners winner
in in
Ireland Ireland
's have
top top
music music
honours honour
, ,
the the
Meteor Meteor
Awards Awards
, ,
picking pick
up up
accolades accolade
for for
best well
Irish Irish
band band
and and
album album
on on
Thursday Thursday
. .
The The
Belfast Belfast
- -
born bear
, ,
Glasgow Glasgow
- -
based base
band band
collected collect
the the
prizes prize
at at
the the
ceremony ceremony
at at
Dublin Dublin
's have
Point Point
Theatre Theatre
. .
Westlife Westlife
won win
the the
award award
for for
best well
Irish Irish
pop pop
act act
, ,
voted vote
for for
by by
the the
public public
, ,
beating beat
former former
member member
Brian Brian
McFadden McFadden
. .
Franz Franz
Ferdinand Ferdinand
picked pick
up up
best well
international international
band band
and and
album album
while while
Paddy Paddy
Casey Casey
collected collect
best well
Irish Irish
male male
. .
Singer Singer
- -
songwriter songwriter
Casey Casey
beat b

#### Part of Speech (POS) Tagging

In [15]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(text)

for word in docs:
    print(word.text,word.pos_)

Snow PROPN
Patrol PROPN
were VERB
the DET
big ADJ
winners NOUN
in ADP
Ireland PROPN
's PART
top ADJ
music NOUN
honours NOUN
, PUNCT
the DET
Meteor PROPN
Awards PROPN
, PUNCT
picking VERB
up PART
accolades NOUN
for ADP
best ADJ
Irish ADJ
band NOUN
and CCONJ
album NOUN
on ADP
Thursday PROPN
. PUNCT
The DET
Belfast PROPN
- PUNCT
born VERB
, PUNCT
Glasgow PROPN
- PUNCT
based VERB
band NOUN
collected VERB
the DET
prizes NOUN
at ADP
the DET
ceremony NOUN
at ADP
Dublin PROPN
's PART
Point PROPN
Theatre PROPN
. PUNCT
Westlife PROPN
won VERB
the DET
award NOUN
for ADP
best ADJ
Irish ADJ
pop NOUN
act NOUN
, PUNCT
voted VERB
for ADP
by ADP
the DET
public NOUN
, PUNCT
beating VERB
former ADJ
member NOUN
Brian PROPN
McFadden PROPN
. PUNCT
Franz PROPN
Ferdinand PROPN
picked VERB
up PART
best ADJ
international ADJ
band NOUN
and CCONJ
album NOUN
while ADP
Paddy PROPN
Casey PROPN
collected VERB
best ADJ
Irish ADJ
male NOUN
. PUNCT
Singer NOUN
- PUNCT
songwriter NOUN
Casey PROPN
beat VERB
Brian PROPN
Mc

#### Entity Detection

In [16]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

bbc= nlp(text)

entities=[(i, i.label_, i.label) for i in bbc.ents]
entities

[(Ireland, 'GPE', 384),
 (Irish, 'NORP', 381),
 (Thursday, 'DATE', 391),
 (Belfast, 'GPE', 384),
 (Glasgow, 'GPE', 384),
 (Dublin, 'GPE', 384),
 (Point Theatre, 'FAC', 9191306739292312949),
 (Irish, 'NORP', 381),
 (Brian McFadden, 'PERSON', 380),
 (Franz Ferdinand, 'ORG', 383),
 (Paddy Casey, 'PERSON', 380),
 (Irish, 'NORP', 381),
 (Casey, 'PERSON', 380),
 (Brian McFadden, 'PERSON', 380),
 (Damien Rice, 'PERSON', 380),
 (Juliette Turner, 'PERSON', 380),
 (Irish, 'NORP', 381),
 (Morrissey, 'PERSON', 380),
 (Eminem, 'PERSON', 380),
 (Usher, 'PERSON', 380),
 (Robbie Williams, 'PERSON', 380),
 (PJ Harvey, 'ORG', 383),
 (Kylie Minogue, 'PERSON', 380),
 (Joss Stone, 'PERSON', 380),
 (Anastacia, 'PERSON', 380),
 (Natasha Bedingfield, 'PERSON', 380),
 (8,000, 'CARDINAL', 397),
 (US, 'GPE', 384),
 (Snoop Dogg, 'PERSON', 380),
 (Brian McFadden, 'PERSON', 380),
 (Delta Goodrem, 'PERSON', 380),
 (Thrills, 'PERSON', 380),
 (Ronnie Wood, 'PERSON', 380),
 (Snow Patrol's, 'PERSON', 380),
 (a year, 'DA

In [17]:
displacy.render(bbc, style = "ent",jupyter = True)

#### Dependency Parsing

In [18]:
docp = nlp (text)

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Snow Patrol Patrol nsubj were
the big winners winners attr were
Ireland's top music honours honours pobj in
the Meteor Awards Awards appos winners
accolades accolades dobj picking
best Irish band band pobj for
album album conj band
Thursday Thursday pobj on
The Belfast-born, Glasgow-based band band nsubj collected
the prizes prizes dobj collected
the ceremony ceremony pobj at
Dublin's Point Theatre Theatre pobj at
Westlife Westlife nsubj won
the award award dobj won
best Irish pop act act pobj for
the public public pobj by
former member Brian McFadden McFadden dobj beating
Franz Ferdinand Ferdinand nsubj picked
best international band band dobj picked
album album conj band
Paddy Casey Casey nsubj collected
best Irish male male dobj collected
Singer-songwriter Casey Casey nsubj beat
Brian McFadden McFadden dobj beat
Damien Rice Rice conj McFadden
Juliette Turner Turner nsubjpass named
the international categories categories pobj In
Morrissey Morrissey nsubj beat
Eminem Eminem dobj beat


### SpaCy Text Classification

#### Importing Libraries

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

#### Tokening the Data With spaCy

In [21]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

#### Defining a Custom Transformer

In [22]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

#### Vectorization Feature Engineering (TF-IDF)

In [23]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1), binary=False)

In [24]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

#### Splitting The Data into Training and Test Sets

In [25]:
from sklearn.model_selection import train_test_split

X = news_df['news_text'] # the features we want to analyze
ylabels = news_df['category'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

#### Creating a Pipeline and Generating the Model

In [26]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x7f7394043e10>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

#### Evaluating the Model

In [29]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)


# Model Accuracy

# Accuracy refers to the percentage of the total predictions our model makes that are completely correct.
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))

# Precision describes the ratio of true positives to true positives plus false positives in our predictions.
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, average='weighted'))

# Recall describes the ratio of true positives to true positives plus false negatives in our predictions.
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, average='weighted'))

Logistic Regression Accuracy: 0.9833610648918469
Logistic Regression Precision: 0.98350248089453
Logistic Regression Recall: 0.9833610648918469
