## Preprocess with spaCy

In [1]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Loading file from path
def loading_file():
    file_dir = '/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'        
    file_list = glob.glob(file_dir + '/*.csv')
    csv_file = file_list[1]
    return csv_file

# Import file imto Pandas DataFrame
def importing_file(csv_file):
    df = pd.read_csv(csv_file, sep=",")
    return df

# Saving path
def saving_file(file, file_name, save_dir):
    file.to_csv(os.path.join(save_dir, file_name))


In [3]:
# Importing file + Loading  file
news_df = importing_file(loading_file())

# Top 5 records
news_df.head()

Unnamed: 0,file_name,title,news_text,category
0,214.txt,Mansfield 0-1 Leyton Orient,An second-half goal from Andy Scott condemned ...,sports
1,303.txt,Film production 'falls' 40% in UK,The number of British films produced in the UK...,entertainment
2,083.txt,Hague 'given up' his PM ambition,Former Conservative leader William Hague says ...,politics
3,190.txt,SA return to Mauritius,Top seeds South Africa return to the scene of ...,sports
4,103.txt,Minimum rate for foster parents,Foster carers are to be guaranteed a minimum a...,politics


In [5]:
!pip install spacy
!spacy download en

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/95/9c/afd55bb35cc03e4b3dadc41dd48bc26e0678b08d59f32411735c35bda550/spacy-2.1.8-cp36-cp36m-manylinux1_x86_64.whl (30.8MB)
[K     |████████████████████████████████| 30.9MB 5.7MB/s eta 0:00:01    |███████▏                        | 6.9MB 2.9MB/s eta 0:00:09     |██████████████████████████▍     | 25.5MB 4.3MB/s eta 0:00:02
[?25hCollecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading https://files.pythonhosted.org/packages/3d/61/9b0520c28eb199a4b1ca667d96dd625bba003c14c75230195f9691975f85/cymem-2.0.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting preshed<2.1.0,>=2.0.1 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/20/93/f222fb957764a283203525ef20e62008675fd0a14ffff8cc1b1490147c63/preshed-2.0.1-cp36-cp36m-manylinux1_x86_64.whl (83kB)
[K     |████████████████████████████████| 92kB 4.3MB/s eta 0:00:011
[?25hCollecting plac<1.0.0,>=0.9.6 (from spacy)
  Downloading https://files.pythonhosted.

In [6]:
import spacy

#### Tokenizing the Text


In [7]:
# Load English tokenizer, tagger, parser, NER and word vectors
spacy.load('en')
from spacy.lang.en import English

nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = news_df.loc[2, 'news_text']

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['Former Conservative leader William Hague says he will not stand for the leadership again, having given up his ambition to be prime minister.', 'Mr Hague, 43, told the Daily Telegraph he would now find a life dominated by politics too "boring" and unfulfilling.', "Mr Hague, who stepped down after his party's 2001 election defeat, does not rule out a return to the front bench.", 'He also told the paper he hopes to remain MP for Richmond, North Yorks, and start a family with wife Ffion.', 'Mr Hague, who recently had published the biography of William Pitt the Younger, also said he wanted to continue writing books and speech-writing.', 'He told the newspaper: "I don\'t know whether I will ever go back on to the front, but don\'t rush me."', 'Asked if he would stand for the leadership again, Mr Hague replied: "No.', 'Definitely not."', 'His determination to stay away from a central role will disappoint some senior Conservative members, who say the party needs him.', 'Tim Collins, the shad

#### Cleaning Text Data: Removing Stopwords

In [8]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['ourselves', "n't", 'you', 'so', '‘re', 'due', 'whoever', 'myself', 'toward', 'an', 'nowhere', 'now', 'why', '‘ll', 'since', 'these', '’m', 'on', 'well', 'moreover']


#### Removing Stopwords from Our Data

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [Conservative, leader, William, Hague, says, stand, leadership, ,, having, given, ambition, prime, minister, ., Mr, Hague, ,, 43, ,, told, Daily, Telegraph, find, life, dominated, politics, ", boring, ", unfulfilling, ., Mr, Hague, ,, stepped, party, 2001, election, defeat, ,, rule, return, bench, ., told, paper, hopes, remain, MP, Richmond, ,, North, Yorks, ,, start, family, wife, Ffion, ., Mr, Hague, ,, recently, published, biography, William, Pitt, Younger, ,, said, wanted, continue, writing, books, speech, -, writing, ., told, newspaper, :, ", know, ,, rush, ., ", Asked, stand, leadership, ,, Mr, Hague, replied, :, ", ., Definitely, ., ", determination, stay, away, central, role, disappoint, senior, Conservative, members, ,, party, needs, ., Tim, Collins, ,, shadow, education, secretary, ,, said, week, ", huge, boost, ", party, Mr, Hague, returned, bench, ., Mr, Hague, MP, 27, Leader, Opposition, 36, ., said, :, ", feel, fortunate, ,, age, 40, ,, crammed, entire,

### Lexicon Normalization

#### Lemmatization

In [10]:
# Implementing lemmatization
lem = nlp(text)
# finding lemma for each word
for word in lem:
    print(word.text, word.lemma_)

Former Former
Conservative Conservative
leader leader
William William
Hague Hague
says say
he he
will will
not not
stand stand
for for
the the
leadership leadership
again again
, ,
having have
given give
up up
his his
ambition ambition
to to
be be
prime prime
minister minister
. .
Mr Mr
Hague Hague
, ,
43 43
, ,
told tell
the the
Daily Daily
Telegraph Telegraph
he he
would would
now now
find find
a a
life life
dominated dominate
by by
politics politic
too too
" "
boring bore
" "
and and
unfulfilling unfulfilling
. .
Mr Mr
Hague Hague
, ,
who who
stepped step
down down
after after
his his
party party
's have
2001 2001
election election
defeat defeat
, ,
does doe
not not
rule rule
out out
a a
return return
to to
the the
front front
bench bench
. .
He He
also also
told tell
the the
paper paper
he he
hopes hope
to to
remain remain
MP MP
for for
Richmond Richmond
, ,
North North
Yorks Yorks
, ,
and and
start start
a a
family family
with with
wife wife
Ffion Ffion
. .
Mr Mr
Hague Hague
, ,
w

#### Part of Speech (POS) Tagging

In [11]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(text)

for word in docs:
    print(word.text,word.pos_)

Former ADJ
Conservative ADJ
leader NOUN
William PROPN
Hague PROPN
says VERB
he PRON
will VERB
not ADV
stand VERB
for ADP
the DET
leadership NOUN
again ADV
, PUNCT
having VERB
given VERB
up PART
his DET
ambition NOUN
to PART
be VERB
prime ADJ
minister NOUN
. PUNCT
Mr PROPN
Hague PROPN
, PUNCT
43 NUM
, PUNCT
told VERB
the DET
Daily PROPN
Telegraph PROPN
he PRON
would VERB
now ADV
find VERB
a DET
life NOUN
dominated VERB
by ADP
politics NOUN
too ADV
" PUNCT
boring ADJ
" PUNCT
and CCONJ
unfulfilling ADJ
. PUNCT
Mr PROPN
Hague PROPN
, PUNCT
who PRON
stepped VERB
down ADV
after ADP
his DET
party NOUN
's PART
2001 NUM
election NOUN
defeat NOUN
, PUNCT
does VERB
not ADV
rule VERB
out PART
a DET
return NOUN
to ADP
the DET
front ADJ
bench NOUN
. PUNCT
He PRON
also ADV
told VERB
the DET
paper NOUN
he PRON
hopes VERB
to PART
remain VERB
MP PROPN
for ADP
Richmond PROPN
, PUNCT
North PROPN
Yorks PROPN
, PUNCT
and CCONJ
start VERB
a DET
family NOUN
with ADP
wife NOUN
Ffion PROPN
. PUNCT
Mr PROPN
Hagu

#### Entity Detection

In [12]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

bbc= nlp(text)

entities=[(i, i.label_, i.label) for i in bbc.ents]
entities

[(Conservative, 'ORG', 383),
 (William Hague, 'PERSON', 380),
 (Mr Hague, 'PERSON', 380),
 (43, 'DATE', 391),
 (the Daily Telegraph, 'ORG', 383),
 (Mr Hague, 'PERSON', 380),
 (2001, 'DATE', 391),
 (Richmond, 'GPE', 384),
 (North Yorks, 'GPE', 384),
 (Ffion, 'PERSON', 380),
 (Mr Hague, 'PERSON', 380),
 (William Pitt the Younger, 'PERSON', 380),
 (Mr Hague, 'PERSON', 380),
 (Conservative, 'NORP', 381),
 (Tim Collins, 'PERSON', 380),
 (last week, 'DATE', 391),
 (Mr Hague, 'PERSON', 380),
 (Mr Hague, 'PERSON', 380),
 (27, 'CARDINAL', 397),
 (36, 'CARDINAL', 397),
 (the age of 40, 'DATE', 391),
 (Cabinet, 'ORG', 383),
 (Mr Hague, 'ORG', 383)]

In [13]:
displacy.render(bbc, style = "ent",jupyter = True)

#### Dependency Parsing

In [14]:
docp = nlp (text)

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Former Conservative leader William Hague Hague nsubj says
he he nsubj stand
the leadership leadership pobj for
his ambition ambition dobj given
prime minister minister attr be
Mr Hague Hague nsubj told
the Daily Telegraph Telegraph dobj told
he he nsubj find
a life life dobj find
politics politics pobj by
Mr Hague Hague nsubj rule
who who nsubj stepped
his party's 2001 election defeat defeat pobj after
a return return dobj rule
the front bench bench pobj to
He He nsubj told
the paper paper dobj told
he he nsubj hopes
MP MP attr remain
Richmond Richmond pobj for
North Yorks Yorks appos Richmond
a family family dobj start
wife Ffion Ffion pobj with
Mr Hague Hague nsubj said
who who nsubj published
the biography biography dobj published
William Pitt Pitt pobj of
the Younger Younger appos Pitt
he he nsubj wanted
books books dobj writing
speech-writing writing conj books
He He nsubj told
the newspaper newspaper dobj told
"I I nsubj know
I I nsubj go
the front front pobj to
me me dobj rush
h

### SpaCy Text Classification

#### Importing Libraries

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

#### Tokening the Data With spaCy

In [16]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

#### Defining a Custom Transformer

In [17]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

#### Vectorization Feature Engineering (TF-IDF)

In [18]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1), binary=False)

In [19]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

#### Splitting The Data into Training and Test Sets

In [20]:
from sklearn.model_selection import train_test_split

X = news_df['news_text'] # the features we want to analyze
y = news_df['category'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#### Creating a Pipeline and Generating the Model

In [21]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x7f068f444240>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

#### Evaluating the Model

In [22]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)


# Model Accuracy

# Accuracy refers to the percentage of the total predictions our model makes that are completely correct.
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))

# Precision describes the ratio of true positives to true positives plus false positives in our predictions.
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, average='weighted'))

# Recall describes the ratio of true positives to true positives plus false negatives in our predictions.
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, average='weighted'))

Logistic Regression Accuracy: 0.9402985074626866
Logistic Regression Precision: 0.9489092996555684
Logistic Regression Recall: 0.9402985074626866


### Testing on validation dataset

In [23]:
# Import new data
os.chdir('/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis')
new_data = pd.read_csv('news_df_validation.csv')
new_data.head()

Unnamed: 0,file_name,title,news_text,category
0,214.txt,Mansfield 0-1 Leyton Orient,An second-half goal from Andy Scott condemned ...,sports
1,303.txt,Film production 'falls' 40% in UK,The number of British films produced in the UK...,entertainment
2,083.txt,Hague 'given up' his PM ambition,Former Conservative leader William Hague says ...,politics
3,190.txt,SA return to Mauritius,Top seeds South Africa return to the scene of ...,sports
4,103.txt,Minimum rate for foster parents,Foster carers are to be guaranteed a minimum a...,politics


In [24]:
new_data_pred =pipe.predict(new_data['news_text'])

In [25]:
new_data_cat = new_data['category']

In [27]:
print("Logistic Regression Accuracy:",metrics.accuracy_score(new_data_cat, new_data_pred))

# Precision describes the ratio of true positives to true positives plus false positives in our predictions.
print("Logistic Regression Precision:",metrics.precision_score(new_data_cat, new_data_pred, average='weighted'))

# Recall describes the ratio of true positives to true positives plus false negatives in our predictions.
print("Logistic Regression Recall:",metrics.recall_score(new_data_cat, new_data_pred, average='weighted'))

Logistic Regression Accuracy: 0.9819819819819819
Logistic Regression Precision: 0.9828535364249649
Logistic Regression Recall: 0.9819819819819819


## Vezi average='none' - toate clasele, average='macro'
