In [1]:
########################### Stop words ###########################

In [3]:
# Importing the required libraries

import spacy

from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
# Checking the length of stop words in English in spacy

len(STOP_WORDS)

326

In [5]:
# Checking the stop words in English in spacy

STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [6]:
# Loading the spacy english language model

model = spacy.load("en_core_web_sm")

text = model("I am Tabinda Hayat. I am an aspiring Data Scientist!")

for words in text:
    if words.is_stop: # method is_stop checks if the word is a stop word or not
        print(words) # printing the stop words in the provided text

I
am
I
am
an


In [7]:
# For removing the punctuation marks, we can use another method is_

text = model("I am Tabinda Hayat. I am an aspiring Data Scientist!")

for words in text:
    if words.is_punct: # method is_stop checks if the word is a stop word or not
        print(words) 

.
!


In [27]:
# So, we want to do the pre-processing and remove all the stop words including punctuation marks

def preprocessing(text): # creating a function to pre-process
    text = model(text)
    
    preprocessed_text = [words.text for words in text if not words.is_stop and not words.is_punct]
    return " ".join(preprocessed_text) # returning string instead of a list

In [28]:
# Printing the non-stop words

print(preprocessing("I am Tabinda Hayat. I am an aspiring Data Scientist!"))

Tabinda Hayat aspiring Data Scientist


In [16]:
# Carrying out preprocessing on the dataset which is in json format. The dataset used here is from the 

# Department of Justice 2009-2018 Press Release from Kaggle.

import pandas as pd

df = pd.read_json("doj_press.json", lines=True)

df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [17]:
# Checking the rows and columns of the dataset

df.shape

(13087, 6)

In [22]:
# We can take first 50 rows for pre-processing

df = df[:50]
len(df)

50

In [25]:
# We will do pre-processing or remove the stop words from all the 50 entries in the contents column of the df

df["contents"].iloc[2] # 3rd row of contents column

"      BOSTON– A $1-million settlement has been reached for natural resource damages (NRD) at the Blackburn & Union Privileges Superfund Site in Walpole, Mass., the Departments of Justice and Interior (DOI), and the Office of the Massachusetts Attorney General announced today.        \xa0       The Blackburn & Union Privileges Superfund Site includes 22 acres of contaminated land and water in Walpole. The contamination resulted from the operations of various industrial facilities dating back to the 19th century that exposed the site to asbestos, arsenic, lead and other hazardous substances.        \xa0       The private parties involved in the settlement include two former owners and operators of the site, W.R. Grace & Co.– Conn. and Tyco Healthcare Group LP, as well as the current owners, BIM Investment Corp. and Shaffer Realty Nominee Trust.       \xa0       From about 1915 to 1936, a predecessor of W.R. Grace manufactured asbestos brake linings and clutch linings on a large portion 

In [26]:
# Let's see it's length before pre-processing

len(df["contents"].iloc[2])

5599

In [29]:
# Let's apply the pre-processing function on the 50 rows using apply()

df["contents_preprocessed"] = df["contents"].apply(preprocessing)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["contents_preprocessed"] = df["contents"].apply(preprocessing)


Unnamed: 0,id,title,contents,date,topics,components,contents_preprocessed
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)],PORTLAND Oregon Mohamed Osman Mohamud 23 convi...
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division],WASHINGTON North Carolina Waccamaw River wa...
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division],BOSTON $ 1 million settlement reached n...
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division],WASHINGTON federal grand jury Las Vegas tod...
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...


In [30]:
# Let's check back the content and length of 3rd row after removing the stop words

df["contents_preprocessed"].iloc[2]

'       BOSTON $ 1 million settlement reached natural resource damages NRD Blackburn Union Privileges Superfund Site Walpole Mass. Departments Justice Interior DOI Office Massachusetts Attorney General announced today        \xa0        Blackburn Union Privileges Superfund Site includes 22 acres contaminated land water Walpole contamination resulted operations industrial facilities dating 19th century exposed site asbestos arsenic lead hazardous substances        \xa0        private parties involved settlement include owners operators site W.R. Grace Co. Conn. Tyco Healthcare Group LP current owners BIM Investment Corp. Shaffer Realty Nominee Trust       \xa0        1915 1936 predecessor W.R. Grace manufactured asbestos brake linings clutch linings large portion property 1946 1983 predecessor Tyco Healthcare operated cotton fabric manufacturing business caustic solutions portion property       \xa0        2010 settlement U.S. Environmental Protection Agency EPA private parties agreed p

In [32]:
len(df["contents_preprocessed"].iloc[2]) # length reduced from 5599 to 4241, woah!!

4241

In [34]:
# Let me show you firs 100 characters to see the change of what all stop words were removed

df["contents"].iloc[2][:100] # before preprocessing

'      BOSTON– A $1-million settlement has been reached for natural resource damages (NRD) at the Bla'

In [35]:
df["contents_preprocessed"].iloc[2][:100] # after preprocessing

'       BOSTON $ 1 million settlement reached natural resource damages NRD Blackburn Union Privileges'