In [20]:
#stop words in spacy 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [21]:
len(STOP_WORDS)

326

In [22]:
nlp = spacy.load('en_core_web_sm')

In [23]:
doc = nlp("Hello! This is a sample document to demonstrate stop word removal using spaCy.")


In [24]:
#removing stop words from the doc 
tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
tokens

['Hello',
 'sample',
 'document',
 'demonstrate',
 'stop',
 'word',
 'removal',
 'spaCy']

In [35]:
#this function is applied to generate tokens after removing stop words and lemmatizing the words

def preprocess_text(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if  token.is_stop or token.is_punct:
            continue
        tokens.append(token.lemma_)
    return " ".join(tokens)

In [36]:
preprocess_text("hiii !!! This is another example to test the preprocessing function. you ate a apple")

'hiii example test preprocessing function eat apple'

In [28]:
#performing lemmatization and removing stop words on a json file 
import pandas as pd 

In [29]:
df = pd.read_json('combined.json', lines=True)

df.shape

(13087, 6)

In [10]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [37]:
#lets remove the columns with null topics
df = df[df["topics"].str.len() != 0]
df.head()

Unnamed: 0,id,title,contents,date,topics,components,new_contents
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],"[U.S., Department, Justice, U.S., Environmenta..."
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],"[131, count, criminal, indictment, unseal, tod..."
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...","[United, States, Attorney, Office, Middle, Dis..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],"[21st, Century, Oncology, LLC, agree, pay, $, ..."
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]","[21st, Century, Oncology, Inc., certain, subsi..."


In [16]:
df.shape

(4688, 6)

In [38]:
df["new_contents"] = df["contents"].apply(preprocess_text)
df.head()

Unnamed: 0,id,title,contents,date,topics,components,new_contents
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unseal today Bos...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agree pay $ 19.75 mi...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiary ...


In [39]:
# now we have a new column with preprocessed text
#lets check the length of the new column's particular row 
df["new_contents"][4]

'U.S. Department Justice U.S. Environmental Protection Agency EPA Rhode Island Department Environmental Management RIDEM announce today subsidiary Stanley Black Decker Inc.—Emhart Industries Inc. Black Decker Inc.—have agree clean dioxin contaminate sediment soil Centredale Manor Restoration Project Superfund Site North Providence Johnston Rhode Island \xa0  pleased reach resolution collaborative work responsible party EPA stakeholder say \xa0 Acting Assistant Attorney General Jeffrey H. Wood Justice Department \xa0 Environment Natural Resources Division today settlement end protract litigation allow important work underway restore healthy environment citizen live Centredale Manor Site Woonasquatucket River settlement demonstrate tremendous progress achieve work responsible party state federal partner expedite site entire Superfund remediation process say EPA Acting Administrator Andrew Wheeler Centredale Manor Site National Priorities List 18 year take charge ensure Agency make good p

In [32]:
len(df["contents"][4])

6286