# Using the NLTK library to label undesired article content (clutter)

### Importing libraries and installing NLTK data

In [1]:
import json, jmespath
import pandas as pd
import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk.chunk import conlltags2tree, tree2conlltags

# Fetch every NLTK resource this notebook asks for. The result of the last
# download is kept as the cell's final expression so it is still displayed.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'wordnet',
    'ieer',
    'tagsets',
)

for resource_name in NLTK_RESOURCES:
    download_ok = nltk.download(resource_name)

download_ok

[nltk_data] Downloading package punkt to C:\Users\Felipe
[nltk_data]     Marineli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Felipe Marineli\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Felipe
[nltk_data]     Marineli\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Felipe
[nltk_data]     Marineli\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Felipe
[nltk_data]     Marineli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package ieer to C:\Users\Felipe
[nltk_data]     Marineli\AppData\Roaming\nltk

True

### Loading/parsing the json file and extracting the text field

In [2]:
# url for reference: https://www.wsav.com/news/national-news/student-suspended-after-posting-warnings-of-rapist/

# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of the sample (ideally relative to the notebook) before running elsewhere.
path_to_json = "C:/Users/Felipe Marineli/sample2.json"
# JMESPath expression that selects the article text inside the parsed document.
f = "body.content.text"

# Binary mode lets json.load detect the file's encoding on its own.
with open(path_to_json, 'rb') as json_sample:
    parsed_json = json.load(json_sample)

expression = jmespath.compile(f)

sentence = expression.search(parsed_json)

# A JMESPath search returns None (it does not raise) when the path is absent.
# Fail here with a clear message instead of later on `sentence.strip()`.
if not isinstance(sentence, str):
    raise ValueError(
        f"Expected a string at {f!r} in {path_to_json!r}, "
        f"got {type(sentence).__name__}"
    )

sentence

'CAPE ELIZABETH, Maine (WCSH) – A sophomore at Maine’s Cape Elizabeth High School was suspended for bullying after she left sticky notes in two girls bathrooms reading, “There’s a rapist in our school and you know who it is.” “It makes me angry that I’m being punished for bullying and a rapist isn’t being punished for raping people,” Aela Mansmann said Friday. “I felt this was important — that this was common knowledge.” Mansmann said allegations of sexual assaults have occurred for years and the administration hasn’t listened to the allegations. She said she posted the notes because she and other students don’t feel safe at school. “On a day-to-day level we don’t feel believed,” she said. “We don’t feel supported.” School officials say the law requires they are required by law to investigate any time a student bullies another student, and said they are confident in their investigation of the incident. Read more: http://bit.ly/2AYfgGE Related Content Police: High school coach lifted $4

### Tokenizing the text body

In [3]:
# Load the English Punkt sentence tokenizer and split the article text
# into sentences (kept under the name `tokens` for the cells below).
PUNKT_ENGLISH = 'tokenizers/punkt/english.pickle'
sent_detector = nltk.data.load(PUNKT_ENGLISH)

article_text = sentence.strip()
tokens = sent_detector.tokenize(article_text)

# Show one sentence per block, separated by a dashed rule.
print(*tokens, sep='\n-----\n')

CAPE ELIZABETH, Maine (WCSH) – A sophomore at Maine’s Cape Elizabeth High School was suspended for bullying after she left sticky notes in two girls bathrooms reading, “There’s a rapist in our school and you know who it is.” “It makes me angry that I’m being punished for bullying and a rapist isn’t being punished for raping people,” Aela Mansmann said Friday.
-----
“I felt this was important — that this was common knowledge.” Mansmann said allegations of sexual assaults have occurred for years and the administration hasn’t listened to the allegations.
-----
She said she posted the notes because she and other students don’t feel safe at school.
-----
“On a day-to-day level we don’t feel believed,” she said.
-----
“We don’t feel supported.” School officials say the law requires they are required by law to investigate any time a student bullies another student, and said they are confident in their investigation of the incident.
-----
Read more: http://bit.ly/2AYfgGE Related Content Police

### Saving the content in a data frame and labeling the sentences based on _strings_

In [4]:
# One row per sentence. Naming the column at construction time (instead of
# renaming afterwards) also works when `tokens` is empty; the separate
# `df.columns = [...]` assignment raised "Length mismatch" in that case.
df = pd.DataFrame(tokens, columns=['text_chunk'])
df

Unnamed: 0,text_chunk
0,"CAPE ELIZABETH, Maine (WCSH) – A sophomore at ..."
1,“I felt this was important — that this was com...
2,She said she posted the notes because she and ...
3,"“On a day-to-day level we don’t feel believed,..."
4,“We don’t feel supported.” School officials sa...
5,Read more: http://bit.ly/2AYfgGE Related Conte...


In [5]:
# Flag a sentence as clutter when it contains any of the boilerplate markers.
# The match is deliberately case-insensitive: str.contains is case-sensitive
# by default, so the pattern "Related content" could never hit the article's
# "Related Content" heading.
clutter_mask = pd.Series(False, index=df.index)
for marker in ("Read more", "Related content"):
    # regex=False: the markers are literal text, not patterns.
    clutter_mask |= df['text_chunk'].str.contains(
        marker, case=False, regex=False, na=False
    )

# Keep the 0.0 / 1.0 float encoding the column had before, without the
# intermediate NaN-then-fillna step.
df['clutter'] = clutter_mask.astype(float)

df

Unnamed: 0,text_chunk,clutter
0,"CAPE ELIZABETH, Maine (WCSH) – A sophomore at ...",0.0
1,“I felt this was important — that this was com...,0.0
2,She said she posted the notes because she and ...,0.0
3,"“On a day-to-day level we don’t feel believed,...",0.0
4,“We don’t feel supported.” School officials sa...,0.0
5,Read more: http://bit.ly/2AYfgGE Related Conte...,1.0


## Another approach for finding clutter: tokenization by words + tagging + regex

In [6]:
# Fresh, unlabeled copy of the sentence table for the regex-based approach.
# Passing `columns=` at construction avoids the "Length mismatch" error the
# separate column rename raised when `tokens` is empty.
df2 = pd.DataFrame(tokens, columns=['text_chunk'])

### Tokenizing by words

In [7]:
# Split the whole article text into word/punctuation tokens.
# Note: in the output some sentence-final periods stay glued to the word
# (e.g. 'is.', 'knowledge.') -- presumably because they are followed by a
# typographic closing quote; verify if exact token boundaries matter.
word_tokens = nltk.word_tokenize(sentence)
word_tokens

['CAPE',
 'ELIZABETH',
 ',',
 'Maine',
 '(',
 'WCSH',
 ')',
 '–',
 'A',
 'sophomore',
 'at',
 'Maine',
 '’',
 's',
 'Cape',
 'Elizabeth',
 'High',
 'School',
 'was',
 'suspended',
 'for',
 'bullying',
 'after',
 'she',
 'left',
 'sticky',
 'notes',
 'in',
 'two',
 'girls',
 'bathrooms',
 'reading',
 ',',
 '“',
 'There',
 '’',
 's',
 'a',
 'rapist',
 'in',
 'our',
 'school',
 'and',
 'you',
 'know',
 'who',
 'it',
 'is.',
 '”',
 '“',
 'It',
 'makes',
 'me',
 'angry',
 'that',
 'I',
 '’',
 'm',
 'being',
 'punished',
 'for',
 'bullying',
 'and',
 'a',
 'rapist',
 'isn',
 '’',
 't',
 'being',
 'punished',
 'for',
 'raping',
 'people',
 ',',
 '”',
 'Aela',
 'Mansmann',
 'said',
 'Friday',
 '.',
 '“',
 'I',
 'felt',
 'this',
 'was',
 'important',
 '—',
 'that',
 'this',
 'was',
 'common',
 'knowledge.',
 '”',
 'Mansmann',
 'said',
 'allegations',
 'of',
 'sexual',
 'assaults',
 'have',
 'occurred',
 'for',
 'years',
 'and',
 'the',
 'administration',
 'hasn',
 '’',
 't',
 'listened',
 'to',

### Tagging

In [8]:
# Attach a part-of-speech tag to every token, giving (word, tag) pairs.
# The tags on typographic punctuation are unreliable in the output below
# (e.g. '–' is tagged VBZ and '’' is tagged NNP), so the chunk rules further
# down should not lean on them.
tagged = nltk.pos_tag(word_tokens)
tagged

[('CAPE', 'NNP'),
 ('ELIZABETH', 'NNP'),
 (',', ','),
 ('Maine', 'NNP'),
 ('(', '('),
 ('WCSH', 'NNP'),
 (')', ')'),
 ('–', 'VBZ'),
 ('A', 'DT'),
 ('sophomore', 'NN'),
 ('at', 'IN'),
 ('Maine', 'NNP'),
 ('’', 'NNP'),
 ('s', 'NN'),
 ('Cape', 'NNP'),
 ('Elizabeth', 'NNP'),
 ('High', 'NNP'),
 ('School', 'NNP'),
 ('was', 'VBD'),
 ('suspended', 'VBN'),
 ('for', 'IN'),
 ('bullying', 'NN'),
 ('after', 'IN'),
 ('she', 'PRP'),
 ('left', 'VBD'),
 ('sticky', 'JJ'),
 ('notes', 'NNS'),
 ('in', 'IN'),
 ('two', 'CD'),
 ('girls', 'NNS'),
 ('bathrooms', 'NNS'),
 ('reading', 'VBG'),
 (',', ','),
 ('“', 'CC'),
 ('There', 'EX'),
 ('’', 'NNP'),
 ('s', 'VBD'),
 ('a', 'DT'),
 ('rapist', 'NN'),
 ('in', 'IN'),
 ('our', 'PRP$'),
 ('school', 'NN'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('know', 'VBP'),
 ('who', 'WP'),
 ('it', 'PRP'),
 ('is.', 'VBZ'),
 ('”', 'JJ'),
 ('“', 'IN'),
 ('It', 'PRP'),
 ('makes', 'VBZ'),
 ('me', 'PRP'),
 ('angry', 'JJ'),
 ('that', 'IN'),
 ('I', 'PRP'),
 ('’', 'VBP'),
 ('m', 'JJ'),
 ('being',

### Finding and labeling clutter with regex

In [9]:
# Print the tag reference used to look up the tags (VBN, RBR, NNP, ...) that
# the chunk grammar in the next cell is written against.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [10]:
# Chunk grammar over POS tags, one rule per clutter pattern:
#   RC  -- <VBN><RBR><:>            : in this article it matches "Read more:"
#   RC2 -- <NNP><NNP|NNS><NNP><:>   : in this article it matches
#                                     "Related Content Police:"
# The rules are tuned to how the tagger labelled this one sample; other
# articles may need different tag sequences.
grammar = r"""
  RC:     {<VBN><RBR><:>}
  RC2:    {<NNP><NNP|NNS><NNP><:>}
"""

cp = nltk.RegexpParser(grammar)

# Parse the tagged tokens; matched spans become RC / RC2 subtrees under the
# root S node, everything else stays a plain (word, tag) leaf.
parsed_cp = cp.parse(tagged)
print(parsed_cp)

(S
  CAPE/NNP
  ELIZABETH/NNP
  ,/,
  Maine/NNP
  (/(
  WCSH/NNP
  )/)
  –/VBZ
  A/DT
  sophomore/NN
  at/IN
  Maine/NNP
  ’/NNP
  s/NN
  Cape/NNP
  Elizabeth/NNP
  High/NNP
  School/NNP
  was/VBD
  suspended/VBN
  for/IN
  bullying/NN
  after/IN
  she/PRP
  left/VBD
  sticky/JJ
  notes/NNS
  in/IN
  two/CD
  girls/NNS
  bathrooms/NNS
  reading/VBG
  ,/,
  “/CC
  There/EX
  ’/NNP
  s/VBD
  a/DT
  rapist/NN
  in/IN
  our/PRP$
  school/NN
  and/CC
  you/PRP
  know/VBP
  who/WP
  it/PRP
  is./VBZ
  ”/JJ
  “/IN
  It/PRP
  makes/VBZ
  me/PRP
  angry/JJ
  that/IN
  I/PRP
  ’/VBP
  m/JJ
  being/VBG
  punished/VBN
  for/IN
  bullying/NN
  and/CC
  a/DT
  rapist/NN
  isn/NN
  ’/NNP
  t/NN
  being/VBG
  punished/VBN
  for/IN
  raping/VBG
  people/NNS
  ,/,
  ”/JJ
  Aela/NNP
  Mansmann/NNP
  said/VBD
  Friday/NNP
  ./.
  “/NN
  I/PRP
  felt/VBD
  this/DT
  was/VBD
  important/JJ
  —/NN
  that/IN
  this/DT
  was/VBD
  common/JJ
  knowledge./NN
  ”/NNP
  Mansmann/NNP
  said/VBD
  allegations/NNS
  

In [11]:
# Sanity check: the parse result is an nltk Tree, which is what the
# tree-walking helpers below test for.
type(parsed_cp)

nltk.tree.Tree

### Extracting the labeled tree leaves into a list

In [12]:
# Label of subtrees that getNodes should not report.
ROOT = 'ROOT'


def getNodes(parent):
    """Print the label and leaves of each direct subtree of ``parent``.

    Children that are not exactly of type ``nltk.Tree`` (i.e. plain leaves)
    and subtrees labelled ``ROOT`` are skipped. Nothing is returned; the
    report is written to standard output.
    """
    for child in parent:
        # Plain leaves carry no chunk label -- nothing to report.
        if type(child) is not nltk.Tree:
            continue
        if child.label() == ROOT:
            continue
        print("Label:", child.label())
        print("Leaves:", child.leaves())

# List the chunks the grammar found in the article (RC / RC2 subtrees).
getNodes(parsed_cp)

Label: RC
Leaves: [('Read', 'VBN'), ('more', 'RBR'), (':', ':')]
Label: RC2
Leaves: [('Related', 'NNP'), ('Content', 'NNP'), ('Police', 'NNP'), (':', ':')]


In [13]:
def extractNodes(parent):
    """Return the words of the clutter chunks found directly under ``parent``.

    Only direct children that are ``nltk.Tree`` nodes labelled ``'RC'`` or
    ``'RC2'`` are considered. For each such chunk the word (first element)
    of up to its first three leaves is appended, in document order.

    Parameters
    ----------
    parent : iterable
        A chunk-parse result whose items are either leaves (tuples whose
        first element is the word) or ``nltk.Tree`` subtrees.

    Returns
    -------
    list
        The collected words; empty when no RC / RC2 chunk is present.
    """
    clutter_labels = ('RC', 'RC2')
    words = []

    for node in parent:
        # The previous test `node.label() == 'RC' or 'RC2'` was always truthy
        # (a non-empty string), so every subtree was collected regardless of
        # its label. Membership testing restricts it to the clutter chunks.
        if isinstance(node, nltk.Tree) and node.label() in clutter_labels:
            # Keep the "first three leaves" contract; slicing (rather than
            # indexing [0], [1], [2]) cannot raise on a shorter chunk.
            words.extend(leaf[0] for leaf in node.leaves()[:3])

    return words
    
# Words taken from the chunks found in the article; the next section uses
# them to label sentences.
extractNodes(parsed_cp)

['Read', 'more', ':', 'Related', 'Content', 'Police']

### Labeling each `text_chunk` based on the existence of at least `three values` from the extracted list

In [14]:
# A sentence counts as clutter when at least this many of the extracted
# chunk words occur in it.
MIN_MATCHING_WORDS = 3

clutter_words = extractNodes(parsed_cp)

# Count, per sentence, how many of the extracted words it contains.
# regex=False: the words are literal tokens; used as regex patterns, tokens
# such as '(' or '.' would raise or match far too much.
# Starting the sum from a zero Series keeps this working when no clutter
# words were extracted (pd.concat raises on an empty list).
matches_per_chunk = sum(
    (
        df2['text_chunk'].str.contains(word, regex=False, na=False).astype(int)
        for word in clutter_words
    ),
    pd.Series(0, index=df2.index),
)

# Keep the 0.0 / 1.0 float encoding the column had before.
df2['clutter'] = (matches_per_chunk >= MIN_MATCHING_WORDS).astype(float)

df2

Unnamed: 0,text_chunk,clutter
0,"CAPE ELIZABETH, Maine (WCSH) – A sophomore at ...",0.0
1,“I felt this was important — that this was com...,0.0
2,She said she posted the notes because she and ...,0.0
3,"“On a day-to-day level we don’t feel believed,...",0.0
4,“We don’t feel supported.” School officials sa...,0.0
5,Read more: http://bit.ly/2AYfgGE Related Conte...,1.0


#### Future plans: parse many more _json_ files, extract features, and feed the resulting data set into a ML model

### Bonus: Playing around with Named Entity Recognition (using NLTK's rather inaccurate built-in chunker)

In [15]:
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples; the
# RC / RC2 chunk labels travel along as the third element (e.g. 'B-RC2').
iob_tagged = tree2conlltags(parsed_cp)
# NOTE(review): ne_chunk is presumably meant to receive (word, tag) pairs;
# here it is given the triples, and the resulting leaves keep all three
# fields -- confirm this is intended.
ne_tree = ne_chunk(iob_tagged)

In [16]:
# Print the named-entity chunks; several labels below are visibly wrong
# (e.g. 'Cape Elizabeth High School' comes out as PERSON).
getNodes(ne_tree)

Label: GPE
Leaves: [('Maine', 'NNP', 'O')]
Label: ORGANIZATION
Leaves: [('WCSH', 'NNP', 'O')]
Label: FACILITY
Leaves: [('Maine', 'NNP', 'O')]
Label: PERSON
Leaves: [('Cape', 'NNP', 'O'), ('Elizabeth', 'NNP', 'O'), ('High', 'NNP', 'O'), ('School', 'NNP', 'O')]
Label: PERSON
Leaves: [('Aela', 'NNP', 'O'), ('Mansmann', 'NNP', 'O')]
Label: PERSON
Leaves: [('Mansmann', 'NNP', 'O')]
Label: ORGANIZATION
Leaves: [('School', 'NNP', 'O')]
Label: PERSON
Leaves: [('Related', 'NNP', 'B-RC2'), ('Content', 'NNP', 'I-RC2')]
Label: ORGANIZATION
Leaves: [('School', 'NNP', 'O')]
Label: PERSON
Leaves: [('Rapper', 'NNP', 'O')]
