In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk
from nltk.tag import pos_tag

In [2]:
url = 'https://www.nytimes.com/2017/02/21/us/politics/transgender-students-trump-obama.html?ref=politics&_r=0'

In [3]:
def find(paragraph, ch='"'):
    idxs =  [i for i, ltr in enumerate(paragraph) if ltr == ch]
    start, stop = idxs[0::2], idxs[1::2]
    quotes = [paragraph[idx: stop[i] + 1] for i, idx in enumerate(start)]
    if len(quotes) > 0:
        return quotes

def new_york_times(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    
    body = soup.find('article', {'id' : 'story'}).find_all('p', {'class' : 'story-body-text story-content'})
    text = ''''''
    quotes = []
    
    for paragraph in body:
        paragraph_text = paragraph.text.replace('“', '"').replace('”', '"').replace('."', '." ')
        text += paragraph_text + ' '
        results = find(paragraph_text)
        
        if results:
            quotes.extend(results)
        
    
    return text, quotes

article, quotes = new_york_times(url)

In [11]:
pos_tags = pd.read_excel('Part of Speech.xlsx', index_col=0)
pos_lookup = pos_tags.to_dict()['Description']

In [4]:
all_sentences = nltk.sent_tokenize(article[13:])
sentence_dict, frames = {}, []
for i, sentence in enumerate(all_sentences):
    sentence_dict[i] = nltk.pos_tag(nltk.word_tokenize(sentence))
    df = pd.concat([pd.Series(sentence_dict[i]).apply(lambda x: x[1]), pd.Series(sentence_dict[i]).apply(lambda x: x[0])], axis=1)
    df['Sentence'] = i+1
    frames.append(df)
# structure_frame
df = pd.concat(frames)

In [12]:
spicer = df[df['Sentence'] == 5]
spicer.columns = ['pos', 'word', 'Sentence']
spicer.replace({'pos' : pos_lookup
    })

Unnamed: 0,pos,word,Sentence
0,"Proper noun, singular",Mr.,5
1,"Proper noun, singular",Spicer,5
2,"Verb, past tense",said,5
3,Preposition or subordinating conjunction,that,5
4,"Noun, plural",officials,5
5,Preposition or subordinating conjunction,at,5
6,Determiner,the,5
7,"Proper noun, singular",Justice,5
8,Coordinating conjunction,and,5
9,"Proper noun, singular",Education,5


In [5]:
quotes

['"this is a states’ rights issue and not one for the federal government,"',
 '"not one for the federal government"',
 '"To cloak this in federalism ignores the vital and historic role that federal law plays in ensuring that all children (including L.G.B.T. students) are able to attend school free from discrimination,"',
 '"outrageous,"',
 '"a hugely consequential decision."',
 '"use the bathroom they feel is appropriate,"',
 '"I understand the demands for justice and fairness made by our L.G.B.T. community. I will ensure that the statutes protecting their civil rights and their safety are fully enforced."',
 '"It’s astounding to me that this administration would decide that it’s going to stop standing up for young children in crisis,"',
 '"That’s a bad development any way you look at it."']

In [14]:
article.find('said')

190

In [16]:
article

'WASHINGTON — President Trump appears on the verge of reversing protections put in place by the Obama administration to stop discrimination against transgender students in schools, officials said Tuesday. Civil rights advocates said that the possible rollback of protections, which could come as soon as Wednesday, would represent a major step backward after the enormous progress made by gay and transgender people in recent years. The administration signaled that an announcement was imminent on the question of whether transgender students should be able to use the bathroom of their choosing — and that Mr. Trump could well come down differently from his predecessor, President Barack Obama, a forceful advocate for transgender students. Mr. Trump believes that "this is a states’ rights issue and not one for the federal government," Sean Spicer, the White House press secretary, told reporters on Tuesday. Mr. Spicer said that officials at the Justice and Education Departments were reviewing a

In [105]:
def outside_quotes(article):

    # Find Quotes
    ch = '"'
    idxs =  [i for i, ltr in enumerate(article) if ltr == ch]

    # Get Text surrounding each quote
    starts = idxs[1::2]
    starts.insert(0, 0)

    ends = idxs[::2][1:]
    ends.insert(0, idxs[0])
    ends.append(-1)

    # Slice the text
    slices = pd.MultiIndex.from_arrays([starts, ends]).to_native_types()

    not_quotes = []
    for beg, end in slices:
        not_quotes.append(article[int(beg):int(end)])
    
    return not_quotes

not_quotes = outside_quotes(article)

In [112]:
# For each section AFTER quotes, split into sentences. 
# If in the first sentence after a quote, there is a 
#     NAME + {COMMA} + DESCRIPTION of PERSON + {COMMA} + VERB + Audience
sentences_after_first_quote = nltk.sent_tokenize(not_quotes[1])
if ',' in sentences_after_first_quote[0]:
    attributes = sentences_after_first_quote[0].split(',')
    name_title, position_background, action_audience = attributes

In [115]:
name_title, position_background, action_audience

('" Sean Spicer',
 ' the White House press secretary',
 ' told reporters on Tuesday.')

In [119]:
names, titles, actions = [], [], []
for section in not_quotes[1:]:
    first_sent = nltk.sent_tokenize(section)[0]
    if ',' in first_sent:
        attributes = first_sent.split(',')
        try:
            names.append(attributes[0]), titles.append(attributes[1]), actions.append(attributes[2])
            print('succes')
        except:
            print(attributes)

succes
['" conflicts squarely with the view of Obama administration officials', ' who saw gay and transgender issues as perhaps the biggest civil rights fight of the modern era.']
succes
succes


In [120]:
names

['" Sean Spicer',
 '" conflicts squarely with the view of Obama administration officials',
 '" and Rob Flaherty',
 '" and he said that Caitlyn Jenner']

In [121]:
quotes

['"this is a states’ rights issue and not one for the federal government,"',
 '"not one for the federal government"',
 '"To cloak this in federalism ignores the vital and historic role that federal law plays in ensuring that all children (including L.G.B.T. students) are able to attend school free from discrimination,"',
 '"outrageous,"',
 '"a hugely consequential decision."',
 '"use the bathroom they feel is appropriate,"',
 '"I understand the demands for justice and fairness made by our L.G.B.T. community. I will ensure that the statutes protecting their civil rights and their safety are fully enforced."',
 '"It’s astounding to me that this administration would decide that it’s going to stop standing up for young children in crisis,"',
 '"That’s a bad development any way you look at it."']

Unnamed: 0,0,1,Sentence
0,NNP,President,1
1,NNP,Trump,1
2,VBZ,appears,1
3,IN,on,1
4,DT,the,1
5,NN,verge,1
6,IN,of,1
7,VBG,reversing,1
8,NNS,protections,1
9,VBN,put,1


In [150]:
pd.concat([pd.Series(sentence_dict[i]).apply(lambda x: x[1]), pd.Series(sentence_dict[i]).apply(lambda x: x[0])], axis=1)

Unnamed: 0,0,1
0,``,``
1,VB,That’s
2,DT,a
3,JJ,bad
4,NN,development
5,DT,any
6,NN,way
7,PRP,you
8,VBP,look
9,IN,at
