## Scraping NYT Homepage for URL's to articles:

In [2]:
import re
import requests
nyt = requests.get('https://www.nytimes.com/section/todayspaper').text
print('Successfully requested today\'s paper from the NYT')

Successfully requested today's paper from the NYT


In [5]:
def unique(list1):
    # initialize a null list
    unique_list = []
     
    # traverse for all elements
    for x in list1:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x.replace('\"',""))
    return unique_list

urls = re.findall(r'href=(["\/\d[\w\-\/\.]+?\.html)"', nyt)
unique_urls = unique(urls)
print('Successfully added unique article URLs to list')
print('Total articles: ',len(unique_urls))

Successfully added unique article URLs to list
Total articles:  48


# Storing text content of each article in to a list of lists:

In [6]:
articles_html = []

for i in range(len(unique_urls)):
    article = requests.get('https://www.nytimes.com' + unique_urls[i]).text
    articles_html.append(article)
print('Successfully added each article HTML data to list')
    
articles = []
try:
    for i in range(len(articles_html)):
        paras = re.findall(r'<p class="css-axufdj evys1bk0">(.+?)</p>', articles_html[i])
        paras = list(map(lambda x: re.sub(r'<[^>]+?>', '', x), paras))
        articles.append(paras)
except:
    pass

print('Successfully extracted article paragraph text data to list')

Successfully added each article HTML data to list
Successfully extracted article paragraph text data to list


# Individual Article Text Analysis:

##### Splitting all paragraphs of first article in to individual words:


In [12]:
index = 0
print('First 3 paragraphs of article 1 paragraphs split into words:\n')
for parag in articles[0][:3]:
    index += 1
    print('Paragraph ' + str(index) + ' in sentence form: ')
    print(parag)
    words = parag.split()
    print('\n')
    print('Paragraph ' + str(index) + ' in word form: ')
    print(words)
    print('\n')

First 3 paragraphs of article 1 paragraphs split into words:

Paragraph 1 in sentence form: 
Follow live coverage of the Bronx apartment fire. 


Paragraph 1 in word form: 
['Follow', 'live', 'coverage', 'of', 'the', 'Bronx', 'apartment', 'fire.']


Paragraph 2 in sentence form: 
Nineteen people, including nine children, were killed on Sunday when an apartment fire started by a malfunctioning space heater sent smoke billowing through a Bronx high-rise, officials said, in the deadliest fire New York City had seen in more than three decades.


Paragraph 2 in word form: 
['Nineteen', 'people,', 'including', 'nine', 'children,', 'were', 'killed', 'on', 'Sunday', 'when', 'an', 'apartment', 'fire', 'started', 'by', 'a', 'malfunctioning', 'space', 'heater', 'sent', 'smoke', 'billowing', 'through', 'a', 'Bronx', 'high-rise,', 'officials', 'said,', 'in', 'the', 'deadliest', 'fire', 'New', 'York', 'City', 'had', 'seen', 'in', 'more', 'than', 'three', 'decades.']


Paragraph 3 in sentence form: 


##### Cleaning up punctuation:

In [13]:
import string
print(string.punctuation)
#list of punctuation symbols in library

punc = string.punctuation + '’‘'
#adding more special characters to special punctuation library

def strip_punctuation(st):
     return ''.join(i for i in st if i not in punc)
     #returns only the characters in the string that are not in the special punctuation library

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


##### Creating count of each word in the article:

In [19]:
wordcounts = {}                          
for parag in articles[0][:]:          
    words = parag.split()
    for word in words:
        word = word.lower()
        word = strip_punctuation(word)
        if word in wordcounts:
            wordcounts[word] += 1
        else:
            wordcounts[word] = 1
            
sorted_wordcounts = sorted(wordcounts.items(), key=lambda x: x[1], reverse=True)
print('\nTop 5 words by number of appearances in article:\n', sorted_wordcounts[:5])

print('\nUnsorted wordcounts:\n', wordcounts)



Top 5 words by number of appearances in article:
 [('the', 97), ('in', 35), ('a', 34), ('and', 31), ('fire', 30)]

Unsorted wordcounts:
 {'follow': 1, 'live': 1, 'coverage': 1, 'of': 29, 'the': 97, 'bronx': 7, 'apartment': 8, 'fire': 30, 'nineteen': 1, 'people': 9, 'including': 3, 'nine': 1, 'children': 4, 'were': 13, 'killed': 2, 'on': 12, 'sunday': 6, 'when': 5, 'an': 5, 'started': 3, 'by': 13, 'a': 34, 'malfunctioning': 1, 'space': 2, 'heater': 3, 'sent': 1, 'smoke': 7, 'billowing': 1, 'through': 4, 'highrise': 1, 'officials': 4, 'said': 26, 'in': 35, 'deadliest': 2, 'new': 2, 'york': 1, 'city': 4, 'had': 7, 'seen': 1, 'more': 2, 'than': 1, 'three': 3, 'decades': 1, 'additional': 1, '44': 1, 'injured': 2, '13': 2, 'them': 5, 'critically': 1, 'after': 3, 'occupants': 1, 'thirdfloor': 2, 'where': 5, 'fled': 1, 'without': 1, 'closing': 1, 'door': 1, 'behind': 1, 'commissioner': 2, 'daniel': 1, 'nigro': 2, 'at': 10, 'news': 3, 'conference': 2, 'scene': 2, '“smoke': 1, 'spread': 2, 'thr