In [1]:
import requests

import pandas as pd

import os
import json
from pathlib import Path, PurePath


from newsworthy.utils import get_path_to_data_parsed_dir

from newsworthy.get_website import WebsiteSource, split

# Tests

In [2]:
# Switch for the smoke-test cells: every cell guarded by `if test:` is skipped when this is False.
test = True

In [3]:
if test:
    # Read the label/URL CSV into memory: one string per line, newline characters kept.
    with open('../data/newsworthy_labels_and_urls.csv') as f:
        lines = list(f)

In [4]:
if test:
    # Choose which line of the CSV to exercise; the commented-out indices were noted as
    # reproducing other outcomes and can be swapped in by hand.
    idx = 2 # successful transmission of data
    #idx = 27 # client error: forbidden for this url
    #idx = 26 # timeout 
    #idx = 313
    # split() yields the label and the URL stored on the selected line
    is_newsworthy, url = split(lines[idx])
    print(f'{is_newsworthy}\n{url}')

False
https://guide.michelin.com/us/en/new-york-state/new-york/restaurant/speedy-romeo


In [5]:
if test:
    # Wrap the sample URL together with its label.
    # NOTE(review): timeout=(4,4) is presumably (connect, read) seconds, as described in
    # process_url's docstring - confirm against WebsiteSource.
    website_source = WebsiteSource(url, label=is_newsworthy, timeout=(4,4))

In [6]:
if test:
    # Issue the request for the sample URL; the outcome is inspected in the following cell.
    website_source.make_http_request()

In [8]:
if test:
    # Print the outcome of the request for manual inspection.
    # NOTE(review): `_response` and `_error` are underscore-prefixed (private by convention)
    # attributes of WebsiteSource; prefer a public accessor if one exists.
    print('1::', website_source.ok)
    print('2::', website_source._response)
    print('3::', website_source._error)
    print('4::', website_source)
    

1:: True
2:: <Response [200]>
4:: None
5:: URL:https://guide.michelin.com/us/en/new-york-state/new-york/restaurant/speedy-romeo
LABEL:False
HEADERS:{'Content-Type': 'text/html;charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Tue, 18 Jan 2022 02:10:54 GMT', 'Server': 'Apache', 'x-frame-options': 'SAMEORIGIN', 'x-application-context': 'application:production', 'Set-Cookie': 'JSESSIONID=1FA3E1C94E15D4B252079E038FA8A854; Path=/; Secure; HttpOnly', 'Content-Language': 'en', 'Content-Encoding': 'gzip', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'x-content-type-options': 'nosniff', 'referrer-policy': 'no-referrer-when-downgrade', 'content-security-policy': "default-src http: https:; script-src http: https: 'unsafe-inline' 'unsafe-eval'; style-src http: https: 'unsafe-inline'; img-src http: https: data:; connect-src http: https: wss://*.hotjar.com", 'feature-policy': "microphone 'none'; camera 'none'; midi 'none'", 'x-xss-protection'

# Get Raw Source Code For Websites

Given the list of URLs in the file `newsworthy_labels_and_urls.csv`,
request the raw source code of each URL and store it locally to avoid hitting the same site multiple times.


In [13]:
def process_url(file_index, line, reprocess=False, verbose=False, timeout=(20,40)):
    '''
    Request one URL and store its metadata (json) and html source in the data/parsed directory.

    file_index:
        the line number in the `newsworthy_labels_and_urls.csv` file
        this will be used to name the metadata and html source files written to the data/parsed directory
    line: 
        the line loaded from the `newsworthy_labels_and_urls.csv` file
        this contains the label and URL
    reprocess: 
        if True, the URL source code is requested and any previously written files in the data/parsed/ directory are overwritten; 
        if False, skips any URLs that have already been processed and have files written in the data/parsed directory
    verbose:
        if True, prints the target file names and reports URLs that are skipped because their files already exist
    timeout:
        (int, int) contains the connect timeout and the read timeout
        - the connect timeout is the number of seconds Requests will wait for the client to establish a connection
        - the read timeout is the number of seconds the client will wait for the server to send a response

    returns:
        False if the URL was skipped (both files already exist and `reprocess` is False);
        True once the request was made and both write calls returned
        (True does not by itself mean the HTTP request succeeded)

    raises:
        anything raised by `split` or by the `WebsiteSource` calls is re-raised unchanged;
        before re-raising, files that this call newly created are removed so the URL
        is not mistaken for an already-processed one on the next run
    '''

    # file names
    prefix = f'{str(file_index).zfill(4)}'
    json_file_name = f'{prefix}.json'
    html_file_name = f'{prefix}.html'

    if verbose: print(json_file_name, html_file_name)

    data_path = get_path_to_data_parsed_dir()

    fj = Path(data_path, json_file_name)
    fh = Path(data_path, html_file_name)

    # remember what was on disk before this call so a failure only cleans up its own files
    json_existed = fj.exists()
    html_existed = fh.exists()

    if json_existed and html_existed and not reprocess:
        if verbose:
            print(f'files exist for: {prefix}')
        return False

    try:
        fj.touch()
        fh.touch()
        print(f'created files for: {prefix}')

        label, url = split(line)

        website_source = WebsiteSource(url, label=label, timeout=timeout)
        website_source.make_http_request()

        website_source.write_metadata_to_json(fj)
        website_source.write_html_source_to_file(fh)
    except BaseException:
        # BaseException on purpose: interrupting the kernel mid-request (KeyboardInterrupt)
        # must not leave empty placeholder files that make the skip check above pass.
        for path, existed_before in ((fj, json_existed), (fh, html_existed)):
            if not existed_before and path.exists():
                path.unlink()
        raise

    return True

# Tests

In [14]:
if test:
    # Re-use the sample line selected through `idx` in the earlier test cell.
    line = lines[idx]
    
    # reprocess=False: the call is skipped (returns False) when both files for this index already exist
    process_url(file_index=idx, line=line, reprocess=False, verbose=True)

0357.json 0357.html
files exist for: 0357


# Run All

In [16]:
# Load every line of the label/URL CSV (newline characters kept) for the full run.
with open('../data/newsworthy_labels_and_urls.csv') as f:
    lines = list(f)

In [19]:
# Settings forwarded to process_url for every line of the CSV.
verbose = False
reprocess = False

processed_file_count = 0
# The loop variable is deliberately not called `idx`: the test cells use `idx` to pick
# their sample line, and overwriting it here silently changes what they exercise when
# they are re-run after this loop.
for file_index, line in enumerate(lines):

    # process_url returns True when it handled the URL and False when it skipped it
    status = process_url(file_index, line, reprocess=reprocess, verbose=verbose)
    processed_file_count += int(status)

print(f'Done. Processed {processed_file_count} files successfully')

Done procesed 0 files successfully
