In [None]:
from util_html import *

keyword="abortus"
url = "https://nos.nl/zoeken/?q=" + keyword

print('The search request URL:', url)

parser_content= url_to_html(url)

# The class for the search results has a weird name
# You can find it out when you look at the HTML source in your web browser
search_results = parser_content.find_all("a", {"class":"sc-f75afcb6-4 isiLEZ"})

# For comparison, print the full output, scroll through it and make sure you find the search results in there. 
print(search_results)

In [103]:
import os
def create_folders_if_not_exist(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)

def create_folders(dir):
    '''
    Creates folders in the directory if they do not exist
    Folder Structure:
    ├── * input directory
    │   ├── eng
    │   │   ├── train
    │   │   ├── test
    │   ├── nld
    │   │   ├── train
    │   │   ├── test
    '''
    create_folders_if_not_exist(dir)
    
    lang1_dir = os.path.join(dir, r'eng')
    create_folders_if_not_exist(lang1_dir)
    lang1_train_dir = os.path.join(lang1_dir, r'train')
    create_folders_if_not_exist(lang1_train_dir)
    lang1_test_dir = os.path.join(lang1_dir, r'test')
    create_folders_if_not_exist(lang1_test_dir)
        
    lang2_dir = os.path.join(dir, r'nld')
    create_folders_if_not_exist(lang2_dir)
    lang2_train_dir = os.path.join(lang2_dir, r'train')
    create_folders_if_not_exist(lang2_train_dir)
    lang2_test_dir = os.path.join(lang2_dir, r'test')
    create_folders_if_not_exist(lang2_test_dir)

    return lang1_train_dir, lang1_test_dir, lang2_train_dir, lang2_test_dir

dirs = create_folders('/home/arimo/Desktop/Studily/LD/labs/Lab1_Crawling/code/out')

In [18]:
import requests
import re
from bs4 import BeautifulSoup
import html5lib
import pandas as pd

In [5]:
def url_to_html(url):
    """Scrapes the html content from a web page. Takes a URL string as input and returns an html object. """
    
    # Get the html content
    headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    }
    res = requests.get(url, headers=headers)
    #res = requests.get(url + ".pdf", headers={"User-Agent": "XY"})
    html = res.text
    parser_content = BeautifulSoup(html, 'html5lib')
    return parser_content

In [41]:
def get_nos_metadata(keyword, page_range):
    urls = []
    titles = []
    times = []
    domain = "https://nos.nl/"
    for i in range(1,page_range+1):
        search_url = "https://nos.nl/zoeken/?q=" + keyword + "&page=" + str(i)
        parser_content= url_to_html(search_url)
        result_url = parser_content.find_all("a", {"class":"sc-f75afcb6-4 isiLEZ"})
        result_times = parser_content.find_all("div",{"class":"sc-d6d7be46-0 jjSfnY sc-f75afcb6-6 hGGBnM"})
        result_titles = parser_content.find_all("h2",{"class":"sc-f75afcb6-3 lhteiV"})
        for j, url in enumerate(result_url):
            article_url = domain + url["href"]
            urls.append(article_url)
        for m, title in enumerate(result_titles):
            article_title = re.search(r'\<h2 class\=\"sc\-f75afcb6\-3 lhteiV\"\>(.*?)\<\/h2\>', str(title)).group(1)
            titles.append(article_title)
        for k,time in enumerate(result_times):
            article_time = re.search(r'time datetime\=\"(.*?)\"\>', str(time)).group(1)
            times.append(article_time)
    return times, titles, urls

In [79]:
nos_metadata = get_nos_metadata(keyword = "abortus", page_range = 35)

In [82]:
def remove_nos_videos(metadata):
    '''
    NOS search results sometimes includes video links
    example: https://nos.nl/nieuwsuur/video/2496544-cu-lid-annelijn-door-wie-wil-je-eigenlijk-aardig-gevonden-worden
    This function removes such links
    '''
    df = pd.DataFrame(metadata).T.rename(columns={0:'time', 1:'title', 2:'url'})
    video_pattern = "\/video\/"
    filter = df['url'].str.contains(video_pattern)
    cleaned_df = df[~filter].head(600)
    
    return cleaned_df

In [84]:
cleaned_nos_metadata = remove_nos_videos(nos_metadata)

In [85]:
def nos_metadata_train_test_split(df, n = 20):
    '''
    Split the train/test with n% of test data for nos
    '''
    #df = pd.DataFrame(metadata).T.rename(columns={0:'time', 1:'title', 2:'url'})
    rdf = df.sample(frac=1, random_state=1) # reproducibility
    
    test = int(len(df)*(n/100))
    train = len(df)
    
    test_index = rdf.index[0:test]
    train_index = rdf.index[test:train]
    
    test_metadata = rdf.head(test)
    train_metadata = rdf.tail(train-test)
    
    return train_metadata, test_metadata, train_index, test_index

In [86]:
nos_splittedmedatada = nos_metadata_train_test_split(cleaned_nos_metadata)

In [93]:
nos_splittedmedatada[2]

Int64Index([ 92, 632, 386, 605,  38,  75, 138,  44, 487, 460,
            ...
            198, 313, 434, 565, 649, 144, 160,  80, 261,  41],
           dtype='int64', length=480)

In [55]:
write_nos_metadata(dirs, 'abortus', nos_splittedmedatada)

In [88]:
def get_nos_content_and_section(url):
    '''
    Get the text from url link
    '''
    #print(url)
    soup = BeautifulSoup(requests.get(url).text,"html5lib")
    section_container = soup.select("p",{"class":"sc-f9df6382-7 cMuisv"}) # NOS section class
    section = re.search(r'cMuisv\"\>(.*?)\<\/p\>', str(section_container)).group(1)
    text = ""
    for item in soup.select("p",{"class":"sc-6d77a1d1-0 chzewu"}): # NOS body text class
        text += item.text.strip()
    return text, section

In [97]:
nos_splittedmedatada[0].reset_index()

Unnamed: 0,index,time,title,url
0,92,2023-10-23T03:47:29+0200,"Nog geen nieuwe president Argentinië, tweede s...",https://nos.nl//artikel/2495031-nog-geen-nieuw...
1,632,2023-10-23T03:47:29+0200,"Nog geen nieuwe president Argentinië, tweede s...",https://nos.nl//artikel/2495031-nog-geen-nieuw...
2,386,2023-08-08T23:05:23+0200,Referendum Ohio gaat toekomst abortusrechten b...,https://nos.nl//artikel/2485959-referendum-ohi...
3,605,2023-08-09T05:22:22+0200,"Regels referenda Ohio niet veranderd, overwinn...",https://nos.nl//artikel/2485970-regels-referen...
4,38,2023-08-25T17:29:46+0200,"Van der Staaij heeft omstreden standpunten, ma...",https://nos.nl//artikel/2487978-van-der-staaij...
...,...,...,...,...
475,144,2023-09-07T02:49:47+0200,Hoogste hof Mexico bekrachtigt recht op abortus,https://nos.nl//artikel/2489520-hoogste-hof-me...
476,160,2023-10-23T15:33:23+0200,"Podcast De Dag: Mike Johnson is onbekend, chri...",https://nos.nl//artikel/2495085-podcast-de-dag...
477,80,2023-10-23T15:33:23+0200,"Podcast De Dag: Mike Johnson is onbekend, chri...",https://nos.nl//artikel/2495085-podcast-de-dag...
478,261,2023-10-13T23:25:30+0200,Met vrouwenrechten als splijtzwam gaat Polen d...,https://nos.nl//artikel/2494020-met-vrouwenrec...


In [92]:
from tqdm import tqdm

In [98]:
def write_nos_output_and_section(splitted_metadata, keyword, dirs):
    # train
    print("Scraping Dutch training data from NOS..")
    train_section = []
    train = splitted_metadata[0].reset_index()
    for i in tqdm(range(len(train))):
        title = train['title'][i]
        url = train['url'][i]
        nos = get_nos_content_and_section(url)
        section = nos[1]
        train_section.append(section)
        text = nos[0]
        dir = dirs[2]
        filename = keyword + "_" + str(i) + "_"+ str(title).replace('/','_') + ".txt"
        with open(dir + "/" + filename, "w", encoding = "utf-8") as f:
            f.write(text)
    
    # test
    print("Scraping Dutch test data from NOS..")
    test_section = []
    test = splitted_metadata[1].reset_index()
    for j in tqdm(range(len(test))):
        title = test['title'][j]
        url = test['url'][j]
        nos = get_nos_content_and_section(url)
        section = nos[1]
        test_section.append(section)
        text = nos[0]
        dir = dirs[3]
        filename = keyword + "_" + str(j) + "_" + str(title).replace('/','_') + ".txt"
        with open(dir + "/" + filename, "w", encoding = "utf-8") as f:
            f.write(text)

    return train_section, test_section    

In [120]:
dirs[3]

'/home/arimo/Desktop/Studily/LD/labs/Lab1_Crawling/code/out/nld/test'

In [123]:
test = nos_splittedmedatada[1].reset_index()
keyword = 'abortus'
for j in tqdm(range(len(test))):
    title = test['title'][j]
    url = test['url'][j]
    nos = get_nos_content_and_section(url)
    text = nos[0]
    dir = dirs[3]
    filename = keyword + "_" + str(j) + "_" + str(title).replace('/','_') + ".txt"
    with open(dir + "/" + filename, "w", encoding = "utf-8") as f:
        f.write(text)

100%|█████████████████████████████████████████| 120/120 [00:18<00:00,  6.34it/s]


In [None]:
sections = write_nos_output_and_section(splitted_metadata = nos_splittedmedatada, keyword = 'abortus', dirs=dirs)

In [118]:
def write_nos_metadata(dirs, outfile_keywords, splittedmetadata, sections):
    trainfile = dirs[2] + '/' + outfile_keywords + "_train_metadata.csv"
    testfile = dirs[3] + '/' + outfile_keywords + "_test_metadata.csv"
    splittedmetadata[0]['content'] = sections[0]
    splittedmetadata[0]['content'] = sections[0]
    splittedmetadata[1]['content'] = sections[1]
    splittedmetadata[0].to_csv(trainfile)
    splittedmetadata[1].to_csv(testfile)

In [119]:
write_nos_metadata(dirs = dirs, outfile_keywords='abortus', splittedmetadata=nos_splittedmedatada, sections=sections)

In [2]:
keyword = "abortus"
for i in range(1,32):
    url = "https://nos.nl/zoeken/?q=" + keyword + "&page=" + str(i)
    parser_content= url_to_html(url)
    search_results = parser_content.find_all("a", {"class":"sc-f75afcb6-4 isiLEZ"})

    domain = "https://nos.nl/"
    for i, link in enumerate(search_results):    
        found_url = domain + link["href"]
        print(i, found_url)
    
    # Extract text and add the url as first line
        text = found_url + '\n'+ url_to_string(found_url) 
    
    # Save in file
        dir = "../results/nos_search_results/"
        filename = keyword + "_" + str(i) + ".txt"
        with open(dir + filename, "w", encoding = "utf-8") as f:
            f.write(text)

https://nos.nl/zoeken/?q=abortus&page=1
https://nos.nl/zoeken/?q=abortus&page=2
https://nos.nl/zoeken/?q=abortus&page=3
https://nos.nl/zoeken/?q=abortus&page=4
https://nos.nl/zoeken/?q=abortus&page=5
https://nos.nl/zoeken/?q=abortus&page=6
https://nos.nl/zoeken/?q=abortus&page=7
https://nos.nl/zoeken/?q=abortus&page=8
https://nos.nl/zoeken/?q=abortus&page=9
https://nos.nl/zoeken/?q=abortus&page=10
https://nos.nl/zoeken/?q=abortus&page=11
https://nos.nl/zoeken/?q=abortus&page=12
https://nos.nl/zoeken/?q=abortus&page=13
https://nos.nl/zoeken/?q=abortus&page=14
https://nos.nl/zoeken/?q=abortus&page=15
https://nos.nl/zoeken/?q=abortus&page=16
https://nos.nl/zoeken/?q=abortus&page=17
https://nos.nl/zoeken/?q=abortus&page=18
https://nos.nl/zoeken/?q=abortus&page=19
https://nos.nl/zoeken/?q=abortus&page=20
https://nos.nl/zoeken/?q=abortus&page=21
https://nos.nl/zoeken/?q=abortus&page=22
https://nos.nl/zoeken/?q=abortus&page=23
https://nos.nl/zoeken/?q=abortus&page=24
https://nos.nl/zoeken/?q=