In [1]:
import os

import bs4

def load_document_fields(html, parser='html.parser'):
    """Extract meta headers, article content and related links from one document.

    Parameters
    ----------
    html : str or file-like
        Markup to parse; handed unchanged to ``bs4.BeautifulSoup``.
    parser : str, optional
        Parser name passed to BeautifulSoup as ``features``. Naming it
        explicitly avoids the "no parser was explicitly specified" warning and
        keeps the parse tree independent of which optional parsers happen to be
        installed. Pass e.g. ``'lxml'`` to pick a different one.

    Returns
    -------
    headers : dict
        ``property`` -> ``content`` for each direct child tag of ``<head>``
        that carries a non-empty ``property`` attribute.
    article : list of (str, str)
        ``(tag name, tag text)`` for each direct child tag of ``<article>``
        other than ``<related>``.
    related : list of str
        Non-empty ``href`` values of the direct child tags of ``<related>``.

    Documents without ``<head>`` or ``<article>`` yield empty containers
    instead of raising ``AttributeError``.
    """
    soup = bs4.BeautifulSoup(html, features=parser)

    headers = {}
    root = soup.html
    head = root.head if root is not None else None
    if head is not None:
        for header in head:
            # Text nodes and comments have ``name is None``; only real tags
            # can carry attributes.
            if header.name is None:
                continue
            prop = header.get('property')
            if prop:
                headers[prop] = header.get('content')

    related = []
    article = []
    body = soup.body
    content = body.article if body is not None else None
    if content is not None:
        for tag in content:
            if tag.name is None:
                continue
            if tag.name == 'related':
                # <related> is a container of links, not article text.
                for link in tag.children:
                    if link.name is None:
                        continue
                    url = link.get('href')
                    if url:
                        related.append(url)
            else:
                article.append((tag.name, tag.text))

    return headers, article, related

# Parse one sample document. The context manager closes the file handle once
# parsing is done, and the explicit encoding removes the dependency on the
# platform default (assumes the dump is UTF-8 -- TODO confirm).
with open('20191107/01/1105816787692523871.html', encoding='utf-8') as sample_file:
    headers, article, related = load_document_fields(sample_file)
headers

{'og:url': 'https://tribune.com.pk/story/2094610/4-call-self-partnered-emma-watson-single/',
 'og:site_name': 'The Express Tribune',
 'article:published_time': '2019-11-07T01:31:43+00:00',
 'og:title': 'I call it self-partnered: Emma Watson on being single',
 'og:description': 'Actor opens up on being happil\xady on her own'}

In [2]:
# Entries of the working directory whose name contains "2019", sorted.
# Pure-Python replacement for `!ls . | grep 2019`: no shell required, hidden
# entries are skipped just as plain `ls` skips them.
dates = sorted(
    entry for entry in os.listdir('.')
    if '2019' in entry and not entry.startswith('.')
)
dates

['20191101',
 '20191102',
 '20191103',
 '20191104',
 '20191105',
 '20191106',
 '20191107',
 '20191108',
 '20191109',
 '20191110',
 '20191111',
 '20191112',
 '20191113',
 '20191114',
 '20191115',
 '20191116',
 '20191117']

In [3]:
%%time

import glob
import tqdm
import joblib

def load_dataset(pattern='2019????/??/*.html'):
    """Parse every file matching ``pattern`` into a list of record dicts.

    Parameters
    ----------
    pattern : str
        Glob pattern whose matches end in ``<date>/<hour>/<file_id>.html``.

    Returns
    -------
    list of dict
        One dict per matched file with the keys ``filename``, ``date``,
        ``hour``, ``file_id`` (file name up to its first dot), ``headers``,
        ``article`` and ``related`` (the last three come from
        ``load_document_fields``).

    Raises
    ------
    ValueError
        If a matched path has fewer than three components.
    """
    dataset = []
    for name in glob.glob(pattern):
        # Close each handle right after parsing instead of leaking one per
        # document; encoding is explicit so results do not depend on the
        # platform default (assumes UTF-8 files -- TODO confirm).
        with open(name, encoding='utf-8') as document:
            headers, article, related = load_document_fields(document)
        # Only the trailing <date>/<hour>/<file> components are used, so
        # patterns with a leading directory or OS-specific separators still
        # unpack correctly.
        date, hour, file_name = os.path.normpath(name).split(os.sep)[-3:]
        dataset.append({
            'filename': name,
            'date': date,
            'hour': hour,
            'file_id': file_name.split('.')[0],
            'headers': headers,
            'article': article,
            'related': related,
        })
    return dataset
        
# One glob pattern per date directory, each parsed by its own worker task
# (8 workers); the result holds one list of records per entry of `dates`.
date_patterns = ['{}/??/*.html'.format(date) for date in dates]
dataset_by_date = joblib.Parallel(n_jobs=8)(
    joblib.delayed(load_dataset)(date_pattern) for date_pattern in date_patterns
)

CPU times: user 11.4 s, sys: 2.69 s, total: 14.1 s
Wall time: 2min 51s


In [4]:
import pandas as pd

# Flatten the per-date record lists into one frame holding only the
# identifying fields of every article.
index_columns = ('file_id', 'date', 'hour')

dataset_indexes = []
for dataset in dataset_by_date:
    for article in dataset:
        dataset_indexes.append({column: article[column] for column in index_columns})

dataset = pd.DataFrame(dataset_indexes)

In [5]:
# Count the distinct dates each file_id appears under. Selecting the `date`
# column before aggregating avoids computing nunique for every other column.
dates_per_file = dataset.groupby('file_id')['date'].nunique()

# Keep only the ids that show up under more than one date.
uniqueness = dates_per_file[dates_per_file > 1]

In [6]:
# Show every row whose file_id is present in `uniqueness`, with rows of the
# same id placed next to each other.
is_repeated = dataset['file_id'].isin(uniqueness.index)
dataset.loc[is_repeated].sort_values('file_id')

Unnamed: 0,file_id,date,hour
420227,1055835840710796728,20191116,05
341402,1055835840710796728,20191114,19
328628,1211837495533936170,20191114,21
451914,1211837495533936170,20191117,04
342291,1211837496974472896,20191114,19
...,...,...,...
280059,9149753395620100820,20191113,20
385832,9215450053136366527,20191115,19
330474,9215450053136366527,20191114,15
342500,928191164687517445,20191114,02


In [7]:
# Display the current `dataset` frame (file_id / date / hour rows).
dataset

Unnamed: 0,file_id,date,hour
0,6500716908851810728,20191101,07
1,3780707621951097142,20191101,07
2,2385754716373828899,20191101,07
3,5414891177363946578,20191101,07
4,832941841345060939,20191101,07
...,...,...,...
467478,5480941220001677487,20191117,13
467479,5110653854200034071,20191117,13
467480,7207864704140334579,20191117,13
467481,5575934301845427666,20191117,13


In [8]:
import pandas as pd

# Gather the header dict of every article across all dates; pandas then turns
# the dict keys into columns.
dataset_selected = []
for dataset in dataset_by_date:
    for article in dataset:
        article_headers = article['headers']
        dataset_selected.append(article_headers)

dataset = pd.DataFrame(dataset_selected)

In [9]:
# Display the frame of per-article header fields built in the previous cell.
dataset

Unnamed: 0,og:url,og:site_name,article:published_time,og:title,og:description
0,http://uaprom.info/news/171894-nbu-prognozirue...,UAprom.info,2019-11-01T07:14:06+00:00,НБУ прогнозирует сокращения транзита газа,Объем транзита природного газа через Украину м...
1,https://fedpress.ru/news/46/ecology/2356006,ФедералПресс,2019-11-01T07:14:00+00:00,«Мираторг» хочет построить свинокомплекс на ме...,"Местные жители возмущены тем, что на месте бое..."
2,https://www.sigmalive.com/news/local/595172/vr...,SIGMALIVE,2019-11-01T07:10:00+00:00,Βροχές και καταιγίδες σε περιοχές της Κύπρου- ...,Χαμηλή πίεση αρχίζει να επηρεάζει την περιοχή....
3,http://dndz.tv/article-30015.html,dndz.tv,2019-11-01T07:18:00+00:00,Волейбольная команда «ДМК» г. Каменское стала ...,В Каменском (Днепродзержинске) завершен волейб...
4,http://regions.ru/news/2627555/,REGIONS.RU — новости Федерации,2019-11-01T07:03:00+00:00,Александр Шерин : Московские чиновники сами пр...,"Комментарий к статье Кто ближе к деньгам, тот ..."
...,...,...,...,...,...
467478,https://www.waz.de/staedte/essen/handball-spie...,Westdeutsche Allgemeine Zeitung,2019-11-17T13:12:00+00:00,Essen: Handball-Spieler bricht beim Match zusa...,Dramatischer Zwischenfall beim Handball-Heimsp...
467479,https://www.sportingnews.com/us/nfl/news/nfl-p...,Sporting News,2019-11-17T13:14:00+00:00,"NFL picks, predictions for Week 11: Texans get...",In our picks and predictions for Week 11 of th...
467480,https://nationalinterest.org/blog/buzz/75-year...,The National Interest,2019-11-17T13:00:00+00:00,75 Years Ago the USS Grayback Was Lost in the ...,Here is the story of what happened.
467481,https://www.businessinsider.com.au/erling-brau...,Business Insider Australia,2019-11-17T13:08:30+00:00,A 19-year-old scores so many goals he's starti...,They say actions speak louder than words. At l...
