In [1]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import json
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from bs4.element import Tag

In [2]:
def remove_hidden(l):
    """Return the entries of ``l`` that are not hidden (dot-prefixed) names.

    Parameters
    ----------
    l : iterable of str
        File or directory names.

    Returns
    -------
    list of str
        The names from ``l``, in their original order, that do not start
        with ".". Empty strings are kept: they have no leading dot.
    """
    # startswith() is safe on empty strings, whereas indexing el[0] raises IndexError.
    return [el for el in l if not el.startswith(".")]

def get_relative_path_to_dirs(start_path):
    """List the non-hidden directories located directly inside ``start_path``.

    Parameters
    ----------
    start_path : str
        Directory to inspect.

    Returns
    -------
    list of str
        One ``start_path + "/" + name`` entry per immediate sub-directory whose
        name does not start with ".". The list is empty when ``start_path``
        has no sub-directories or cannot be listed (``os.walk`` yields nothing
        in that case).
    """
    # Only the first tuple produced by os.walk describes start_path itself.
    # Taking it with next() avoids traversing the whole tree just to discard
    # everything but the first entry.
    _, subdirs, _ = next(os.walk(start_path), (start_path, [], []))
    subdirs = remove_hidden(subdirs)
    # Plain "/" concatenation is kept so the returned strings stay identical
    # to what the rest of the notebook already displays and consumes.
    return [start_path + "/" + subdir for subdir in subdirs]

def get_relative_path_to_files(start_path):
    """List the non-hidden regular files located directly inside ``start_path``.

    Directory entries are skipped, names starting with "." are filtered out
    through ``remove_hidden``, and every remaining name is returned as
    ``start_path + "/" + name``.
    """
    visible_names = remove_hidden(
        [name for name in listdir(start_path) if isfile(join(start_path, name))]
    )
    return [start_path + "/" + name for name in visible_names]

In [3]:
# Root folder of the article dump, given relative to the current working directory.
home_articles_directory = "../articles"

# Visible first-level sub-directories of the root
# (judging by the output below, presumably one per source site).
subdirs = get_relative_path_to_dirs(home_articles_directory)
subdirs

['../articles/medium',
 '../articles/splinters',
 '../articles/tutorialspoint',
 '../articles/wikihow',
 '../articles/kdnuggets',
 '../articles/smartdatacollective']

## Read dataset

In [5]:
# Every parsed JSON document found under the article folders, in traversal order.
dataset = []

# The loops look exactly two directory levels below each entry of `subdirs`:
#   <subdir>/<subsubdir>/<file>
for subdir in subdirs:
    subsubdirs = get_relative_path_to_dirs(subdir)
    for subsubdir in subsubdirs:
        onlyfiles = get_relative_path_to_files(subsubdir)
        for file in onlyfiles:
            # Decode explicitly as UTF-8 (the JSON interchange encoding) so the
            # result does not depend on the platform's default locale encoding.
            with open(file, 'r', encoding='utf-8') as infile:
                d = json.load(infile)
                dataset.append(d)

In [6]:
# Number of JSON documents that were loaded into `dataset`.
len(dataset)

838

## Extract content headers

In [37]:
# Heading tags whose text is collected for each record.
hs = ["h1", "h2", "h3"]

for d in dataset:
    headers = []
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted headers
    # could differ between environments. "html.parser" ships with Python.
    soup = BeautifulSoup(d["content_html"], "html.parser")
    # Headers are grouped by tag (all h1, then all h2, then all h3),
    # not listed in document order.
    for h in hs:
        headers += [el.text for el in soup.select(h)]
    # Stored on the record itself; re-running the cell overwrites the value.
    d["headers"] = headers

In [40]:
# Mean number of extracted headers over all loaded records.
header_counts = [len(record["headers"]) for record in dataset]
print("Average number of headers per article: {0}".format(sum(header_counts) / len(dataset)))

Average number of headers per article: 5.140811455847255


In [77]:
import re

def clean_html(raw_html):
    """Strip HTML comments and tags from ``raw_html``, keeping the text between them.

    Parameters
    ----------
    raw_html : str
        Markup to clean.

    Returns
    -------
    str
        ``raw_html`` with every ``<!-- ... -->`` comment (possibly spanning
        several lines) and every single-line ``<...>`` tag removed. Whitespace
        and HTML entities are left untouched.
    """
    # Comments must go first: a comment such as "<!-- a > b -->" contains ">",
    # so the tag pattern would otherwise cut it in half and leave " b -->"
    # behind, which the comment pattern can then no longer recognise.
    cleantext = re.sub("(<!--.*?-->)", "", raw_html, flags=re.DOTALL)
    cleantext = re.sub('<.*?>', '', cleantext)
    return cleantext

def remove_newlines(content):
    """Return ``content`` with every line-feed character turned into one space."""
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    return " ".join(content.split("\n"))

def remove_white_spaces(content):
    """Collapse runs of spaces into one space and trim surrounding whitespace.

    Only the space character is collapsed; tabs and other whitespace inside
    the text are left as they are, while the final ``strip()`` removes any
    kind of leading or trailing whitespace.
    """
    single_spaced = re.compile(' +').sub(' ', content)
    return single_spaced.strip()

def remove_urls(content):
    """Remove ``http://`` and ``https://`` URLs from ``content``.

    Parameters
    ----------
    content : str
        Text that may contain URLs.

    Returns
    -------
    str
        ``content`` with each URL deleted. A URL is taken to run from its
        scheme up to the next whitespace character, so punctuation glued to
        the end of a URL is removed with it. Text surrounding the URL,
        including the whitespace on either side, is preserved.
    """
    # Stop at the first whitespace. A greedy ".*" would swallow the rest of the
    # line, which is the rest of the whole text once newlines have been
    # flattened into spaces. "https?" already covers plain http, so no second
    # pattern is needed.
    return re.sub(r'https?://\S+', '', content)

def remove_code(content):
    """Delete call-like snippets such as ``a.b.c(d)`` from ``content``.

    A snippet is a dotted name immediately followed by a parenthesised part
    that ends at the first closing parenthesis, so for nested calls only the
    portion up to the first ")" is removed.
    """
    call_pattern = re.compile(r'(\w+(\.\w+)*\([^\)]*\))', flags=re.MULTILINE)  # matches a.b.c(d)
    return call_pattern.sub('', content)

def clean_content(content):
    """Run the full text-cleaning pipeline on ``content``.

    Steps, in order: strip HTML markup, remove URLs, remove call-like code
    snippets, turn newlines into spaces, then collapse repeated spaces and
    trim the ends.

    Parameters
    ----------
    content : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned, single-line text.
    """
    content = clean_html(content)
    # Remove URLs and code while the original line structure is still intact,
    # so a line-oriented URL pattern cannot run past the end of its own line.
    content = remove_urls(content)
    content = remove_code(content)
    content = remove_newlines(content)
    # Normalise whitespace last so the gaps left by the removals above are
    # collapsed as well.
    content = remove_white_spaces(content)
    return content

In [78]:
# Spot-check the cleaning pipeline on the "content" field of a single record.
# NOTE(review): 643 is a hard-coded position in `dataset`, whose order follows the
# directory listing order — confirm it still points at the intended record.
print(clean_content(dataset[643]["content"]))

