In [None]:
import os
# Move the working directory one level up so that the relative paths used by
# later cells ('tmp', 'data', 'gpt3-quality-filter') resolve from the parent
# directory (presumably the project root — confirm).
# NOTE: this is not idempotent; re-running the cell climbs one more level.
os.chdir('..')

### Init files 

In [None]:
import csv
import gzip
import io
import json

import numpy as np
import requests
from tqdm.auto import tqdm

In [None]:
# Scratch directory for the shard currently being processed.
TMP_DIR = 'tmp'
# Directory holding the extracted CSV and the final count tables.
DATA_PATH = 'data'

# URL template of the Dutch C4 shards; `{}` is the zero-padded shard index.
C4_URL = 'https://huggingface.co/datasets/allenai/c4/resolve/main/multilingual/c4-nl.tfrecord-{}-of-01024.json.gz'
# Number of shards (the "-of-01024" part of the file name).
N_FILES = 1024

# makedirs(exist_ok=True) is safe to re-run and also creates missing parents,
# unlike the check-then-mkdir pattern.
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(DATA_PATH, exist_ok=True)

In [None]:
csv_path = os.path.join(DATA_PATH, "dumps_url_extracted.csv")

fieldnames = ['url', 'timestamp', 'text_length', 'included']

# Keep a named handle so the file can be closed explicitly with
# `csv_file.close()` once all rows have been written.
# - buffering=1 (line buffering) flushes every completed row, so the file on
#   disk is complete when it is read back later in the same kernel session
#   even if the handle is still open.
# - encoding='utf-8' makes the output independent of the platform locale and
#   matches the default encoding of pandas.read_csv.
csv_file = open(csv_path, 'w', newline='', encoding='utf-8', buffering=1)
csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

csv_writer.writeheader()

### Get GPT quality filter (can be skipped)

In [None]:
USE_GPT_FILTER = True  # set to False to skip the quality-filter steps; every row is then written with included = 0

In [None]:
# Clone the quality-filter repository into ./gpt3-quality-filter; later cells
# add that directory to sys.path and load the model from its `model/` folder.
!git clone https://huggingface.co/spaces/ssgrn/gpt3-quality-filter

In [None]:
import sys

# Make the cloned repository importable; the `lr` package imported in the next
# cell is presumably provided by it (confirm).
sys.path.append("gpt3-quality-filter") 

In [None]:
import os
import json
import numpy as np

from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
                                             TfidfVectorizer)
from sklearn.linear_model import LogisticRegression

from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch

def load_model(serialization_dir):
    """Rebuild the serialized logistic-regression classifier and its vectorizer.

    Files read from ``serialization_dir``:
      * ``best_hyperparameters.json`` - hyperparameters; the ``stopwords``,
        ``weight`` and ``ngram_range`` entries configure the vectorizer, the
        remaining entries are passed to ``LogisticRegression``.
      * ``vocab.json`` - vocabulary, loaded unless ``weight == 'hash'``.
      * ``archive/idf.npy`` - IDF weights, loaded only if the file exists.
      * ``archive/coef.npy``, ``archive/intercept.npy``, ``archive/classes.npy``
        - fitted classifier attributes.

    Returns:
        tuple: ``(classifier, vect)``.
    """
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)

    stop_words = 'english' if hyperparameters.pop('stopwords') == 1 else None
    weight = hyperparameters.pop('weight')
    binary = weight == 'binary'

    # Stored as a whitespace-separated string such as "1 2". scikit-learn
    # documents ngram_range as a (min_n, max_n) tuple, so convert the sorted
    # values to a tuple instead of passing a list.
    ngram_range = tuple(sorted(int(x) for x in hyperparameters.pop('ngram_range').split()))

    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,
                                 lowercase=True,
                                 ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)

    # No vocabulary is loaded for the hashing vectorizer.
    if weight != "hash":
        with open(os.path.join(serialization_dir, "vocab.json"), 'r') as f:
            vect.vocabulary_ = json.load(f)

    # C and tol are cast to float before being passed on.
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)

    archive_dir = os.path.join(serialization_dir, "archive")
    idf_path = os.path.join(archive_dir, "idf.npy")
    if os.path.exists(idf_path):
        vect.idf_ = np.load(idf_path)

    # Restore the fitted state directly instead of refitting the classifier.
    classifier.coef_ = np.load(os.path.join(archive_dir, "coef.npy"))
    classifier.intercept_ = np.load(os.path.join(archive_dir, "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(archive_dir, "classes.npy"))
    return classifier, vect

def score(x, clf, vectorizer):
    """Return the classifier's class probabilities for a single document.

    ``x`` is wrapped in a one-element list, transformed with
    ``vectorizer.transform`` and passed to ``clf.predict_proba``; the result of
    that call is returned unchanged.
    """
    features = vectorizer.transform([x])
    probabilities = clf.predict_proba(features)
    return probabilities

# Load the classifier and vectorizer from the cloned repository's model directory.
clf, vectorizer = load_model("gpt3-quality-filter/model/")

In [None]:
# Sanity check: score one sample English passage with the loaded filter.
content = """
The British Museum is a public museum dedicated to human history, art and culture located in the Bloomsbury area of London. Its permanent collection of eight million works is among the largest and most comprehensive in existence.[3] It documents the story of human culture from its beginnings to the present.[a] The British Museum was the first public national museum to cover all fields of knowledge.[4]

In 2022 the museum received 4,097,253 visitors, an increase of 209 per cent from 2021. It ranked third in the list of most-visited art museums in the world.[5]

The museum was established in 1753, largely based on the collections of the Anglo-Irish physician and scientist Sir Hans Sloane.[6] It first opened to the public in 1759, in Montagu House, on the site of the current building. The museum's expansion over the following 250 years was largely a result of British colonisation and resulted in the creation of several branch institutions, or independent spin-offs, the first being the Natural History Museum in 1881. The right to ownership of some of its most well-known acquisitions, notably the Greek Elgin Marbles and the Egyptian Rosetta Stone, is subject to long-term disputes and repatriation claims.[7][8]

In 1973, the British Library Act 1972[9] detached the library department from the British Museum, but it continued to host the now separated British Library in the same Reading Room and building as the museum until 1997. The museum is a non-departmental public body sponsored by the Department for Digital, Culture, Media and Sport, and as with all national museums in the UK it charges no admission fee, except for loan exhibitions.[10]
"""

# `pred` is the raw predict_proba output for the single document; the download
# loop further down reads the quality probability from pred[0][1].
pred = score(content, clf, vectorizer)

print(pred)

### Download the dumps one by one, get the urls, classify them using GPT-3's quality filter (optional) and write them to a CSV (takes multiple hours) 

In [None]:
# Index of the first shard to process; raise it to resume an interrupted run.
start = 0

# A single scratch file is reused for every shard.
tmp_path = os.path.join(TMP_DIR, 'tmp.json.gz')

# Shards are numbered 00000 .. N_FILES - 1 ("-of-01024"), so the range must stop
# at N_FILES; N_FILES + 1 would request a shard that does not exist.
for num in tqdm(range(start, N_FILES)):
    url = C4_URL.format(str(num).zfill(5))

    try:
        # Stream the shard to disk. Fail loudly on an HTTP error instead of
        # writing the error page to the scratch file and crashing in gzip later.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)

        # Each line of the decompressed shard is one JSON document.
        with gzip.open(tmp_path, 'rt', encoding='utf-8') as text_file:
            for line in text_file:
                read_line = json.loads(line)
                text = read_line['text']

                if USE_GPT_FILTER:
                    pred = score(text, clf, vectorizer)
                    quality_prob = pred[0][1]
                    # Stochastic inclusion rule; params from Brown et al. (2020), p. 43.
                    # The draw is unseeded, so `included` differs between runs.
                    included = 1 if np.random.pareto(9) > 1 - quality_prob else 0
                else:
                    included = 0

                csv_writer.writerow({
                    'url': read_line['url'],
                    'timestamp': read_line['timestamp'],
                    # Whitespace-separated word count of the document.
                    'text_length': len(text.split()),
                    'included': included
                })
    finally:
        # Remove the scratch file exactly once per shard, also when an error
        # occurred; the extra os.remove after the loop always raised
        # FileNotFoundError because the file had already been deleted.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

### Read CSV in chunks and get netlocs

In [None]:
import pandas as pd
from urllib.parse import urlparse

# Word totals per network location: all documents, and only the documents
# flagged with included == 1.
urldict = {}
urldict_quality = {}

chunk_size = 10000
csv_reader = pd.read_csv(csv_path, chunksize=chunk_size)

# total is only the progress-bar length (a hard-coded number of chunks).
for chunk in tqdm(csv_reader, total = 9622):
    hosts = chunk['url'].apply(lambda raw_url: urlparse(raw_url).netloc)
    rows = zip(chunk['text_length'], chunk['included'], hosts)
    for n_words, is_quality, host in rows:
        urldict[host] = urldict.get(host, 0) + n_words

        if is_quality == 1:
            urldict_quality[host] = urldict_quality.get(host, 0) + n_words

In [None]:
# Output path and per-netloc word totals for the full corpus.
# sort_values/reset_index return new frames, so the result must be assigned;
# previously the sorted frame was discarded.
url_count_path = os.path.join(DATA_PATH, 'url_counts.csv')
url_counts = (
    pd.DataFrame(list(urldict.items()), columns=['url', 'tokens'])
    .sort_values('tokens', ascending=False)
    .reset_index(drop=True)
)

# Same table restricted to documents flagged with included == 1.
quality_url_count_path = os.path.join(DATA_PATH, 'quality_url_counts.csv')
quality_url_counts = (
    pd.DataFrame(list(urldict_quality.items()), columns=['url', 'tokens'])
    .sort_values('tokens', ascending=False)
    .reset_index(drop=True)
)

# Preview the largest quality domains.
quality_url_counts.head()

### Clean the data, add the labels

In [None]:
def clean_url(url):
    """
    Normalize a host name so that desktop and mobile variants collapse.

    The input is lower-cased and stripped of surrounding whitespace, then every
    occurrence of 'www.' is removed (not only a leading one). After that a
    leading 'm.' is cut off; if there is no leading 'm.', every '.m.' is
    replaced by a single '.'.
    """
    cleaned = "".join(url.lower().strip().split('www.'))
    if cleaned.startswith('m.'):
        return cleaned[2:]
    return cleaned.replace('.m.', '.')

# Load the url -> label table (the file has no header row).
website_labels_path = os.path.join(DATA_PATH, 'website_labels.csv')
website_labels = pd.read_csv(website_labels_path, header=None)
website_labels.columns = ['url', 'label']

# Use plain .apply: Series.progress_apply only exists after tqdm.pandas() has
# been called, which this notebook never does, so it raised AttributeError on
# a fresh kernel.
# NOTE(review): x[:-2] drops the last two characters of each raw url before
# cleaning — presumably a fixed trailing suffix in the label file; confirm.
website_labels['url'] = website_labels.url.apply(lambda x: clean_url(x[:-2]))

# Where a cleaned url occurs more than once, the last label wins.
website_labels = website_labels.drop_duplicates(keep='last')
url2label = dict(zip(website_labels.url, website_labels.label))

In [None]:
# Normalize the host names, then merge rows that collapse onto the same
# cleaned url by summing their word counts.
url_counts['url'] = url_counts['url'].map(clean_url)
quality_url_counts['url'] = quality_url_counts['url'].map(clean_url)

distinct_counts = (
    url_counts
    .groupby('url')
    .sum()
    .reset_index()
    .sort_values('tokens', ascending=False)
    .reset_index(drop=True)
)
distinct_quality_url_counts = (
    quality_url_counts
    .groupby('url')
    .sum()
    .reset_index()
    .sort_values('tokens', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Both frames are already sorted by descending word count, so the 1-based row
# position is the rank.
distinct_counts['Rank'] = list(range(1, len(distinct_counts) + 1))
distinct_quality_url_counts['Rank'] = list(range(1, len(distinct_quality_url_counts) + 1))

In [None]:
# Rank of every url in the full corpus.
url2rank = dict(zip(distinct_counts['url'], distinct_counts['Rank']))

# Diff = overall rank minus quality rank, so a positive value means the url
# sits closer to the top in the quality subset than in the full corpus.
diff = [
    url2rank[url] - quality_rank
    for url, quality_rank in zip(distinct_quality_url_counts['url'],
                                 distinct_quality_url_counts['Rank'])
]

distinct_quality_url_counts['Diff'] = diff

In [None]:
def custom_round(num):
    """Format a percentage value as a short string ending in '%'.

    The value is rounded to one decimal. If that rounds to zero, the number of
    decimals is increased until the rounded value is non-zero, so small shares
    stay visible (0.004 -> '0.004%'). Trailing zeros and a dangling decimal
    point are removed (5.0 -> '5%').

    An input of exactly zero returns '0%'; without this guard no number of
    decimals would ever produce a non-zero value and the loop would never end.
    """
    if num == 0:
        return '0%'

    rounded = round(num, 1)
    precision = 1
    if rounded == 0.0:
        precision = 2
        while rounded == 0.0:
            rounded = round(num, precision)
            precision += 1
    # After the loop `precision` is one more than the decimals that were used;
    # the surplus zero is stripped below.
    str_num = f"{rounded:.{precision}f}"
    if "." in str_num:
        str_num = str_num.rstrip('0').rstrip('.')
    return str_num + '%'


# Attach the label of each url; urls without a label (or with a missing one)
# get an empty string.
distinct_counts['Label'] = (
    distinct_counts['url']
    .map(lambda u: url2label.get(u, ''))
    .fillna('')
)
distinct_quality_url_counts['Label'] = (
    distinct_quality_url_counts['url']
    .map(lambda u: url2label.get(u, ''))
    .fillna('')
)

# 'Aandeel': each url's percentage of all words in its table, formatted as text.
share_all = distinct_counts['tokens'] / sum(distinct_counts['tokens']) * 100
distinct_counts['Aandeel'] = share_all.apply(custom_round)

share_quality = (distinct_quality_url_counts['tokens']
                 / sum(distinct_quality_url_counts['tokens']) * 100)
distinct_quality_url_counts['Aandeel'] = share_quality.apply(custom_round)

In [None]:
# Rename the columns to the (Dutch) output headers. The assignment is
# positional, so each list must match the order in which the columns were
# created above: url, tokens, Rank, [Diff,] Label, Aandeel.
distinct_counts.columns = ['Url', 'Aantal woorden', 'Rank', 'Label', 'Aandeel']
distinct_quality_url_counts.columns = ['Url', 'Aantal woorden', 'Rank', 'Verschil', 'Label','Aandeel']

In [None]:
# Persist both tables without the positional index column.
distinct_counts.to_csv(path_or_buf=url_count_path, index=False)
distinct_quality_url_counts.to_csv(path_or_buf=quality_url_count_path, index=False)