In [1]:
import glob
import json
from tqdm import tqdm
import os
import json
import pandas as pd
import re

from improved_topics.topic_detection import (
    detect_topics_by_frequency, 
    detect_topics_tfidf, 
    detect_topics_with_entities, 
    calculate_topic_vectors,
    setup_nltk_for_ner
)



# Download and extract data

Download and extract a dump from Wikipedia
```
wget -O simplewiki-20250301-pages-articles-multistream.xml.bz2 https://dumps.wikimedia.org/simplewiki/20250301/simplewiki-20250301-pages-articles-multistream.xml.bz2
```

```
bzip2 -d simplewiki-20250301-pages-articles-multistream.xml.bz2 
```

Use WikiExtractor to convert XML into a number of JSON files
```
python -m wikiextractor.WikiExtractor simplewiki-20250301-pages-articles-multistream.xml --json --output extracted
```

In [2]:
files = []
[files.extend(glob.glob(i+'/*')) for i in glob.glob('../extracted/*')]

[None, None, None]

In [3]:
files

['../extracted/AA/wiki_28',
 '../extracted/AA/wiki_70',
 '../extracted/AA/wiki_72',
 '../extracted/AA/wiki_95',
 '../extracted/AA/wiki_09',
 '../extracted/AA/wiki_81',
 '../extracted/AA/wiki_99',
 '../extracted/AA/wiki_17',
 '../extracted/AA/wiki_03',
 '../extracted/AA/wiki_20',
 '../extracted/AA/wiki_00',
 '../extracted/AA/wiki_05',
 '../extracted/AA/wiki_47',
 '../extracted/AA/wiki_21',
 '../extracted/AA/wiki_42',
 '../extracted/AA/wiki_11',
 '../extracted/AA/wiki_76',
 '../extracted/AA/wiki_57',
 '../extracted/AA/wiki_51',
 '../extracted/AA/wiki_02',
 '../extracted/AA/wiki_88',
 '../extracted/AA/wiki_53',
 '../extracted/AA/wiki_46',
 '../extracted/AA/wiki_39',
 '../extracted/AA/wiki_49',
 '../extracted/AA/wiki_12',
 '../extracted/AA/wiki_61',
 '../extracted/AA/wiki_59',
 '../extracted/AA/wiki_29',
 '../extracted/AA/wiki_16',
 '../extracted/AA/wiki_14',
 '../extracted/AA/wiki_54',
 '../extracted/AA/wiki_67',
 '../extracted/AA/wiki_50',
 '../extracted/AA/wiki_66',
 '../extracted/AA/wi

In [4]:
data_rows = []

for file_path in tqdm(files):
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
                title = record.get("title", "")
                text = record.get("text", "")
                tfidf_topics = detect_topics_by_frequency(text)
                
                topic_label = ", ".join(tfidf_topics)
                
                data_rows.append({
                    "filename": file_path,
                    "title": title,
                    "text": text,
                    "number_of_characters": len(text),
                    "number_of_words": len(text.split()),
                    "topic": topic_label,
                    "text_quality": 0,
                    
                })
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {file_path}: {e}")

df = pd.DataFrame(data_rows)
print(df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 233/233 [16:17<00:00,  4.20s/it]


                       filename                    title  \
0       ../extracted/AA/wiki_28              Card Sharks   
1       ../extracted/AA/wiki_28              Family Feud   
2       ../extracted/AA/wiki_28  Bank of America Stadium   
3       ../extracted/AA/wiki_28             Aston Martin   
4       ../extracted/AA/wiki_28       Sydney Opera House   
...                         ...                      ...   
367504  ../extracted/AC/wiki_13       Dirk VI of Holland   
367505  ../extracted/AC/wiki_13    Floris III of Holland   
367506  ../extracted/AC/wiki_13    Newcastle Breakers FC   
367507  ../extracted/AC/wiki_13      Newcastle KB United   
367508  ../extracted/AC/wiki_13     Adamstown Rosebud FC   

                                                     text  \
0       Card Sharks is a game show that has aired in d...   
1       Family Feud is an American television game sho...   
2       The Bank of America Stadium is a sports stadiu...   
3       Aston Martin Lagonda Limite

In [5]:
df.to_parquet('simple_english.parquet')

In [6]:
df

Unnamed: 0,filename,title,text,number_of_characters,number_of_words,topic,text_quality
0,../extracted/AA/wiki_28,Card Sharks,Card Sharks is a game show that has aired in d...,2818,514,"Sports, Calendar",0
1,../extracted/AA/wiki_28,Family Feud,Family Feud is an American television game sho...,3347,611,"Sports, Calendar",0
2,../extracted/AA/wiki_28,Bank of America Stadium,The Bank of America Stadium is a sports stadiu...,226,40,Sports,0
3,../extracted/AA/wiki_28,Aston Martin,Aston Martin Lagonda Limited is a luxury car c...,1064,192,"History, Alphabet, Technology",0
4,../extracted/AA/wiki_28,Sydney Opera House,The Adil's Sydney Opera House is an opera hous...,1143,198,"Music, Calendar, History",0
...,...,...,...,...,...,...,...
367504,../extracted/AC/wiki_13,Dirk VI of Holland,Dirk VI (c. 1114 – 5 August 1157) was Count of...,250,48,Unknown,0
367505,../extracted/AC/wiki_13,Floris III of Holland,"Floris III (1141 – August 1, 1190) was the cou...",147,30,Unknown,0
367506,../extracted/AC/wiki_13,Newcastle Breakers FC,Newcastle Breakers Football Club was an Austra...,129,20,Sports,0
367507,../extracted/AC/wiki_13,Newcastle KB United,Newcastle KB United was an Australian soccer c...,116,19,Sports,0


In [13]:
df['number_of_words'].sum()

32077573

In [66]:
from transformers import PreTrainedTokenizerFast

enc = PreTrainedTokenizerFast.from_pretrained('./dev/data/simple_english.parquet/custom_tokenizer')

In [71]:
with open('./dev/data/simple_english.parquet/simple_english.parquet_custom_train_000041.bin', "rb") as f:
    # first read the header, which is 256 int32 integers (4 bytes each)
    header = np.frombuffer(f.read(256*4), dtype=np.int32)
    ntok = header[2] # number of tokens (claimed)
    # the rest of it are tokens, stored as uint16
    tokens = np.frombuffer(f.read(), dtype=np.uint32)

In [89]:
decoded_text = enc.decode(tokens[100:200]).replace(" ", "").replace("Ġ", " ").replace(" ##", "").replace('Ċ','\n')

In [90]:
print(decoded_text)

 originated from English speaking countries. It finds seldom use in other metricated Commonwealth Nations.</s></s></s> Shohreh Solati (, born Fatemeh Solati on January 4, 1959, in Tehran) is an Iranian singer. She is among the most active and prolific Iranian female singers. Since the Iranian Revolution, she has continued her music career outside the country.
She is divorced with a daughter and lives in Los Angeles, California.</s></s>
