In [2]:
import os 
import sys 
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher
from datasets import load_dataset

# Load the "20231101.en" configuration of the "wikimedia/wikipedia" dataset.
ds = load_dataset("wikimedia/wikipedia", "20231101.en")



In [3]:
# Display the loaded dataset object (splits, feature names and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6407814
    })
})

In [4]:
# Slice the rows first and then select the column: only three articles are
# read, instead of pulling the entire 'text' column just to keep three entries.
ds['train'][:3]['text']

['Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist moveme

In [6]:
from tqdm import tqdm

# Print the titles of the first eleven articles (positions 0..10), then stop.
for position, article in tqdm(enumerate(iter(ds['train']))):
    if position <= 10:
        print(article['title'])
    else:
        break

11it [00:00, 4258.57it/s]

Anarchism
Albedo
A
Alabama
Achilles
Abraham Lincoln
Aristotle
An American in Paris
Academy Award for Best Production Design
Academy Awards
Actrius





In [10]:

'''
Data I want to extract from the dump:
- document dataset: d_id, title (label), text (first paragraph, up to the first \n\n)
- disambiguation dataset
'''

import pandas as pd
from tqdm import tqdm
import re
import os

def clean_label(label):
    """Remove every parenthesised qualifier from ``label``.

    Each ``( ... )`` group is deleted and leading/trailing whitespace is
    trimmed afterwards, e.g. ``"Mercury (planet)"`` becomes ``"Mercury"``.
    Whitespace in the middle of the label is left as it is.
    """
    parenthetical = re.compile(r'\([^)]*\)')
    without_qualifiers = parenthetical.sub('', label)
    return without_qualifiers.strip()

def clean_text(text):
    """Return the first paragraph of ``text``.

    A paragraph ends at the first blank line (two consecutive newlines).
    Text without a blank line is returned unchanged.
    """
    first_paragraph, _separator, _rest = text.partition('\n\n')
    return first_paragraph

def is_disambiguation(label):
    """Tell whether ``label`` carries the literal "(disambiguation)" marker.

    The match is case-sensitive and the marker may appear anywhere in the label.
    """
    marker = "(disambiguation)"
    return marker in label

def format_doc_id(index):
    """Build a document id of the form ``d_NNNNNNN`` from an integer index.

    The number is zero-padded to at least seven digits; larger numbers are
    written out in full.
    """
    return "d_{:07d}".format(index)

# Column layout shared by both frames. Passing it to the DataFrame constructor
# guarantees the 'd_id' column exists even when a row list is empty, so
# set_index('d_id') cannot fail with a KeyError on an empty split.
DOC_COLUMNS = ['d_id', 'label', 'text']

# Rows are collected in plain lists and turned into DataFrames once at the end,
# which is far cheaper than growing a DataFrame row by row.
doc_rows = []
disambiguation_rows = []

train_split = ds['train']
# total= lets tqdm draw a real progress bar with an ETA instead of a bare counter.
for i, entry in tqdm(enumerate(train_split), total=len(train_split)):
    # Ids are 1-based and assigned over all pages before the split, so a page
    # keeps the same d_id whichever frame it ends up in.
    doc_id = format_doc_id(i + 1)
    if is_disambiguation(entry['title']):
        # Disambiguation pages are stored with their full text.
        disambiguation_rows.append({'d_id': doc_id, 'label': entry['title'], 'text': entry['text']})
    else:
        # Regular pages are stored with the output of clean_text only.
        cleaned_text = clean_text(entry['text'])
        doc_rows.append({'d_id': doc_id, 'label': entry['title'], 'text': cleaned_text})

doc_df = pd.DataFrame(doc_rows, columns=DOC_COLUMNS).set_index('d_id')
disambiguation_df = pd.DataFrame(disambiguation_rows, columns=DOC_COLUMNS).set_index('d_id')

# Print some stats to verify
print(f"Regular documents: {len(doc_df)}")
print(f"Disambiguation documents: {len(disambiguation_df)}")

# Persist both frames as gzip-compressed pickles below ../../data/wikipedia,
# resolved relative to the current working directory.
pwd = os.getcwd()
datapath = os.path.join(pwd, '../../data/wikipedia')
os.makedirs(datapath, exist_ok=True)
doc_df.to_pickle(path=os.path.join(datapath, 'wikipedia-text-data-no-disambiguation.pkl.gzip'), compression='gzip')
disambiguation_df.to_pickle(path=os.path.join(datapath, 'wikipedia-disambiguation-data.pkl.gzip'), compression='gzip')

6407814it [09:14, 11553.49it/s]


Regular documents: 6352465
Disambiguation documents: 55349


In [11]:
# Write the regular-document frame to disk in fixed-size chunks.
CHUNK_SIZE = 500_000  # rows per split file

wd = os.getcwd()
datapath = os.path.join(wd, '../../data/wikipedia/split-data-no-disambiguation')
os.makedirs(datapath, exist_ok=True)  # loop-invariant, so create the folder once

# Chunk starts are derived from len(doc_df) rather than a hard-coded row count.
# The slice end is exclusive, so start + CHUNK_SIZE (without "- 1") keeps the
# last row of every chunk; subtracting one silently dropped it.
for i, start in enumerate(range(0, len(doc_df), CHUNK_SIZE)):
    doc_df.iloc[start:start + CHUNK_SIZE].to_pickle(
        path=os.path.join(datapath, f'wikipedia-text-data-no-disambiguation_{i}.pkl.gzip'),
        compression='gzip',
    )

# Slice the rows first and then select the column: only three articles are
# read, instead of pulling the entire 'text' column just to keep three entries.
ds['train'][:3]['text']

# Add a test dataset with just the articles containing
+ Jaguar
+ Hammer
+ Coke
+ Pope
+ Comic
+ Drink
+ Cat
+ Dwarf
+ Game
+ Texas

# Add Query Dataset for all disambiguations

In [12]:
# Show the first twenty disambiguation pages.
disambiguation_df.head(20)

Unnamed: 0_level_0,label,text
d_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d_0000026,Austin (disambiguation),Austin is the capital of Texas in the United S...
d_0000041,Aberdeen (disambiguation),Aberdeen is a city in Scotland.\n\nAberdeen ma...
d_0000055,Argument (disambiguation),"In logic and philosophy, an argument is an att..."
d_0000068,Animal (disambiguation),"An animal is a multicellular, eukaryotic organ..."
d_0000076,Asia Minor (disambiguation),Asia Minor is an alternative name for Anatolia...
d_0000108,Atlas (disambiguation),An atlas is a collection of maps.\n\nAtlas may...
d_0000275,Atlantic (disambiguation),The Atlantic Ocean is the second largest of th...
d_0000294,Athene (disambiguation),Athene or Athena is the shrewd companion of he...
d_0000396,ASIC (disambiguation),"In the realm of electronic technology, ASIC st..."
d_0000447,Lory (disambiguation),A Lory is a small to medium-sized arboreal par...
