In [10]:
import pandas as pd
import numpy as np
import os
import glob
import time

In [23]:
# Notebook display settings: never truncate the text of a cell (max_colwidth=None)
# and render up to 500 rows before pandas switches to a truncated view.
pd.set_option('display.max_colwidth', None, 'display.max_rows', 500)

In [3]:
# Folder that holds the dataset part files this notebook reads and writes.
# The CLUEWEB22_PT_DATASET_FOLDER environment variable overrides the default, so the
# notebook can be run on a machine where the data lives somewhere else.
DATASET_FOLDER = os.environ.get(
    "CLUEWEB22_PT_DATASET_FOLDER",
    "/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed",
)

In [9]:
def massiveweb_filter(text,
                      min_words=50,
                      max_words=100_000,
                      min_word_chars=3,
                      max_word_chars=10,
                      max_symbol_to_word_ratio=0.1,
                      max_lines_end_ellipsis_pct=0.3,
                      min_words_alphabetic_char_pct=0.9,
                      lang="pt",
                     ):

    """Apply MassiveWeb-style quality rules to a single document.

    The rules follow Appendix A.1.1 of Gopher's paper
    (https://arxiv.org/pdf/2112.11446.pdf). They are evaluated in order and the
    first rule that fails determines the return value.

    Args:
        text: Document text. A value that is not a ``str`` (for instance a missing
            value coming from a DataFrame) is treated as an empty document.
        min_words: Minimum number of whitespace-separated words.
        max_words: Maximum number of whitespace-separated words.
        min_word_chars: Minimum mean word length, in characters.
        max_word_chars: Maximum mean word length, in characters.
        max_symbol_to_word_ratio: Maximum ratio between standalone ``#`` / ``...``
            tokens and the remaining words.
        max_lines_end_ellipsis_pct: Maximum fraction of lines ending with ``...``.
        min_words_alphabetic_char_pct: Minimum fraction of words that contain at
            least one alphabetic character.
        lang: ``'pt'`` selects the Portuguese stop-word list; any other value
            selects the English one.

    Returns:
        0 when every rule passes, otherwise the number of the first failed rule:
        1 word count, 2 mean word length, 3 symbol-to-word ratio,
        4 lines ending with an ellipsis, 5 words with alphabetic characters,
        6 fewer than two distinct stop words.
    """
    # TODO: use NLTK to get words?

    # Non-text input has no words; treating it as empty avoids an AttributeError
    # on .split() and lets it fail the word-count rule like any empty document.
    if not isinstance(text, str):
        text = ""

    # str.split() with no arguments already collapses runs of spaces, line breaks
    # and tabs, so it yields the actual "words" directly.
    words = text.split()
    num_words = len(words)

    # Rule 1: remove any document that has less than min_words or more than max_words.
    if num_words < min_words or num_words > max_words:
        return 1

    # Only reachable when min_words <= 0: a document without words cannot satisfy
    # the ratio-based rules and contains no stop words, so it fails rule 6.
    if num_words == 0:
        return 6

    # Rule 2: remove any document whose mean word length in chars is less than
    # min_word_chars or more than max_word_chars.
    mean_word_chars = sum(map(len, words)) / num_words
    if mean_word_chars < min_word_chars or mean_word_chars > max_word_chars:
        return 2

    # Rule 3: remove any document with a symbol-to-word ratio greater than
    # max_symbol_to_word_ratio for either the hash symbol or the ellipsis.
    # Only standalone '#' / '...' tokens are counted as symbols.
    num_symbols = sum(1 for word in words if word in ('#', '...'))
    num_regular_words = num_words - num_symbols
    # A document made only of symbols has an unbounded ratio; reject it explicitly
    # instead of dividing by zero.
    if num_regular_words == 0 or num_symbols / num_regular_words > max_symbol_to_word_ratio:
        return 3

    # Remove any document with more than 90% of lines starting with a bullet point.
    # TODO: How to get bullet points if we are already using clean text?

    # Rule 4: remove any document with more than max_lines_end_ellipsis_pct of its
    # lines ending with an ellipsis. str.split('\n') always returns at least one line.
    lines = text.split('\n')
    num_ellipsis_lines = sum(1 for line in lines if line.endswith('...'))
    if num_ellipsis_lines / len(lines) > max_lines_end_ellipsis_pct:
        return 4

    # Rule 5: remove any document where fewer than min_words_alphabetic_char_pct of
    # the words have at least one alphabetic character.
    # NOTE: the default threshold here is 0.9; the original comment on this rule
    # quoted 80%, so the default is stricter than that figure.
    # TODO: use regex to make it faster? E.g.: re.search('[a-zA-Z]', word)
    num_alphabetic_words = sum(1 for word in words if any(char.isalpha() for char in word))
    if num_alphabetic_words / num_words < min_words_alphabetic_char_pct:
        return 5

    # Rule 6: remove any document that does not contain at least two distinct stop
    # words. Matching is exact and case-sensitive on whole tokens.
    if lang == 'pt':
        stop_words = {'a', 'com', 'e', 'é', 'de', 'o', 'para', 'que', 'tem'}
    else:
        stop_words = {'the', 'be', 'to', 'of', 'and', 'that', 'have', 'with'}

    if len(stop_words.intersection(words)) < 2:
        return 6

    return 0

### Apply MassiveWeb filter over all cleaned passages

In [6]:
# Cleaned passage parts to run the filter on, as a sorted NumPy array of file paths.
cleaned_parts_pattern = os.path.join(DATASET_FOLDER, "clueweb22-pt_colbertx_0*_cleaned.tsv")
dataset_parts = np.sort(glob.glob(cleaned_parts_pattern))

In [7]:
# Display the sorted part paths matched by the glob.
dataset_parts

array(['/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_00_cleaned.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_01_cleaned.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_02_cleaned.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_03_cleaned.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_04_cleaned.tsv'],
      dtype='<U125')

In [None]:
# Per-part summary records (file name, totals and filter-code counts).
results = []

for part in dataset_parts:
    part_name = os.path.basename(part)

    print("Handling {}...".format(part_name))

    start_time = time.time()

    # names= is supplied, so the first line of each file is read as data
    # (the cleaned parts are expected to have no header row).
    part_df = pd.read_csv(part, sep='\t', names=['passage_id', 'passage', 'n_tokens'])

    print(">> part_df.shape={}".format(part_df.shape))

    # Filter code per passage: 0 means every rule passed, 1..6 is the first failed rule.
    part_df['massiveweb_filter'] = part_df['passage'].apply(massiveweb_filter)

    filtered_out_count = np.sum(part_df['massiveweb_filter'] > 0)

    # Report a real percentage (the fraction times 100) and guard against an empty part.
    total_count = part_df.shape[0]
    filtered_out_pct = 100 * filtered_out_count / total_count if total_count else 0.0

    print(">> number of passages filtered out: {} ({:.4}%)".format(filtered_out_count, filtered_out_pct))

    # (distinct filter codes, number of passages per code)
    filter_results = np.unique(part_df['massiveweb_filter'].to_numpy(), return_counts=True)

    print(filter_results)

    print("\n>> elapsed time: {}\n".format(time.time() - start_time))

    # Same base name as the input with a "_massiveweb" suffix, written next to it.
    output_file = os.path.join(DATASET_FOLDER, "{}_massiveweb.tsv".format(os.path.splitext(part_name)[0]))

    print(">> Saving result as {}\n\n".format(output_file))

    # Unlike the input, the output is written with a header row.
    part_df.to_csv(output_file, sep='\t', index=False)

    results.append({"file": part_name,
                    "total_passages": total_count,
                    "removed_passages": filtered_out_count,
                    "filter_results": filter_results})

### Sample 1M passages from the passages kept by the filter (codes 0 and 4), across all parts

In [52]:
def select_samples(which_file, selected_passages, starting_position=0, dataset_position=0, output_folder="", add_headers=False, filter_field=None, filter_values_list=None):
    """Extract from one dataset part the rows chosen by a global sample and save them.

    The sample is expressed as global row positions that run across all parts
    (after filtering). This function consumes the positions that fall inside the
    given part and reports where the next part must continue.

    Args:
        which_file: Path of the tab-separated part file to read.
        selected_passages: 1-D NumPy integer array with the sampled global positions.
            It is expected to be sorted in ascending order: everything before the
            first position beyond this part is taken.
        starting_position: Index into ``selected_passages`` of the first position not
            yet consumed by previous parts.
        dataset_position: Global position of this part's first (filtered) row.
        output_folder: Folder where the ``<part name>_sample.tsv`` file is written.
        add_headers: When True the file is read as header-less and the column names
            ``passage_id``, ``passage``, ``n_tokens``, ``massiveweb_filter`` are supplied;
            otherwise the first line of the file is used as header.
        filter_field: Optional column name; when given, only rows whose value in this
            column is in ``filter_values_list`` are kept before sampling.
        filter_values_list: Values of ``filter_field`` to keep.

    Returns:
        A tuple ``(selected_dataset_df, next_starting_position, next_dataset_position)``:
        the selected rows, the index into ``selected_passages`` where the next part
        starts, and the global position of the next part's first row.
    """
    if add_headers:
        dataset_df = pd.read_csv(which_file, sep="\t", names=['passage_id', 'passage', 'n_tokens', 'massiveweb_filter'])
    else:
        dataset_df = pd.read_csv(which_file, sep="\t")

    print("Dataset original shape: {}".format(dataset_df.shape))

    if filter_field is not None:
        dataset_df = dataset_df[np.isin(dataset_df[filter_field], filter_values_list)]

        print("Filtered dataset shape: {}".format(dataset_df.shape))

    # Global position of the first row that belongs to the NEXT part.
    next_dataset_position = dataset_position + dataset_df.shape[0]

    # Number of not-yet-consumed sampled positions that fall inside this part: the
    # offset of the first remaining position at or beyond the next part, or all the
    # remaining positions when none goes that far.
    remaining_passages = selected_passages[starting_position:]
    first_higher_list = np.where(remaining_passages >= next_dataset_position)[0]

    if first_higher_list.shape[0] > 0:
        first_higher = first_higher_list[0]
    else:
        first_higher = remaining_passages.shape[0]

    next_starting_position = starting_position + first_higher

    if first_higher > 0:
        print("Selecting {} passages in this part, from {}({}) until {}({})".format(first_higher,
                                                                                    starting_position, selected_passages[starting_position],
                                                                                    next_starting_position - 1, selected_passages[next_starting_position - 1]))
    else:
        # Nothing to select here (e.g. the sample is already exhausted). Indexing
        # selected_passages for the message would raise or wrap around.
        print("Selecting 0 passages in this part")

    # Convert global positions into row positions local to this (filtered) part.
    local_positions = selected_passages[starting_position:next_starting_position] - dataset_position

    print(local_positions)

    selected_dataset_df = dataset_df.iloc[local_positions]

    print("Selected dataset shape: {}".format(selected_dataset_df.shape))

    output_filename = '{}_sample.tsv'.format(os.path.splitext(os.path.basename(which_file))[0])

    print("Saving resulting DF as {}...\n".format(output_filename))

    # The sample file is written without header row and without the index column.
    selected_dataset_df.to_csv(os.path.join(output_folder, output_filename), sep="\t", header=False, index=False)

    return selected_dataset_df, next_starting_position, next_dataset_position

#### Temporary code to rebuild `results` (per-part filter-code counts) from the saved `*_massiveweb.tsv` files

In [56]:
# Rebuild the per-part filter-code counts from the files saved by the filtering step.
results = []

# Defined here as well, so this cell does not depend on a later cell having been run.
cleaned_collection_parts = np.sort(glob.glob(os.path.join(DATASET_FOLDER, "clueweb22-pt_colbertx_0*_cleaned_massiveweb.tsv")))

for cleaned_part in cleaned_collection_parts:
    # Only the filter codes are needed; not loading the passage text keeps the read cheap.
    filter_codes = pd.read_csv(cleaned_part, sep="\t", usecols=['massiveweb_filter'])['massiveweb_filter']

    # (distinct filter codes, number of passages per code)
    filter_results = np.unique(filter_codes.to_numpy(), return_counts=True)

    results.append({'filter_results': filter_results})

In [57]:
# Display the per-part (filter codes, counts) pairs collected in `results`.
results

[{'filter_results': (array([0, 1, 2, 3, 4, 5, 6]),
   array([1412134,   59395,    7178,     310,   10525,  346475,   16441]))},
 {'filter_results': (array([0, 1, 2, 3, 4, 5, 6]),
   array([1412696,   59228,    8378,     284,   10353,  344613,   16796]))},
 {'filter_results': (array([0, 1, 2, 3, 4, 5, 6]),
   array([1413085,   59256,    7226,     286,   10161,  344014,   16710]))},
 {'filter_results': (array([0, 1, 2, 3, 4, 5, 6]),
   array([1415593,   58972,    8594,     270,   10336,  343488,   16795]))},
 {'filter_results': (array([0, 1, 2, 3, 4, 5, 6]),
   array([1414146,   59856,    7489,     286,   10416,  344440,   16778]))}]

In [59]:
# Passages kept are those with filter code 0 or 4, the same codes passed as
# filter_values_list when sampling. Counts are selected by code value rather than by
# position in the np.unique output, so a part that lacks some code is still counted right.
total_passages_kept = np.sum([np.sum(part['filter_results'][1][np.isin(part['filter_results'][0], [0, 4])]) for part in results])

In [60]:
# Display the total number of passages kept across all parts.
total_passages_kept

7119445

In [64]:
# Kept passages (filter code 0 or 4) per part; counts are selected by code value, not by
# position in the np.unique output.
[np.sum(part['filter_results'][1][np.isin(part['filter_results'][0], [0, 4])]) for part in results]

[1422659, 1423049, 1423246, 1425929, 1424562]

In [61]:
# Draw 1M distinct global positions among the kept passages and sort them, so that each
# part can consume a contiguous slice. A fixed seed makes the sample reproducible, and
# passing the population size directly avoids building a multi-million element list.
# NOTE: the outputs recorded in this notebook came from an unseeded draw, so the exact
# positions printed there will differ from a seeded re-run.
SAMPLE_SEED = 42
selected_passages = np.sort(np.random.default_rng(SAMPLE_SEED).choice(total_passages_kept, size=1_000_000, replace=False))

In [62]:
# Sanity check: the sample should hold one million positions.
selected_passages.shape

(1000000,)

In [39]:
# Part files that carry the massiveweb_filter column (the "_massiveweb" outputs of the
# filtering step), as a sorted NumPy array of paths.
massiveweb_parts_pattern = os.path.join(DATASET_FOLDER, "clueweb22-pt_colbertx_0*_cleaned_massiveweb.tsv")
cleaned_collection_parts = np.sort(glob.glob(massiveweb_parts_pattern))

In [40]:
# Display the sorted paths of the filtered part files.
cleaned_collection_parts

array(['/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_00_cleaned_massiveweb.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_01_cleaned_massiveweb.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_02_cleaned_massiveweb.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_03_cleaned_massiveweb.tsv',
       '/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/clueweb22-pt_colbertx_04_cleaned_massiveweb.tsv'],
      dtype='<U136')

In [63]:
# Cursor into selected_passages (next sampled position not yet consumed).
starting_position = 0
# Global position of the first kept row of the part being processed.
dataset_position = 0

final_passages_count = 0

for cleaned_part in cleaned_collection_parts:
    final_results = select_samples(cleaned_part,
                                   selected_passages,
                                   starting_position,
                                   dataset_position,
                                   output_folder=DATASET_FOLDER,
                                   filter_field="massiveweb_filter",
                                   filter_values_list=[0, 4])

    # select_samples returns (selected rows, next cursor, next global position).
    part_sample_df, starting_position, dataset_position = final_results

    final_passages_count += part_sample_df.shape[0]

# Every sampled position must have been consumed by exactly one part, and the number of
# kept rows seen here must match the population the sample was drawn from.
assert final_passages_count == selected_passages.shape[0], \
    "sampled {} passages, expected {}".format(final_passages_count, selected_passages.shape[0])
assert dataset_position == total_passages_kept, \
    "kept rows seen ({}) differ from total_passages_kept ({})".format(dataset_position, total_passages_kept)

Dataset original shape: (1852458, 4)
Filtered dataset shape: (1422659, 4)
Selecting 199972 passages in this part, from 0(3) until 199971(1422655)
[      3       4       5 ... 1422652 1422653 1422655]
Selected dataset shape: (199972, 4)
Saving resulting DF as clueweb22-pt_colbertx_00_cleaned_massiveweb_sample.tsv...

Dataset original shape: (1852348, 4)
Filtered dataset shape: (1423049, 4)
Selecting 199800 passages in this part, from 199972(1422662) until 399771(2845707)
[      3       5       6 ... 1423043 1423045 1423048]
Selected dataset shape: (199800, 4)
Saving resulting DF as clueweb22-pt_colbertx_01_cleaned_massiveweb_sample.tsv...

Dataset original shape: (1850738, 4)
Filtered dataset shape: (1423246, 4)
Selecting 199400 passages in this part, from 399772(2845710) until 599171(4268953)
[      2       6       9 ... 1423239 1423242 1423245]
Selected dataset shape: (199400, 4)
Saving resulting DF as clueweb22-pt_colbertx_02_cleaned_massiveweb_sample.tsv...

Dataset original shape: 

In [65]:
# Total rows selected across all per-part sample files; expected to equal the sample size.
final_passages_count

1000000

In [1]:
# Passages removed by the filter: 9263003 is the sum of all filter-code counts over the
# five parts shown in `results`, and 7119445 is total_passages_kept (codes 0 and 4).
9263003 - 7119445

2143558

In [2]:
# Fraction of all passages kept after filtering (total_passages_kept / total passages).
7119445 / 9263003

0.7685893008995031