Below, we take care of downloading the files, extracting them, and reading one of the extracted files (the last part) into memory. Note that unless `overwrite` is set, the files are not re-downloaded or re-extracted.

In [3]:
import urllib.request
import os

# Remote location of the corpus, how many gzip parts it is split into, and
# the local directory that receives the files.
base_url = "http://knowitall.cs.washington.edu/oqa/data/wikianswers/"
number_of_archive_parts = 39
data_directory = os.path.expanduser('~/data/')
overwrite = False

if not os.path.exists(data_directory):
    os.makedirs(data_directory)

# Names of every remote file: the zero-padded archive parts plus the questions file
parts = [f"part-{part_num:05d}.gz" for part_num in range(number_of_archive_parts)]
questions = "questions.txt"
files = parts + [questions]

print(f"Requesting {len(files)} data files.")
if not overwrite:
    # Keep only the files that are not on disk yet
    files = [name for name in files if not os.path.exists(data_directory + name)]
    print(f"Downloading {len(files)} files.")

# Fetch WikiAnswer paraphrase corpus
urls = [base_url + name for name in files]
results = []
for url, filename in zip(urls, files):
    results.append(urllib.request.urlretrieve(url, data_directory + filename))
print("Done.")

Requesting 40 data files.
Downloading 0 files.
Done.


In [4]:
import gzip
import shutil

def extract_files(files_to_extract):
    """
    Decompress gzip archives stored in `data_directory`.

    files_to_extract -- Archive file names (e.g. "part-00000.gz"), relative to `data_directory`

    Each archive is decompressed to a file of the same name with its final
    extension removed. Unless the module-level `overwrite` flag is set,
    archives whose extracted file already exists are skipped.

    Returns the extracted file names for all requested archives, whether or
    not they were extracted by this call.
    """
    extracted_file_names = [os.path.splitext(file)[0] for file in files_to_extract]

    # Always iterate over the extension-less names so the ".gz" suffix is
    # appended exactly once below, whether or not we are overwriting.
    pending = extracted_file_names
    if not overwrite:
        # Check for previously extracted files and skip them
        pending = [
            name for name in extracted_file_names
            if not os.path.exists(data_directory + name)
        ]
        print(f"Extracting {len(pending)} files.")

    for name in pending:
        target = data_directory + name
        with gzip.open(target + ".gz", 'rb') as f_in, open(target, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    return extracted_file_names
# Extract every archive part and keep the extracted file names for later use.
extracted_files = extract_files(parts)

Extracting 0 files.


In [71]:
import re

def process_line(line):
    """
    Split one tab-separated line into its questions.

    Fields containing 'a:' are dropped; every 'q:' marker and every newline
    character is removed from the fields that remain.
    """
    return [
        field.replace('q:', '').replace('\n', '')
        for field in line.split('\t')
        if 'a:' not in field
    ]

# Load the last extracted part and turn every line into a list of questions
file = f"{data_directory}{extracted_files[-1]}"
print(file)
with open(file) as f:
    lines = [process_line(line) for line in f]

/Users/erikbeerepoot/data/part-00038


In each of the downloaded `.gz` files, there are sets of clustered questions. For example:

```
<SNIP>
q:Calories in a handful of strawberries?	q:Calories in handful of strawberries?	q:How many calories are in 1cup of strawberrys?	q:How many calories are in a handful of strawberries? 
</SNIP>
```

While the data are not perfect (some are more similar than others), they look reasonable overall. We want to:
1. Read in each of the clusters, and generate pairs of similar questions
2. Randomly sample from other clusters to generate pairs of dissimilar questions 
3. Train our model on this dataset

In order to maintain a balanced dataset, we want to create as many negative examples as there are positive examples. Hence, we compute the number of permutations (question pairs) we obtain and generate the same number of non-similar pairs.

In [76]:
import itertools


def generate_question_pairs(cluster):
    """
    Generate permutations of length two (i.e. ordered question pairs).

    cluster -- The questions to pair up

    Returns a list of 2-tuples; every pair of positions appears in both orders.
    """
    ordered_pairs = itertools.permutations(cluster, 2)
    return [*ordered_pairs]


def generate_uniform_random_indices(low, high, count):
    """
    Returns a list of integers using the discrete uniform distribution, without repeats.
    low -- Starting index (inclusive)
    high -- Ending index (exclusive)
    count -- The number of integers to generate

    Raises ValueError if `count` exceeds the number of integers in [low, high).
    """
    # Imported here so the function does not depend on imports made elsewhere.
    import random

    if count <= 0:
        return []

    available = high - low
    if count > available:
        raise ValueError(
            f"Cannot draw {count} distinct integers from a range of size {max(available, 0)}"
        )

    # Sample from the requested range [low, high) rather than from [0, count),
    # which would always yield a shuffled copy of the first `count` indices.
    return random.sample(range(low, high), count)


def assemble_distinct_question_pairs(current_cluster_index, num_to_generate,
                                     all_clusters):
    """
    Returns a list of pairs of dissimilar questions (i.e. negative examples)
    current_cluster_index -- The index of the current cluster we're processing
                             (accepted but not used by this function)
    num_to_generate -- The number of clusters to draw one question from
    all_clusters -- The set of all clusters of questions

    One question is picked at random from each drawn cluster; empty clusters
    are skipped. The picked questions are then paired with each other.
    """
    drawn_indices = generate_uniform_random_indices(0, len(all_clusters),
                                                    num_to_generate)

    picked_questions = [
        all_clusters[i][np.random.randint(0, len(all_clusters[i]))]
        for i in drawn_indices
        if len(all_clusters[i]) > 0
    ]
    return generate_question_pairs(picked_questions)


# Generate question pairs for both semantically similar questions and distinct (dissimilar) questions
similar_question_pairs = []
distinct_question_pairs = []

# Only the first 1500 clusters are processed.
foo = lines[0:1500]
# enumerate() yields each cluster's position directly. Looking it up with
# lines.index(line) costs a linear scan per cluster and returns the first
# match, which is the wrong index when two clusters are identical.
for cluster_index, line in enumerate(foo):
    number_of_distinct_phrasings = len(line)
    question_pairs = generate_question_pairs(line)

    # extend() grows the lists in place instead of copying them every iteration.
    similar_question_pairs.extend(question_pairs)
    distinct_question_pairs.extend(
        assemble_distinct_question_pairs(
            cluster_index, number_of_distinct_phrasings, lines
        )
    )

# Every element is one (question1, question2) tuple, so the list length is the
# number of pairs; summing len() over the tuples would count each pair twice.
num_similar_question_pairs = len(similar_question_pairs)
num_distinct_question_pairs = len(distinct_question_pairs)
# The dataset is only balanced if both classes have the same number of pairs.
assert num_similar_question_pairs == num_distinct_question_pairs

Now that we have the question pairs, we can assemble them into our input dataframe & write to disk.

In [77]:
import pandas as pd

# Destination of the shuffled, labelled question-pair dataset
out_path = os.path.expanduser('~/data/question_pair_dataframe.csv')


def _labelled_pairs(pairs, label):
    """Build a question1/question2 frame from `pairs` with a constant `labels` column."""
    frame = pd.DataFrame(pairs, columns=['question1', 'question2'])
    frame.insert(2, "labels", label)
    return frame


# Similar pairs get label 1, distinct pairs get label 0
similar_df = _labelled_pairs(similar_question_pairs, 1)
distinct_df = _labelled_pairs(distinct_question_pairs, 0)

# Stack both classes, shuffle all rows, renumber them, and save without the index
combined_df = (
    pd.concat([similar_df, distinct_df], axis=0, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
combined_df.to_csv(out_path, index=False)