In [4]:
import os
import pandas as pd

In [5]:
os.makedirs('searches', exist_ok=True)

In [6]:
# cognet
cnv2 = pd.read_csv(
    'CogNet/CogNet-v2.0.tsv',
    sep = '\t'
)
cnv2.columns = [
    'concept id',
    'query_lang', 'word 1',
    'doc_lang', 'word 2',
    'translit 1', 'translit 2'
]

# training and test sets already defined
training = pd.read_csv(
    'CogNet/evaluation/training.tsv',
    sep = '\t'
)
training.columns = [
    'concept id', 'query_lang', 'word_id',
    'group_id', 'lemma', 'transliteration'
]

test = pd.read_csv(
    'CogNet/evaluation/test.tsv',
    sep = '\t'
)
test.columns = [
    'concept id', 'query_lang', 'word_id',
    'group_id', 'lemma', 'transliteration'
]

In [7]:
# use romanization, fallback to wordform if not provided
cnv2['query'] = cnv2['translit 1'].fillna(cnv2['word 1'])
cnv2['doc'] = cnv2['translit 2'].fillna(cnv2['word 2'])

# same thing, romanization with fallback
training['query'] = training['transliteration'].fillna(training['lemma'])

test['query'] = test['transliteration'].fillna(test['lemma'])

In [8]:
# drop columns
cnv2 = cnv2[['query', 'query_lang', 'doc', 'doc_lang']]

training = training[['query', 'query_lang']]

test = test[['query', 'query_lang']]

In [9]:
# clean values
for column in cnv2.columns:
    cnv2[column] = cnv2[column].fillna('')
    cnv2[column] = cnv2[column].astype(str)
    cnv2[column] = cnv2[column].str.strip()
    
for column in training.columns:
    training[column] = training[column].fillna('')
    training[column] = training[column].astype(str)
    training[column] = training[column].str.strip()
    
    test[column] = test[column].fillna('')
    test[column] = test[column].astype(str)
    test[column] = test[column].str.strip()


In [10]:
# drop rows with empty values
cnv2 = cnv2[~(cnv2 == '').any(axis=1)]

training = training[~(training == '').any(axis=1)]

test = test[~(test == '').any(axis=1)]

In [11]:
# get both directions, let any word be a potential query or doc
cnv2_switched = cnv2.copy().rename({
    'query': 'doc',
    'doc': 'query',
    'query_lang': 'doc_lang',
    'doc_lang': 'query_lang'
})
cnv2 = pd.concat([cnv2, cnv2_switched])

In [12]:
# assert unique query-doc pairs, consider both language and word form
cnv2 = cnv2.drop_duplicates()

training = training.drop_duplicates()

test = test.drop_duplicates()

In [13]:
# filter
qrels_train = cnv2[cnv2['query'].isin(training['query'])]

qrels_test = cnv2[cnv2['query'].isin(test['query'])]

qrels_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55456 entries, 578 to 5323296
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   query       55456 non-null  object
 1   query_lang  55456 non-null  object
 2   doc         55456 non-null  object
 3   doc_lang    55456 non-null  object
dtypes: object(4)
memory usage: 2.1+ MB


In [14]:
# make query ids for train
train_unique_queries = qrels_train['query'].unique()
train_query_map = pd.DataFrame({
    'query': train_unique_queries,
    'query_id': [str(i) for i in range(len(train_unique_queries))]
})
qrels_train = qrels_train.merge(train_query_map, on='query', how='left')

# make query ids for test
test_unique_queries = qrels_test['query'].unique()
test_query_map = pd.DataFrame({
    'query': test_unique_queries,
    'query_id': [str(i) for i in range(len(test_unique_queries))]
})
qrels_test = qrels_test.merge(test_query_map, on='query', how='left')

In [15]:
# make doc ids
unique_docs = cnv2['doc'].unique()
doc_map = pd.DataFrame({
    'doc': unique_docs,
    'doc_id': [str(i) for i in range(len(unique_docs))]
})

# merge to cnv2
old_len = len(cnv2)
cnv2 = cnv2.merge(doc_map, on='doc', how='left')
assert len(cnv2) == old_len, (len(cnv2), old_len)

# add to train qrels
old_len = len(qrels_train)
qrels_train = qrels_train.merge(doc_map, on='doc', how='left')
assert len(qrels_train) == old_len, (len(qrels_train), old_len)

# add to test qrels
old_len = len(qrels_test)
qrels_test = qrels_test.merge(doc_map, on='doc', how='left')
assert len(qrels_test) == old_len, (len(qrels_test), old_len)

In [16]:
# relevance judgment
qrels_train['relevance'] = 1

qrels_test['relevance'] = 1

In [17]:
# sort
qrels_train = qrels_train.sort_values(
    by=['query_id', 'doc_id']
)[['query_id', 'query', 'query_lang', 'doc_id', 'doc', 'doc_lang', 'relevance']]

qrels_test = qrels_test.sort_values(
    by=['query_id', 'doc_id']
)[['query_id', 'query', 'query_lang', 'doc_id', 'doc', 'doc_lang', 'relevance']]

In [18]:
# write
assert qrels_train.isna().sum().sum() == 0
qrels_train.to_json('qrels-train-v2.jsonl', lines=True, orient='records')

assert qrels_test.isna().sum().sum() == 0
qrels_test.to_json('qrels-test-v2.jsonl', lines=True, orient='records')

In [19]:
# make queries too
queries = qrels_train[['query_id', 'query', 'query_lang']].drop_duplicates(subset='query_id')
queries['query_id'] = queries['query_id'].astype(int)
assert queries.isna().sum().sum() == 0
queries.sort_values(by='query_id').to_csv('queries-train.tsv', sep='\t', index=False)

queries = qrels_test[['query_id', 'query', 'query_lang']].drop_duplicates(subset='query_id')
queries['query_id'] = queries['query_id'].astype(int)
assert queries.isna().sum().sum() == 0
queries.sort_values(by='query_id').to_csv('queries-test.tsv', sep='\t', index=False)

In [20]:
# index for bktree to search
docs = cnv2[['doc_id', 'doc', 'doc_lang']].drop_duplicates(subset='doc_id')
docs['doc_id'] = docs['doc_id'].astype(int)
assert docs.isna().sum().sum() == 0
docs.sort_values(by='doc_id').to_csv('docs.tsv', sep='\t', index=False)