# Preprocess

1. Preprocess content
2. Convert .json to data frames
3. Get features

## Preprocess JSON data

1.   Statistics
2.   Convert to pandas dataframes

In [2]:
'''
    Load raw data in .json format
'''
import json

def _load_json(path):
    '''
        Parse one JSON file and return the decoded object.
        The file is opened in a `with` block so the handle is closed
        as soon as parsing finishes (the bare json.load(open(...)) form leaks it).
    '''
    with open(path) as json_file:
        return json.load(json_file)


# document_map is iterated with .items() further down; the two dataset maps
# are indexed with 'queries' / 'labels' by json2csv.
document_map = _load_json('./data/raw/documents.json')
training_map = _load_json('./data/raw/trainingset.json')
validation_map = _load_json('./data/raw/validationset.json')

In [2]:
# Number of training queries and the largest len() among their raw texts
query_len_list = [len(query_text) for query_text in training_map['queries'].values()]
print(len(query_len_list))
print(max(query_len_list))

30000
186


In [None]:
# Number of documents and the largest len() among their raw values
doc_len_list = [len(doc_value) for doc_value in document_map.values()]
print(len(doc_len_list))
print(max(doc_len_list))

In [3]:
# Prepare for preprocessing
from preprocess import TextPreprocessor as TPP
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode
import nltk

In [6]:
# Fetch the NLTK resources: stop-word list, Punkt tokenizer, POS tagger, WordNet.
# NOTE(review): presumably these are what preprocess.TextPreprocessor relies on — confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Last bare expression of the cell: its return value is what the cell displays.
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
'''
  Preprocess documents.json and convert to .csv
'''

# Destination of the preprocessed documents table
csv_folder = './data/csv/'
document_csv_file = csv_folder + 'documents.csv'

# Module-level preprocessor instance; func() below reads it as a global
tpp = TPP()

from multiprocessing import Manager, Process, Pool

def func(proc_idx, data_list, start_idx, end_idx, ret_dict):
    '''
        Worker: preprocess data_list[start_idx:end_idx] and publish the result.

        proc_idx  -- key under which the resulting frame is stored in ret_dict
        data_list -- sequence of (doc_id, doc_text) pairs
        start_idx, end_idx -- half-open slice of data_list handled by this worker
        ret_dict  -- shared mapping; receives a DataFrame with columns
                     ['doc_id', 'doc_text'] indexed from 0

        Each text is passed through unidecode and the module-level `tpp`
        preprocessor, and the returned tokens are joined with single spaces.
        Rows are collected in a list and the frame is built once, instead of
        enlarging a DataFrame one .loc assignment at a time.
    '''
    records = [
        (doc_id, ' '.join(tpp.preprocess(unidecode(doc_text))))
        for (doc_id, doc_text) in tqdm(data_list[start_idx:end_idx])
    ]
    ret_dict[proc_idx] = pd.DataFrame(records, columns=['doc_id', 'doc_text'])
    
data_list = list(document_map.items())
data_list_len = len(data_list)
proc_num = 8
# Ceiling division: with floor division the last (data_list_len % proc_num)
# documents fell outside every chunk and were silently dropped.
chunk_size = (data_list_len + proc_num - 1) // proc_num

pool = Pool(proc_num)
manager = Manager()
ret_dict = manager.dict()

async_results = []
for proc_idx in range(proc_num):
    start_idx = proc_idx * chunk_size
    end_idx = min(data_list_len, start_idx + chunk_size)
    # Send each worker only its own slice rather than the full list.
    chunk = data_list[start_idx:end_idx]
    async_results.append(
        pool.apply_async(func, args=(proc_idx, chunk, 0, len(chunk), ret_dict))
    )

pool.close()
pool.join()

# apply_async stores worker exceptions on the result object; .get() re-raises
# them here instead of failing later with a KeyError on ret_dict.
for async_result in async_results:
    async_result.get()

# Chunks are concatenated in worker order; each keeps its own 0-based index.
doc_frame = pd.concat([ret_dict[i] for i in range(proc_num)], axis=0)
doc_frame.to_csv(document_csv_file)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 62500/62500 [07:54<00:00, 131.62it/s]
100%|██████████| 62500/62500 [07:53<00:00, 131.90it/s]
100%|██████████| 62500/62500 [07:55<00:00, 131.41it/s]
100%|██████████| 62500/62500 [07:55<00:00, 131.46it/s]
100%|██████████| 62500/62500 [07:54<00:00, 131.76it/s]
100%|██████████| 62500/62500 [07:55<00:00, 131.51it/s]
100%|██████████| 62500/62500 [08:00<00:00, 130.06it/s]
100%|██████████| 62500/62500 [08:05<00:00, 128.76it/s]


In [8]:
'''
    Preprocess json dataset with labels and convert to .csv file
'''

def json2csv(data_map):
    '''
        Convert one labelled dataset map into a DataFrame.

        data_map -- mapping with two entries:
                    'queries': {query_id: query_text}
                    'labels' : {query_id: iterable of label strings}

        Returns a frame with columns ['query_id', 'query_text', 'query_label'],
        one row per query in the iteration order of data_map['queries'].
        query_text is the unidecoded, preprocessed text joined with spaces;
        query_label is the labels joined with spaces, or NaN when the query
        has no entry in 'labels'.

        Raises ValueError when 'labels' contains an id that is not a query.

        Labels are matched to queries by id, so the result no longer depends
        on both maps sharing the same key order (previously enforced with an
        assert, which is skipped under `python -O`).
    '''
    queries = data_map['queries']
    labels = data_map['labels']

    unknown_ids = [query_id for query_id in labels if query_id not in queries]
    if unknown_ids:
        raise ValueError(
            'labels reference unknown query ids, e.g. {}'.format(unknown_ids[:5])
        )

    tpp = TPP()
    records = []
    for (query_id, query_text) in tqdm(queries.items()):
        query_label = labels.get(query_id)
        records.append((
            query_id,
            ' '.join(tpp.preprocess(unidecode(query_text))),
            # NaN mirrors what the unlabeled rows held in the row-by-row version
            ' '.join(query_label) if query_label is not None else float('nan'),
        ))

    # Build the frame once instead of enlarging it row by row with .loc
    return pd.DataFrame(records, columns=['query_id', 'query_text', 'query_label'])

In [9]:
csv_folder = './data/csv/'
training_csv = csv_folder + 'training.csv'
validation_csv = csv_folder + 'validation.csv'

# Training set first, then validation set — same order as before
for dataset_map, target_csv in ((training_map, training_csv), (validation_map, validation_csv)):
    json2csv(dataset_map).to_csv(target_csv)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 30000/30000 [01:01<00:00, 489.88it/s]
100%|██████████| 30000/30000 [00:04<00:00, 6033.41it/s]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package average

## Prepare for Word2Vec model

1.   Dictionary preparation
2.   Corpus preparation
3.   TF-IDF preparation

In [10]:
from gensim import corpora, similarities, models
import pandas as pd
from tqdm import tqdm

In [18]:
'''
    Prepare for dictionary, TF-IDF model, sprase matrix
'''

csv_folder = './data/csv/'
document_csv_file = csv_folder + 'documents.csv'
document_pd = pd.read_csv(document_csv_file)
# A document whose preprocessed text is empty is written as an empty CSV cell
# and read back as NaN; normalise to '' so .split() below cannot fail on a float.
document_pd['doc_text'] = document_pd['doc_text'].fillna('')
# pool all text from documents
raw_text = document_pd['doc_text'].values.tolist()
# pool all words from documents (one token list per document, row order kept)
text_pool = [line.split() for line in raw_text]

In [19]:
'''
    Clean corpus
'''
from collections import defaultdict
# Drop tokens whose total count over the whole corpus is exactly one
word_freq = defaultdict(int)
for token in (token for tokens in text_pool for token in tokens):
    word_freq[token] += 1
text_pool = [
    list(filter(lambda token: word_freq[token] > 1, tokens))
    for tokens in text_pool
]

In [21]:
'''
    Save dictionary
'''
# Folder that receives every artifact written by this cell
dict_path = './model/Word2Vec/'

print('Initializing ...')
# Token <-> id mapping built from the pruned token lists
dictionary = corpora.Dictionary(text_pool)
# Bag-of-words representation, one entry per document in text_pool order
corpus = [dictionary.doc2bow(line) for line in text_pool]
tfidf_model = models.TfidfModel(corpus, dictionary=dictionary)
# TF-IDF weighted view of the corpus, consumed by the similarity index below
corpus_tfidf = tfidf_model[corpus]
print("Initialized")

# Save dict and model
dictionary.save(dict_path + 'dictionary.dict')
tfidf_model.save(dict_path + 'tfidf.model')
corpora.MmCorpus.serialize(dict_path + 'corpus.mm', corpus)
# Width of the sparse index = number of distinct tokens in the dictionary
num_features = len(dictionary.token2id.keys())
# Similarities of sparse matrix
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=num_features)
index.save(dict_path + 'index.index')

Initializing ...
Initialized


In [22]:
# Number of distinct tokens in the dictionary built in the previous cell
print(num_features)

225512


## Feature Engineering Preparation

Corpus, Word Vectors, Models loading.

* Word2Vec
* TF-IDF
* BM25

In [1]:
from gensim import corpora, similarities, models
from gensim.models import Word2Vec
import pandas as pd
from tqdm import tqdm
from gensim.summarization.bm25 import BM25
import numpy as np
import math
import collections
from multiprocessing import cpu_count, Pool
import Levenshtein
import textdistance

'''
    Initialize models
'''
# Word vectors in binary word2vec format (loaded with binary=True)
vec_model_path = './model/Word2Vec/GoogleNews-vectors-negative300.bin.gz'
print('Loading models...')
g_vec_model = models.KeyedVectors.load_word2vec_format(vec_model_path, binary=True)
# The three artifacts below are the files written by the "Save dictionary" cell
g_dictionary = corpora.Dictionary.load('./model/Word2Vec/dictionary.dict')
g_tfidf_model = models.TfidfModel.load("./model/Word2Vec/tfidf.model")
g_index = similarities.SparseMatrixSimilarity.load('./model/Word2Vec/index.index')
print('Loaded')

'''
    Pool all text from documents
'''
csv_folder = './data/csv/'
document_csv_file = csv_folder + 'documents.csv'
document_pd = pd.read_csv(document_csv_file)
# An empty preprocessed text is stored as an empty CSV cell and read back as NaN.
# Normalise to '' here: both the .split() below and every feature helper fed
# from document_pd (via init_dict) expect a string.
document_pd['doc_text'] = document_pd['doc_text'].fillna('')

# pool all items from documents
raw_text = document_pd['doc_text'].values.tolist()
text_pool = [line.split() for line in raw_text]

from collections import defaultdict
# Drop tokens whose total count over the whole corpus is exactly one
word_freq = defaultdict(int)
for token in (token for tokens in text_pool for token in tokens):
    word_freq[token] += 1
text_pool = [
    list(filter(lambda token: word_freq[token] > 1, tokens))
    for tokens in text_pool
]

'''
    Load BM25 model
'''
from gensim.corpora.mmcorpus import MmCorpus
from gensim.test.utils import datapath

dict_path = './model/Word2Vec/'

# Serialized bag-of-words corpus written by the dictionary cell
g_corpus = MmCorpus(dict_path + 'corpus.mm')
# BM25 is fitted on the pruned token lists, in document_pd row order
g_bm25_model = BM25(text_pool)
# g_vec_bm25_model = BM25(g_corpus)
# Mean of all per-token idf values held by the BM25 model
g_average_idf = sum(float(idf_value) for idf_value in g_bm25_model.idf.values()) / len(g_bm25_model.idf)

Loading models...
Loaded


In [2]:
'''
    Features computation utils
'''
def get_len(x):
    '''
        Number of whitespace-separated tokens in the string x
    '''
    return len(x.split())


def get_token_cnt(x, y):
    '''
        Count how many tokens of y (duplicates included) also occur in x.

        x, y -- whitespace-separated strings
        Returns an int between 0 and the number of tokens in y.
    '''
    # Set membership is O(1); testing against the token list made this quadratic
    x_vocab = set(x.split())
    return sum(1 for token in y.split() if token in x_vocab)


def get_token_cnt_ratio(x, y):
    '''
        Ratio of the count y to the number of tokens in the string x.

        x -- whitespace-separated string
        y -- numeric count (e.g. the result of get_token_cnt)
        Returns y / len(tokens), or 0.0 when x has no tokens (previously a
        ZeroDivisionError).
    '''
    token_total = len(x.split())
    if token_total == 0:
        return 0.0
    return y / token_total


def get_jaccard_sim(x, y):
    '''
        Jaccard similarity |x & y| / |x | y| between two token collections.

        x, y -- either whitespace-separated strings or iterables of tokens.
                Strings are tokenised first: calling set() directly on a
                string yields its characters, which turned this feature into
                a character-overlap score for the raw texts passed by
                get_features.
        Returns a float in [0, 1]; 0.0 when both inputs are empty
        (previously a ZeroDivisionError).
    '''
    x = set(x.split() if isinstance(x, str) else x)
    y = set(y.split() if isinstance(y, str) else y)
    union = x | y
    if not union:
        return 0.0
    return float(len(x & y) / len(union))


def get_mat_cos_sim(doc, corpus):
    '''
        TF-IDF similarity between the text `doc` and the single text `corpus`.
        Both strings are split on single spaces, mapped to bag-of-words with
        g_dictionary and weighted with g_tfidf_model; a one-document sparse
        index is built from `corpus` and queried with `doc`.
    '''
    corpus_tokens = corpus.split(' ')
    doc_tokens = doc.split(' ')

    corpus_bow = [g_dictionary.doc2bow(corpus_tokens)]
    doc_bow = g_dictionary.doc2bow(doc_tokens)

    single_doc_index = similarities.SparseMatrixSimilarity(
        g_tfidf_model[corpus_bow],
        num_features=len(g_dictionary.token2id),
    )
    # The index holds exactly one document, so the first score is the answer
    return single_doc_index.get_similarities(g_tfidf_model[doc_bow])[0]


def get_weight_counter_and_tf_idf(x, y):
    '''
        Build aligned count / pseudo TF-IDF vectors for a pair of texts.

        x, y -- whitespace-separated strings

        The vocabulary is every distinct token of x followed by y, in first-seen
        order. For each token the weight is its count over BOTH texts combined,
        placed in a text's vector only when that text contains the token
        (0 otherwise). The idf factor is log(3 / (1 + texts containing the
        token)) + 1, i.e. log(1.5) + 1 for a token in one text and 1 for a
        token in both.

        Returns a list of five numpy arrays:
        [x_counts, y_counts, x_counts * idf, y_counts * idf, vocabulary]
    '''
    x_tokens = x.split()
    y_tokens = y.split()
    combined_counts = collections.Counter(x_tokens + y_tokens)

    # Sets give O(1) membership; the token lists made every lookup linear
    x_vocab = set(x_tokens)
    y_vocab = set(y_tokens)

    words = list(combined_counts.keys())
    x_weight = np.array([combined_counts[w] if w in x_vocab else 0 for w in words])
    y_weight = np.array([combined_counts[w] if w in y_vocab else 0 for w in words])
    idfs = np.array([
        math.log(3.0 / (1 + (w in x_vocab) + (w in y_vocab))) + 1
        for w in words
    ])
    return [x_weight, y_weight, x_weight * idfs, y_weight * idfs, np.array(words)]


def get_manhattan_distance(x, y):
    '''
        L1 norm of the difference x - y
    '''
    difference = x - y
    return np.linalg.norm(difference, ord=1)


def get_cos_sim(x, y):
    '''
        Dot product of x and y divided by the product of their norms
    '''
    numerator = np.dot(x, y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    return numerator / denominator

   
def get_euclidean_sim(x, y):
    '''
        Euclidean (L2) distance between two vectors: sqrt(sum((x - y) ** 2)).

        x, y -- array-likes of equal shape (a scalar NaN on either side
                propagates to a NaN result).

        The square is taken element-wise BEFORE summing. The earlier form,
        np.sum(x - y) ** 2, squared the sum instead and so returned
        |sum(x - y)|, letting positive and negative differences cancel.
    '''
    difference = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(difference ** 2))


def get_tfidf_sim(query, doc):
    '''
        Pairwise distances between query and document weight vectors.
        For every (query, doc) pair the count and pseudo TF-IDF vectors from
        get_weight_counter_and_tf_idf are compared with three metrics.
        Returns six lists, in this order:
        manhattan(count), manhattan(tfidf), cosine(count), cosine(tfidf),
        euclidean(count), euclidean(tfidf)
    '''
    pair_weights = [
        get_weight_counter_and_tf_idf(query_text, doc_text)
        for query_text, doc_text in zip(tqdm(query), doc)
    ]
    # Entries 0/1 are the raw count vectors, 2/3 the idf-scaled ones
    counter_pairs = [(weights[0], weights[1]) for weights in pair_weights]
    tfidf_pairs = [(weights[2], weights[3]) for weights in pair_weights]

    def apply_metric(metric, vector_pairs):
        return [metric(left, right) for left, right in vector_pairs]

    mht_sim_counter = apply_metric(get_manhattan_distance, counter_pairs)
    mht_sim_tfidf = apply_metric(get_manhattan_distance, tfidf_pairs)

    cos_sim_counter = apply_metric(get_cos_sim, counter_pairs)
    cos_sim_tfidf = apply_metric(get_cos_sim, tfidf_pairs)

    euclidean_sim_counter = apply_metric(get_euclidean_sim, counter_pairs)
    euclidean_sim_tfidf = apply_metric(get_euclidean_sim, tfidf_pairs)

    return mht_sim_counter, mht_sim_tfidf, cos_sim_counter, cos_sim_tfidf, euclidean_sim_counter, euclidean_sim_tfidf


def get_word_vec(x):
    '''
        Mean of the g_vec_model vectors of the tokens of x that the model knows.
        Returns np.nan when none of the tokens is in the model.
    '''
    known_vectors = [g_vec_model[token] for token in x.split() if token in g_vec_model]
    if not known_vectors:
        return np.nan
    return np.mean(np.array(known_vectors), axis=0)


def get_df_grams(train_sample, values, cols):
    '''
        N-gram overlap features between two text columns.

        train_sample -- DataFrame holding both columns
        values       -- n-gram order
        cols         -- [first column name, second column name]

        Returns two lists with one entry per row:
        * size of the symmetric difference of the two n-gram sets
        * share of the first column's n-grams also found in the second
          (0 when the first column has no n-grams)
    '''
    def ngram_sets(column, order):
        # One set of token tuples per row; single quotes are stripped first
        per_row = []
        for sentence in column.values:
            tokens = str(sentence).replace("'", '').split(' ')
            shifted = [tokens[offset:] for offset in range(order)]
            per_row.append(set(zip(*shifted)))
        return per_row

    first_grams = ngram_sets(train_sample[cols[0]], values)
    second_grams = ngram_sets(train_sample[cols[1]], values)

    sim = []
    sim_number_rate = []
    for left, right in zip(first_grams, second_grams):
        shared = len(left & right)
        sim.append(len(left) + len(right) - 2 * shared)
        sim_number_rate.append(shared / len(left) if len(left) != 0 else 0)
    return sim, sim_number_rate


def get_token_matched_features(query, title):
    '''
        Twelve token-overlap features for a (query, title) pair.

        query, title -- whitespace-separated strings

        Returns the tuple
        (unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc,
         same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t,
         set_rate)
        All twelve values are 0 when the two texts share no token. That check
        now runs before any division, so two empty texts yield zeros instead of
        a ZeroDivisionError.
    '''
    q_list = query.split()
    t_list = title.split()
    set_query = set(q_list)
    set_title = set(t_list)

    # Title tokens (duplicates kept) that also occur in the query
    comwords = [word for word in t_list if word in set_query]
    if len(comwords) == 0:
        return (0,) * 12
    comwords_set = set(comwords)

    # Distinct shared tokens over distinct tokens of both texts
    count_words = len(set_query.union(set_title))
    unique_rate = len(comwords_set) / count_words

    # Shared occurrences on each side over the total number of tokens
    same_word1 = [w for w in q_list if w in set_title]
    same_word2 = comwords
    same_len_rate = (len(same_word1) + len(same_word2)) / \
        (len(q_list) + len(t_list))

    com_index1 = len(comwords)
    same_word_q = com_index1 / len(q_list)
    same_word_t = com_index1 / len(t_list)

    # Position scores: the shared-occurrence count plus the sum of 0-based
    # positions of every shared token, normalised by text length
    for word in comwords_set:
        index_list = [i for i, x in enumerate(q_list) if x == word]
        com_index1 += sum(index_list)
    q_loc = com_index1 / (len(q_list) * len(comwords))
    com_index2 = len(comwords)
    for word in comwords_set:
        index_list = [i for i, x in enumerate(t_list) if x == word]
        com_index2 += sum(index_list)
    t_loc = com_index2 / (len(t_list) * len(comwords))

    same_w_set_q = len(comwords_set) / len(set_query)
    same_w_set_t = len(comwords_set) / len(set_title)
    word_set_rate = 2 * len(comwords_set) / \
        (len(set_query) + len(set_title))

    # Same position scores, using only the first occurrence of each token
    com_set_query_index = len(comwords_set)
    for word in comwords_set:
        index_list = [i for i, x in enumerate(q_list) if x == word]
        if len(index_list) > 0:
            com_set_query_index += index_list[0]
    loc_set_q = com_set_query_index / (len(q_list) * len(comwords_set))
    com_set_title_index = len(comwords_set)
    for word in comwords_set:
        index_list = [i for i, x in enumerate(t_list) if x == word]
        if len(index_list) > 0:
            com_set_title_index += index_list[0]
    loc_set_t = com_set_title_index / (len(t_list) * len(comwords_set))
    set_rate = (len(comwords_set) / len(comwords))

    return unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t, set_rate


def get_substr_features(query, title):
    '''
        Features describing the longest run of consecutive tokens shared by
        query and title (both whitespace-separated strings).

        Returns the tuple
        (max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len)
        where max_count1 is the length of the longest shared run and every
        other value is 0 when the texts share no token.
    '''
    q_list = query.split()
    query_len = len(q_list)
    t_list = title.split()
    title_len = len(t_list)
    # count1[i][j]: length of the matching run ending at query token i / title token j (1-based)
    count1 = np.zeros((query_len + 1, title_len + 1))
    # index[i][j]: sum of the 1-based title positions covered by that run
    index = np.zeros((query_len + 1, title_len + 1))
    for i in range(1, query_len + 1):
        for j in range(1, title_len + 1):
            if q_list[i - 1] == t_list[j - 1]:
                # Extend the run that ended on the previous diagonal cell
                count1[i][j] = count1[i - 1][j - 1] + 1
                index[i][j] = index[i - 1][j - 1] + j
            else:
                count1[i][j] = 0
                index[i][j] = 0
    max_count1 = count1.max()

    if max_count1 != 0:
        # First cell (row-major order) where the longest run ends
        row = int(np.where(count1 == np.max(count1))[0][0])
        col = int(np.where(count1 == np.max(count1))[1][0])
        # Average title position of the longest run, relative to title length
        mean_pos = index[row][col] / (max_count1 * title_len)
        # 1-based title position where the longest run starts, relative to title length
        begin_loc = (col - max_count1 + 1) / title_len
        rows = np.where(count1 != 0.0)[0]
        cols = np.where(count1 != 0.0)[1]
        # Accumulated position sums over every cell that is part of some run
        total_loc = 0
        for i in range(0, len(rows)):
            total_loc += index[rows[i]][cols[i]]
        density = total_loc / (query_len * title_len)
        rate_q_len = max_count1 / query_len
        rate_t_len = max_count1 / title_len
    else:
        begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len = 0, 0, 0, 0, 0, 0
    return max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len


def get_common_words(query, title):
    '''
        Number of distinct tokens that occur in both strings
    '''
    return len(set(query.split()).intersection(title.split()))


def get_bm25_group(df):
    '''
        BM25 score of every row's document against its query, using a BM25
        model fitted only on the documents that share the row's query_id.

        df -- three-column frame holding, in this order, query id, query text
              and document text (the columns are renamed positionally).
              Missing query ids are treated as one group named 'always_nan'.

        Returns a list of floats with one score per row, IN ROW ORDER.
        Iterating a groupby yields groups in sorted-key order, so extending a
        flat list group by group only lines up with the rows when the input is
        already sorted by query id; scores are therefore written back to each
        row's own position. The caller's frame is not modified.
    '''
    frame = df.copy()
    frame.columns = ['query_id', 'query_text', 'doc_text']
    frame['query_id'] = frame['query_id'].fillna('always_nan')

    bm_scores = np.empty(len(frame), dtype=float)
    # .indices maps each group key to the positional row numbers of its members
    group_positions = frame.groupby('query_id', sort=False).indices
    for _, positions in tqdm(group_positions.items()):
        group = frame.iloc[positions]
        group_corpus = [sentence.strip().split() for sentence in group['doc_text'].values.tolist()]
        # Every row of a group carries the same query text; use the first
        query = group['query_text'].values[0].strip().split()
        group_bm25_model = BM25(group_corpus)
        # group_average_idf = sum(map(lambda k: float(group_bm25_model.idf[k]), group_bm25_model.idf.keys())) / len(group_bm25_model.idf.keys())
        bm_scores[positions] = group_bm25_model.get_scores(query) # group_average_idf)

    return bm_scores.tolist()


def get_bm25_overall(doc_id, query_text):
    '''
        BM25 score of one document for a query, using the corpus-wide model.
        The query is split on single spaces; doc_id is translated to its row
        position in the corpus through document_id_2_idx.
    '''
    query_tokens = query_text.split(' ')
    corpus_position = document_id_2_idx[doc_id]
    return g_bm25_model.get_score(query_tokens, corpus_position) #g_average_idf

## Getting Features

For each pair of query and document, we have features below:

* Jaccard similarity
* Levenshtein distance
* Sparse matrix cosine similarity
* TF-IDF similarities
* Word vectors similarities
* N-gram
* Tokens features
* BM25 in group and overall

In [3]:
'''
    Compute all features
''' 

def get_features(feature_data):
    '''
        Compute every similarity feature for a frame of (query, document) pairs.

        feature_data -- DataFrame with columns 'query_id', 'doc_id' and
                        'relevance'. The input is copied, not modified.

        Texts are looked up in the module-level query_map / document_map, so
        init_dict() must have been called first.

        Returns a frame with 'query_id', 'doc_id', 'relevance' and every
        column whose name contains 'feat_'.
    '''
    data = feature_data.copy()
    feat_prefix = 'feat_'

    # get text for each id
    data['query_text'] = data['query_id'].apply(lambda query_id: query_map[query_id])
    data['doc_text'] = data['doc_id'].apply(lambda doc_id: document_map[doc_id])
    
    data['query_len'] = data['query_text'].apply(get_len)
    data['doc_len'] = data['doc_text'].apply(get_len)

    # Mean word vector per text; np.nan when no token has a vector
    data['query_vec'] = data['query_text'].apply(lambda x: get_word_vec(x))
    data['doc_vec'] = data['doc_text'].apply(lambda x: get_word_vec(x))

    # String-level similarities on the raw texts
    data[feat_prefix + 'jaccard_sim'] = list(map(get_jaccard_sim, data['query_text'], data['doc_text']))
    data[feat_prefix + 'edit_distance'] = list(map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1), tqdm(data['query_text']), data['doc_text']))
    data[feat_prefix + 'edit_jaro'] = list(map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['query_text']), data['doc_text']))
    data[feat_prefix + 'edit_ratio'] = list(map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['query_text']), data['doc_text']))
    data[feat_prefix + 'edit_jaro_winkler'] = list(map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['query_text']), data['doc_text']))
    data[feat_prefix + 'hamming'] = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y), tqdm(data['query_text']), data['doc_text']))

    data[feat_prefix + 'mat_cos_sim'] = list(map(lambda x, y: get_mat_cos_sim(x, y), tqdm(data['query_text']), data['doc_text']))

    data[feat_prefix + 'mht_sim'], data[feat_prefix + 'tf_mht_sim'], \
    data[feat_prefix + 'cos_sim'], data[feat_prefix + 'tf_cos_sim'], \
    data[feat_prefix + 'euc_sim'], data[feat_prefix + 'tf_euc_sim'] \
        = get_tfidf_sim(data['query_text'], data['doc_text'])
    
    # Distances between the mean word vectors
    data[feat_prefix + 'cos_mean_word2vec'] = list(map(get_cos_sim, tqdm(data['query_vec']), data['doc_vec']))
    data[feat_prefix + 'cos_mean_word2vec'] = data[feat_prefix + 'cos_mean_word2vec'].apply(lambda x: np.nan if np.isnan(x).any() else x)
    data[feat_prefix + 'euc_mean_word2vec'] = list(map(get_euclidean_sim, tqdm(data['query_vec']), data['doc_vec']))
    # Only the NaN-guarded Manhattan distance is kept. The unguarded pass that
    # used to precede it was overwritten by this line anyway, and it hands
    # np.linalg.norm(..., ord=1) a scalar when both vectors are missing.
    data[feat_prefix + 'mhd_mean_word2vec'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else get_manhattan_distance(x, y), tqdm(data['query_vec']), data['doc_vec']))
    data[feat_prefix + '2_gram_sim'], data[feat_prefix + '2_sim_number_rate'] = get_df_grams(data, 2, ['query_text', 'doc_text'])
    
    data[feat_prefix + '3_gram_sim'], data[feat_prefix + '3_sim_number_rate'] = get_df_grams(data, 3, ['query_text', 'doc_text'])
    
    '''
    data[feat_prefix + 'query_token_matched_cnt'] = list(map(get_token_cnt, data['doc_text'], data['query_text']))
    data[feat_prefix + 'query_token_matched_cnt_ratio'] = list(map(get_token_cnt_ratio, data['query_text'], data['feat_query_token_matched_cnt']))
    data[feat_prefix + "ls_max_count"], data[feat_prefix + "ls_local_begin"], data[feat_prefix + "ls_local_mean"], data[feat_prefix+"ls_total_loc"], data[feat_prefix + "ls_density"], data[feat_prefix + "ls_rate_q_len"], data[feat_prefix + "ls_rate_t_len"] = zip(*data.apply(lambda line: get_substr_features(line["query_text"], line["doc_text"]), axis=1))
    data[feat_prefix + 'common_words'] = list(map(get_common_words, data['doc_text'], data['query_text']))
    data[feat_prefix + 'common_words_rate_q'] = data[feat_prefix + 'common_words'] / data['query_len']
    data[feat_prefix + 'common_words_rate_d'] = data[feat_prefix + 'common_words'] / data['doc_len']
    data[feat_prefix + "unique_rate"], data[feat_prefix + "same_len_rate"], data[feat_prefix + "same_word_q"], data[feat_prefix + "same_word_t"], data[feat_prefix + "q_loc"], data[feat_prefix + "t_loc"], data[feat_prefix + "same_w_set_q"], data[feat_prefix + "same_w_set_t"], data[feat_prefix + "word_set_rate"], data[feat_prefix + "loc_set_q"], data[feat_prefix + "loc_set_t"], data[feat_prefix + "set_rate"] = zip(*data.apply(lambda line: get_token_matched_features(line["query_text"], line["doc_text"]), axis=1))
    '''
    
    data[feat_prefix + 'bm25_group'] = get_bm25_group(data[['query_id', 'query_text', 'doc_text']])
    data[feat_prefix + 'bm25_overall'] = list(map(get_bm25_overall, tqdm(data['doc_id']), data['query_text']))
    
    # Keep the identifiers, the target and every generated feature column
    feat = ['query_id', 'doc_id', 'relevance']
    for col in data.columns:
        if col.find(feat_prefix) != -1:
            feat.append(col)

    data = data[feat]
    
    return data

In [4]:
'''
    Documents and queries map from id to text
'''
# id -> preprocessed text lookups, filled by init_dict().
# Note: this rebinds document_map, which earlier held the raw JSON documents.
document_map = {}
query_map = {}
# map doc_id to doc index in corpus
document_id_2_idx = {}

def init_dict(query_export_file):
    '''
        Fill the module-level lookup tables.
        Documents come from document_pd (row position becomes the corpus
        index); queries are read from the CSV at query_export_file, which must
        provide 'query_id' and 'query_text' columns. Existing entries are kept
        and overwritten on id collision.
    '''
    doc_rows = zip(document_pd['doc_id'], document_pd['doc_text'])
    for doc_idx, (doc_id, doc_text) in enumerate(doc_rows):
        document_map[doc_id] = doc_text
        document_id_2_idx[doc_id] = doc_idx

    query_export_pd = pd.read_csv(query_export_file)
    query_map.update(zip(query_export_pd['query_id'], query_export_pd['query_text']))

In [5]:
'''
    Interface for training dataset generation
    * Features computation
'''

def pool_extract(data, f, chunk_size, worker=8):
    '''
        Apply f to consecutive slices of data in a process pool and
        concatenate the results.

        data       -- sliceable table (data[start:end] is what each task gets)
        f          -- function taking one slice and returning a DataFrame
        chunk_size -- rows per task; must be a positive integer
        worker     -- pool size; -1 or anything above the CPU count means
                      "use every core"

        Returns the per-slice frames concatenated in slice order with a fresh
        0..n-1 index.

        Raises ValueError when chunk_size is not positive (the slicing loop
        would otherwise never advance) and re-raises any exception raised
        inside a worker.
    '''
    from multiprocessing import cpu_count,Pool
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer, got {}'.format(chunk_size))
    cpu_worker = cpu_count()
    print('CPU core: {}'.format(cpu_worker))
    if worker == -1 or worker > cpu_worker:
        worker = cpu_worker
    print('Cores used: {}'.format(worker))
    len_data = len(data)
    p = Pool(worker)
    try:
        # Slicing past the end is clipped, so the last chunk may be shorter
        res = [
            p.apply_async(f, args=(data[start:start + chunk_size],))
            for start in range(0, len_data, chunk_size)
        ]
        p.close()
        p.join()
        # .get() re-raises worker exceptions in the parent process
        results = pd.concat([i.get() for i in res], axis=0, ignore_index=True)
    finally:
        # Make sure no worker process outlives a failure
        p.terminate()
    return results

def generate_features(export_file, feature_file, new_set=False, num_worker=8):
    '''
        Compute the feature table in parallel and write it to feature_file.

        export_file  -- CSV with 'doc_id', 'query_id' and 'relevance' columns;
                        only read when new_set is True
        feature_file -- output CSV; also the input when new_set is False
        new_set      -- True: start from export_file; False: recompute from
                        the existing feature_file
        num_worker   -- number of pool processes and of roughly equal chunks
                        (defaults to 8, the value that used to be hard-coded)
    '''
    if new_set:
        export_pd = pd.read_csv(export_file)
        feature_pd = export_pd[['doc_id', 'query_id', 'relevance']]
    else:
        feature_pd = pd.read_csv(feature_file)
    # +1 keeps the chunk size positive and the number of chunks <= num_worker
    CHUNK_SIZE = len(feature_pd) // num_worker + 1
    print(f'Chunk size: {CHUNK_SIZE}, Length: {len(feature_pd)}')
    # Single-process alternative: feature_pd = get_features(feature_pd)
    feature_pd = pool_extract(feature_pd, get_features, CHUNK_SIZE, worker=num_worker)
    feature_pd.to_csv(feature_file, index=False)
    print('Features Generation Finished')

In [7]:
'''
    Basic settings for training features generation
'''
NEGATIVE_SCALE = 30  # other values noted by the author: 100 150 200
csv_folder, train_folder = './data/csv/', './data/train/'
dict_file = ''.join((csv_folder, 'training.csv'))
export_file = ''.join((csv_folder, 'export_training_', str(NEGATIVE_SCALE), '.csv'))
feature_file = ''.join((train_folder, 'features_', str(NEGATIVE_SCALE), '.csv'))

# Lookup tables must be filled before the workers compute features
init_dict(dict_file)
generate_features(export_file, feature_file, new_set=True)

Chunk size: 124148, Length: 993178
CPU core: 8
Cores used: 8


100%|██████████| 124148/124148 [00:01<00:00, 70564.47it/s]
100%|██████████| 124148/124148 [00:01<00:00, 69926.23it/s]
100%|██████████| 124148/124148 [00:01<00:00, 71378.63it/s]
100%|██████████| 124148/124148 [00:01<00:00, 70724.47it/s]
100%|██████████| 124148/124148 [00:00<00:00, 549517.41it/s]
100%|██████████| 124148/124148 [00:01<00:00, 67302.72it/s]
100%|██████████| 124148/124148 [00:00<00:00, 540718.33it/s]

100%|██████████| 124148/124148 [00:00<00:00, 531562.58it/s]
100%|██████████| 124148/124148 [00:01<00:00, 70652.76it/s]
100%|██████████| 124148/124148 [00:00<00:00, 565393.55it/s]
100%|██████████| 124148/124148 [00:01<00:00, 70305.98it/s]
  8%|▊         | 9726/124148 [00:00<00:01, 97239.69it/s]/s]
 92%|█████████▏| 113992/124148 [00:00<00:00, 571557.46it/s]
  0%|          | 0/124142 [00:00<?, ?it/s]0, 566920.15it/s]
100%|██████████| 124148/124148 [00:00<00:00, 554841.06it/s]
100%|██████████| 124148/124148 [00:01<00:00, 94747.66it/s]
100%|██████████| 124148/124148 [00:01<00:00, 95

Features Generation Finished


In [9]:
'''
    Basic settings for validation features generation
'''
NEGATIVE_SCALE = 30
csv_folder, train_folder = './data/csv/', './data/train/'
dict_file = ''.join((csv_folder, 'validation.csv'))
export_file = ''.join((csv_folder, 'export_validation_', str(NEGATIVE_SCALE), '.csv'))
feature_file = ''.join((train_folder, 'validation_', str(NEGATIVE_SCALE), '.csv'))

# Lookup tables must be filled before the workers compute features
init_dict(dict_file)
generate_features(export_file, feature_file, new_set=True)

Chunk size: 12354, Length: 98828
CPU core: 8
Cores used: 8


100%|██████████| 12350/12350 [00:00<00:00, 73254.40it/s]
100%|██████████| 12354/12354 [00:00<00:00, 70774.42it/s]
100%|██████████| 12354/12354 [00:00<00:00, 70920.98it/s]

100%|██████████| 12354/12354 [00:00<00:00, 69620.38it/s]

  0%|          | 0/12354 [00:00<?, ?it/s], 70432.71it/s]
100%|██████████| 12350/12350 [00:00<00:00, 593603.87it/s]
  0%|          | 0/12354 [00:00<?, ?it/s], 589285.14it/s]
100%|██████████| 12354/12354 [00:00<00:00, 581292.70it/s]
100%|██████████| 12354/12354 [00:00<00:00, 66995.98it/s]]



100%|██████████| 12354/12354 [00:00<00:00, 574276.91it/s]
100%|██████████| 12354/12354 [00:00<00:00, 559047.46it/s]
100%|██████████| 12354/12354 [00:00<00:00, 90632.06it/s]

  0%|          | 0/12354 [00:00<?, ?it/s], 96800.49it/s]
100%|██████████| 12354/12354 [00:00<00:00, 95021.80it/s]
100%|██████████| 12354/12354 [00:00<00:00, 90913.20it/s]
  0%|          | 0/12354 [00:00<?, ?it/s]
100%|██████████| 12354/12354 [00:00<00:00, 584703.58it/s]
100%|██████████| 12350/12350 [00:

Features Generation Finished
