In [710]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter

In [711]:
meta_charact = ['.', ',', '^', '$', '*', '+', '?', '{', '}', '[', ']', '\\', '|', '(' ,')', '/']

Чистит строки от метасимволов

In [712]:
def clear_data(words_list):
    for ind in range(len(words_list)):
        for char in meta_charact:
            words_list[ind] = words_list[ind].replace(char, '')
    return words_list

### Считаем файлы

In [713]:
train_set = pd.read_csv('relevance_train.csv', sep='\t', names=['query_id', 'document_id', 'relevance'])
train_set = train_set[1:]
train_set.head()

Unnamed: 0,query_id,document_id,relevance
1,1,184,2
2,1,29,2
3,1,31,2
4,1,12,3
5,1,51,3


In [714]:
test_set = pd.read_csv('relevance_test.csv', sep=',', names=['query_id', 'document_id'])
test_set = test_set[1:]
test_set.head()

Unnamed: 0,query_id,document_id
1,126,974
2,126,1326
3,126,187
4,126,969
5,126,970


In [715]:
queries_set = pd.read_csv('queries.csv', sep='\t', names=['query_id', 'query'], header=None, index_col=['query_id'])
queries_set = queries_set[1:]
queries_set.head()

Unnamed: 0_level_0,query
query_id,Unnamed: 1_level_1
1,what similarity laws must be obeyed when const...
2,what are the structural and aeroelastic proble...
4,what problems of heat conduction in composite ...
8,can a criterion be developed to show empirical...
9,what chemical kinetic system is applicable to ...


Приведём файл с документами к удобочитаемому виду. Для каждого документа будет хранить Counter от всех слов в документе, предварительно подготовленных nltk.word_tokenize. Автора, тему, дату не будем учитывать как отдельный признак

In [716]:
documents_set = pd.DataFrame(columns=['document_id', 'content'])

# паттерны для id, date, author, ...
patterns = ['.Id', '.A', '.T', '.B','.W']

In [717]:
with open('Documents.csv') as data_file:
    doc_content = ""
    for line in data_file:
        is_pattern = False
        if (line.startswith(id_pattern)):
            if (doc_content != ""):
                doc_content = nltk.word_tokenize(doc_content)
                doc_content = clear_data(doc_content)
                documents_set = documents_set.append({'document_id':doc_id, 'content':Counter(doc_content)}, ignore_index=True)
            doc_content = ""  
            doc_id = int(line[3:])
            continue

        for pattern in patterns:
            if (line.startswith(pattern)):
                is_pattern = True
                break
                
        if (not is_pattern):
            doc_content += " ".join(line.split('\n'))

In [718]:
documents_set = documents_set.set_index('document_id')
documents_set = pd.DataFrame(documents_set)
documents_set.head()

Unnamed: 0_level_0,content
document_id,Unnamed: 1_level_1
1,"{'experimental': 3, 'investigation': 2, 'of': ..."
2,"{'simple': 3, 'shear': 3, 'flow': 7, 'past': 5..."
3,"{'the': 3, 'boundary': 2, 'layer': 2, 'in': 2,..."
4,"{'approximate': 2, 'solutions': 3, 'of': 5, 't..."
5,"{'one-dimensional': 2, 'transient': 3, 'heat':..."


### Подготавливаем данные для извлечения признаков

Для начала выберем все уникальные слова для всех запросов

In [719]:
unique_query_words = []
unique_query_words = "".join(list(filter(lambda x : type(x)==str, np.array(queries_set['query']))))
unique_query_words = np.unique(nltk.word_tokenize(unique_query_words))

Уберём из множества уникальных слов метасимволы

In [720]:
unique_query_words = clear_data(unique_query_words)
unique_query_words = np.unique(unique_query_words)

Соединим train_set с запросами по ключу query_id, чтобы было легче смотреть не текст запроса. Эту же операцию можно было бы проделать для document_id, не это не работает :(

In [721]:
train_set_res = train_set.join(queries_set, on='query_id', lsuffix="_train", rsuffix="_queries")
train_set_res.head()

Unnamed: 0,query_id,document_id,relevance,query
1,1,184,2,what similarity laws must be obeyed when const...
2,1,29,2,what similarity laws must be obeyed when const...
3,1,31,2,what similarity laws must be obeyed when const...
4,1,12,3,what similarity laws must be obeyed when const...
5,1,51,3,what similarity laws must be obeyed when const...


То же самое проделываем с тестовымы данными

In [722]:
test_set_res = test_set.join(queries_set, on='query_id', lsuffix="_train", rsuffix="_queries")
test_set_res.head()

Unnamed: 0,query_id,document_id,query
1,126,974,what are wind-tunnel corrections for a two-dim...
2,126,1326,what are wind-tunnel corrections for a two-dim...
3,126,187,what are wind-tunnel corrections for a two-dim...
4,126,969,what are wind-tunnel corrections for a two-dim...
5,126,970,what are wind-tunnel corrections for a two-dim...


### Для каждой пары запрос - документ составляем признаки

Функция, которая записывает в файлы строки в нужном формате. А именно, для каждой пары запрос документ, для каждого уникального слова из множества уникальных слов в запросах, если это уникальное слово есть в запросе, то будем присать, сколько раз оно встретилось в данном документе

In [739]:
"""с валидационной выборкой""" 
def write_to_file_with_val(train_file_name, val_file_name, train_set_res, counter=True, train_part=0.8):
    with open(train_file_name, 'w') as features_matrix_train:
        with open(val_file_name, 'w') as features_matrix_valid:
            for row_ind in range(len(train_set_res)):
                line = ""
                
                # считываем id запроса и документа, метрку релевантности, а также текст запроса 
                query_id = train_set_res.iloc[row_ind, 0]
                document_id = train_set_res.iloc[row_ind, 1]
                relevance = train_set_res.iloc[row_ind, 2]
                query_content = train_set_res.iloc[row_ind, 3]
                query_content = nltk.word_tokenize(str(query_content))
                
                # по id документа смотрим, какие слова и сколько раз встречались в этом документе
                document_content_counter = documents_set.iloc[int(document_id), 0]

                line += str(relevance) + ' ' + 'qid:' + str(query_id) + ' '

                # для всех слов из множества уникальных слов смотрим, если 
                # это слово есть в запросе, то смотрим, сколько раз оно встречалось в документе
                # и ставим это число, также можно попробовать вариант ставить 1, если слово также встечалось в документе
                # и 0, если нет
                for ind, word in enumerate(unique_query_words):
                    if (not word in query_content):
                        line += str(ind) + ':' + '0' + ' '
                        continue
                    if (counter):
                        line += str(ind) + ':' + str(document_content_counter[word]) + ' '
                    else:
                        if (word in document_content_counter):
                            line += str(ind) + ':' + str(1) + ' '
                        else:
                            line += str(ind) + ':' + str(0) + ' '

                line += '\n'

                if (row_ind < round(len(train_set_res)*train_part)):
                    features_matrix_train.write(line)
                else:
                    features_matrix_valid.write(line)

In [740]:
"""делает то же самое, что и функция выше, но без валидации, а также ещё проверяет, 
что для document_id есть документ"""
def write_to_file(file_name, data, counter=True, train=True):
    with open(file_name, 'w') as features_matrix:
        for row_ind in range(len(data)):
            line = ""
            query_id = data.iloc[row_ind, 0]
            document_id = data.iloc[row_ind, 1]
            
            if (train):
                relevance = train_set_res.iloc[row_ind, 2]
                line += str(relevance) + ' ' + 'qid:' + str(query_id) + ' '
                
                query_content = data.iloc[row_ind, 3]

            else:
                line += '-1' + ' ' + 'qid:'+str(query_id) + ' '
                query_content = data.iloc[row_ind, 2]

            query_content = nltk.word_tokenize(str(query_content))

            try:
                document_content_counter = documents_set.iloc[int(document_id), 0]
            except IndexError:
                for ind in range(len(unique_query_words)):
                    line += str(ind) + ':' + '0' + ' '
                line += '\n'
                features_matrix.write(line)
                continue

            for ind, word in enumerate(unique_query_words):
                if (not word in query_content):
                    line += str(ind) + ':' + '0' + ' '
                    continue
                if (counter):
                        line += str(ind) + ':' + str(document_content_counter[word]) + ' '
                else:
                    if (word in document_content_counter):
                        line += str(ind) + ':' + str(1) + ' '
                    else:
                        line += str(ind) + ':' + str(0) + ' '

            line += '\n'
            features_matrix.write(line)

In [741]:
# with open('train3.txt', 'w') as features_matrix_train:
#     for row_ind in range(len(train_set_res)):
#         line = ""
#         query_id = train_set_res.iloc[row_ind, :][0]
#         document_id = train_set_res.iloc[row_ind, :][1]
#         relevance = train_set_res.iloc[row_ind, :][2]
#         query_content = train_set_res.iloc[row_ind, :][3]
#         query_content = nltk.word_tokenize(str(query_content))
#         document_content_counter = documents_set.iloc[int(document_id), :][0]

#         line += str(relevance) + ' ' + 'qid:' + str(query_id) + ' '

#         for ind, word in enumerate(unique_query_words):
#             if (not word in query_content):
#                 line += str(ind) + ':' + '0' + ' '
#                 continue
#             line += str(ind) + ':' + str(document_content_counter[word]) + ' '

#         line += '\n'

#         features_matrix_train.write(line)

In [742]:
# with open('test1.txt', 'w') as features_matrix:
# #     i = 0
#     for row_ind in range(len(test_set_res)):
#         line = ""
#         query_id = test_set_res.iloc[row_ind, :][0]
#         document_id = test_set_res.iloc[row_ind, :][1]
#         query_content = test_set_res.iloc[row_ind, :][2]
#         query_content = nltk.word_tokenize(str(query_content))
#         line += '-1' + ' ' + 'qid:'+str(query_id) + ' '
# #         i += 1
    
#         try:
#             document_content_counter = documents_set.iloc[int(document_id), :][0]
#         except IndexError:
# #             print(i, query_id, document_id)
#             for ind in range(len(unique_query_words)):
#                 line += str(ind) + ':' + '0' + ' '
#             line += '\n'
#             features_matrix.write(line)
#             continue

#         for ind, word in enumerate(unique_query_words):
#             if (not word in query_content):
#                 line += str(ind) + ':' + '0' + ' '
#                 continue
#             line += str(ind) + ':' + str(document_content_counter[word]) + ' '

#         line += '\n'
#         features_matrix.write(line)

In [743]:
write_to_file_with_val('train_with_counter.txt', 'val_with_counter.txt', train_set_res)

write_to_file_with_val('train_without_counter.txt', 'val_without_counter.txt', train_set_res, counter=False)

### Применяем алгорим LambdaMART к подготовленным данным в нужном формате

Попробуем с Counter

In [744]:
%%time
! java -jar RankLib-2.1-patched.jar -train train_with_counter.txt -validate val_with_counter.txt -ranker 6 -metric2t NDCG@5 -save LambdaMART_with_counter > LambdaMART_with_counter.txt

Wall time: 11.7 s


Теперь без Counter

In [745]:
%%time
! java -jar RankLib-2.1-patched.jar -train train_without_counter.txt -validate val_without_counter.txt -ranker 6 -metric2t NDCG@5 -save LambdaMART_without_counter > LambdaMART_without_counter.txt

Wall time: 11.4 s


С Counter работает лучше на train, но хуже на validation. Посмотрим на результаты моделей без валидации

In [750]:
write_to_file('train_with_counter_without_val.txt', train_set_res, train=True, counter=True)
write_to_file('test_with_counter_without_val.txt', test_set_res, train=False, counter=True)

write_to_file('train_without_counter_without_val.txt', train_set_res, counter=False, train=True)
write_to_file('test_without_counter_without_val.txt', test_set_res, counter=False, train=False)

In [747]:
%%time
! java -jar RankLib-2.1-patched.jar -train train_with_counter_without_val.txt -ranker 6 -tree 400 -metric2t NDCG@5 -save LambdaMART_with_counter_without_val > LambdaMART_with_counter_without_val.txt

Wall time: 1min 18s


In [748]:
%%time
! java -jar RankLib-2.1-patched.jar -train train_without_counter_without_val.txt -ranker 6 -tree 400 -metric2t NDCG@5 -save LambdaMART_without_counter_without_val > LambdaMART_without_counter_without_val.txt

Wall time: 1min 21s


### Делаем submission

Для метода без Counter

In [751]:
!java -jar RankLib-2.1-patched.jar -load LambdaMART_without_counter_without_val -rank test_without_counter_without_val.txt -score score_without_counter_without_val.txt


[+] General Parameters:
Model file:	LambdaMART_without_counter_without_val
Feature normalization: No
Model:		LambdaMART

Reading feature file [test_without_counter_without_val.txt]: 0... 
Reading feature file [test_without_counter_without_val.txt]... [Done.]            
(100 ranked lists, 847 entries read)


In [755]:
test_set_res.index = range(len(test_rel))
test_set_res.head()

Unnamed: 0,query_id,document_id,query
0,126,974,what are wind-tunnel corrections for a two-dim...
1,126,1326,what are wind-tunnel corrections for a two-dim...
2,126,187,what are wind-tunnel corrections for a two-dim...
3,126,969,what are wind-tunnel corrections for a two-dim...
4,126,970,what are wind-tunnel corrections for a two-dim...


In [752]:
score = pd.read_csv('score_without_counter_without_val.txt', sep='\t', header=None)
score = score.drop(1, axis=1)
score.head()

Unnamed: 0,0,2
0,126,-2.638955
1,126,-2.638955
2,126,-3.307772
3,126,-2.210904
4,126,-2.635123


In [757]:
score['3'] = test_set_res['document_id']
score.head()

Unnamed: 0,0,2,3
0,126,-2.638955,974
1,126,-2.638955,1326
2,126,-3.307772,187
3,126,-2.210904,969
4,126,-2.635123,970


In [763]:
output = score.groupby(0).apply(lambda x: x.sort_values([2],ascending=False).head(len(score)))
output.index = range(len(output))
output = pd.DataFrame(output)
output = output.drop(2, axis=1)
output.columns = ['QueryId','DocumentId']
output.head(3)

Unnamed: 0,QueryId,DocumentId
0,126,971
1,126,969
2,126,970


In [759]:
output.to_csv('out_without_counter_without_val.csv', index=None)

То же самое с Counter

In [764]:
!java -jar RankLib-2.1-patched.jar -load LambdaMART_with_counter_without_val -rank test_with_counter_without_val.txt -score score_with_counter_without_val.txt


[+] General Parameters:
Model file:	LambdaMART_with_counter_without_val
Feature normalization: No
Model:		LambdaMART

Reading feature file [test_with_counter_without_val.txt]: 0... 
Reading feature file [test_with_counter_without_val.txt]... [Done.]            
(100 ranked lists, 847 entries read)


In [766]:
score = pd.read_csv('score_with_counter_without_val.txt', sep='\t', header=None)
score = score.drop(1, axis=1)
score.head()

Unnamed: 0,0,2
0,126,-4.30205
1,126,-3.933957
2,126,-4.150296
3,126,-1.092625
4,126,-3.268303


In [767]:
len(score) == len(test_set_res)

True

In [768]:
score['3'] = test_set_res['document_id']
score.head()

Unnamed: 0,0,2,3
0,126,-4.30205,974
1,126,-3.933957,1326
2,126,-4.150296,187
3,126,-1.092625,969
4,126,-3.268303,970


In [769]:
output = score.groupby(0).apply(lambda x: x.sort_values([2],ascending=False).head(len(score)))
output.index = range(len(output))
output = pd.DataFrame(output)
output = output.drop(2, axis=1)
output.columns = ['QueryId','DocumentId']
output.head(3)

Unnamed: 0,QueryId,DocumentId
0,126,969
1,126,971
2,126,942


In [770]:
output.to_csv('out_with_counter_without_val.csv', index=None)