In [67]:
import pandas as pd
import re
import numpy as np

from functools import reduce
import nltk

In [29]:
import stop_words

In [60]:
import gensim
from gensim.models.doc2vec import LabeledSentence

In [181]:
import math

In [177]:
import random

## for cleanup

In [28]:
ps = nltk.stem.snowball.SnowballStemmer('english')

In [30]:
sw = stop_words.get_stop_words('english')

In [39]:
regex = re.compile('[^a-zA-Z]')
#First parameter is the replacement, second parameter is your input string
regex.sub('', 'ab3d*E')

'abdE'

In [93]:
def clean_query(tq):
    result = " "
    t = regex.sub(' ',tq)
    for w in t.split():
        if w not in sw:
            result += ps.stem(w.replace(r'[^a-zA-Z]', '')) + " "
    return result

In [95]:
def prepare_query(tq):
    result = " "
    t = regex.sub(' ',tq)
    for w in t.split():
        if w not in sw:
            result += ps.stem(w.replace(r'[^a-zA-Z]', '')) + " "
    return result.split()

## prepare data

In [33]:
class Doc:
    def __init__(self, title, author, text_lines):
        self.title = title
        self.author=author
        self.text = flat(list(map(lambda l: regex.sub(' ',l).split(), text_lines)))

In [38]:
def flat(list_of_list):
    result = []
    for li in list_of_list:
        for el in li:
            result.append(el)
            
    result = list(filter(lambda l:len(line) > 1, result))
    
    return result

In [36]:
def parse_article_lines(lines):
    text_index = 0
    
    l_s_i = 0
    l_e_i = 0 # label_{start-end}_index
    
    for i,l in enumerate(lines):
        if '.T' in l:
            l_s_i = i+1
        
        if '.A' in l:
            l_e_i = i
        
        if '.W' in l:
            text_index = i+1
    
    author = lines[l_e_i+1]
    
    return Doc(lines[l_s_i:l_e_i], author, lines[text_index:len(lines)])

In [237]:
article_start = 0
article_lines = []

docs = {}

lines = []
with open('Documents.csv') as file:
    lines = file.readlines()

for i,line in enumerate(lines):
    if 'Id' in line and len(line) < 12 and article_start != i:
        article_lines = lines[article_start:i]
        article_start = i
        
        q_id = line.split()[1]
        docs[q_id] = parse_article_lines(article_lines)
    

In [44]:
rel_train = pd.read_csv('relevance_train.csv', sep=',', lineterminator='\n')
rel_test = pd.read_csv('relevance_test.csv')
queries = pd.read_csv('queries.csv', sep='|',lineterminator='\n')

In [61]:
def doc2labeled_sentence(doc):
    return LabeledSentence(words=doc.text, tags=doc.title)

In [63]:
ls = list(map(doc2labeled_sentence, docs.values()))

## now prepare model

In [353]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [80]:
model = gensim.models.Word2Vec(min_count=1,workers=4, hs=1, negative=0)

In [189]:
SIZE = 50

In [190]:
model = gensim.models.Doc2Vec(size=SIZE, min_count=1, workers=4, hs=1, negative=0)

In [191]:
model.build_vocab(ls)

In [192]:
kw = model.wv

## fix that, i suppose should be some library method or somehow fix vocaulary

In [291]:
def get_sen_vector(sen):
    v = np.zeros(SIZE)
    for w in sen:
        try:
            v = np.add(v, kw[w])
        except ValueError as e:
            pass
        except KeyError as e:
            pass
        
    return v / SIZE

## prepare now pairs doc - request

In [194]:
vectors = []
labels = []

for row in rel_train.iterrows():
    qid = row[1][0]
    did = row[1][1]
    rel = row[1][2]
    
    if str(did) in docs and qid in queries['QueryId']:
            labels.append(rel)
            q_v = get_sen_vector(queries[queries['QueryId']==qid])
            d_v = get_sen_vector(docs[str(did)].text)
            vectors.append(np.hstack([d_v, q_v]))
        
vectors = np.array(vectors)
labels = np.array(labels).reshape(len(labels),1)

## metrics

In [196]:
def dcg(rels,p):
    s = 0.0
    for i in range(0,p):
        rel = rels[i]
        s += (pow(rel,2)-1) / (math.log(i+1,2)+1)
        
    return s

In [197]:
def ndcg(ideal, rel, p):
    return dcg(rel,p) / dcg(ideal,len(ideal))

## prepare classification model

In [166]:
import xgboost as xgb
from sklearn.model_selection import train_test_split



In [198]:
trainX, testX, trainY, testY = train_test_split(vectors, labels)

In [199]:
dtrain = xgb.DMatrix(trainX, trainY)
dtest = xgb.DMatrix(testX, testY)

In [200]:

param = {'silent':1, 'objective':'reg:linear' }
param['nthread'] = 8
param['eval_metric'] = 'ndcg'
param['eta'] = 0.075
param['max_depth'] = 5
param['silent'] = 1
param['n_estimators'] = 1000
param['early_stopping_rounds'] = 30

watchlist = [ (dtrain,'train'), (dtest, 'test') ]

In [201]:
bst = xgb.train(param, dtrain, 500, watchlist,verbose_eval=100)

[0]	train-:0.543997	test-:0.540333
[100]	train-:0.496985	test-:0.490111
[200]	train-:0.495376	test-:0.487092
[300]	train-:0.495098	test-:0.482758
[400]	train-:0.494968	test-:0.488049


In [202]:
pred = bst.predict(dtest)

In [207]:
ndcg(testY, pred, 20)

array([ 0.12684762])

## linear model

In [209]:
from sklearn.linear_model import LinearRegression

In [210]:
lr = LinearRegression()
lr.fit(trainX, trainY)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [211]:
pred = lr.predict(testX)

In [212]:
from sklearn.metrics import mean_squared_error

In [213]:
mean_squared_error(pred, testY)

2.5751783252238747

In [214]:
ndcg(testY, pred,5)

array([ 0.05126464])

## preparing results

In [218]:
from sklearn.metrics.pairwise import cosine_similarity

In [359]:
def get_text_from_query_by(i):
    print(i)
    text = queries[queries['QueryId']==i].values[0][1]
    return clean_query(text)

In [418]:
def to_nparray(res):
    result = np.zeros((len(res), 2))
    for i,row in enumerate(res):
        result[i,0] = row[1]
        result[i,1] = row[2]
    return result

In [417]:
def rank_block_by_dist(q_ids, d_ids):
    sim = []
    for q_id, d_id in zip(q_ids, d_ids):
        dv = get_sen_vector(docs[str(d_id)].text)
        qv = get_sen_vector(get_text_from_query_by(q_id))
        sim.append(cosine_similarity(dv,qv))
        
    return to_nparray(sorted(zip(sim, q_ids, d_ids), key=lambda key: key[0]))

In [416]:
def rank_block_by_pred(q_ids, d_ids):
    result = []
    for q_id, d_id in zip(q_ids, d_ids):
        d_vec = get_sen_vector(docs[str(d_id)].text)
        q_vec = get_sen_vector(get_text_from_query_by(q_id))
        vec = np.hstack([d_vec, q_vec]).reshape(1,SIZE * 2)
        m = xgb.DMatrix(vec)
        pred = bst.predict(m)
        result.append(pred)
        
    return to_nparray(sorted(zip(result, q_ids, d_ids), key=lambda key: key[0]))

## example for ranking in case of query absense in train

In [365]:
tq = rel_test.iloc[0:9,0].values
td = rel_test.iloc[0:9,1].values
r2 = rank_block_by_pred(tq,td)

126
126
126
126
126
126
126
126
126


In [474]:
prev_q = -1
prev_i = -1
cnt = 0

result = np.zeros((len(rel_test),2))

for i,row in rel_test.iterrows():
    q_id = str(row['QueryId'])
    d_id = str(row['DocumentId'])
    
    if prev_q == -1:
        prev_q = q_id
        prev_i = i
        
    if prev_q != q_id:
        if int(prev_q) in queries['QueryId']:
            print('ranking by dist')
            r = rank_block_by_dist(rel_test.iloc[prev_i:i,0], rel_test.iloc[prev_i:i,1])
            for row in r:
                result[cnt,0] = row[0]
                result[cnt,1] = row[1]
                cnt += 1
        else:
            print('simple rewirte')
            for j in range(prev_i, i):
                result[j,0] = rel_test.iloc[j,0]
                result[j,1] = rel_test.iloc[j,1]
                cnt += 1
                
        prev_q = q_id
        prev_i = i

ranking by dist
126
126
126
126
126
126
126
126
126
ranking by dist
127




IndexError: index 0 is out of bounds for axis 0 with size 0

In [471]:
result.shape

(847, 2)

In [472]:
pd.DataFrame(result, dtype=np.uint8)

Unnamed: 0,0,1
0,126,174
1,126,203
2,126,202
3,126,46
4,126,205
5,126,206
6,126,187
7,126,204
8,0,0
9,0,0


In [171]:
res_out = np.zeros(shape=(len(test_q), 2),dtype=np.uint8)
p_q = test_q[0]
p_i = 0
cnt=0
for i,q in enumerate(test_q):
    if p_q != q or i == len(test_q)-1:
        tup = zip(result[p_i:i], test_q[p_i:i], test_d[p_i:i])
        out = sorted(tup, key=lambda e:e[0])
        for r,q,d in out:
            res_out[cnt,0] = int(q)
            res_out[cnt,1]=int(d)
            cnt += 1
        p_i = i
        p_q = q

In [173]:
output = pd.DataFrame(res_out,columns=['QueryId', 'DocumentId'])

In [176]:
output.to_csv(path_or_buf='pred.csv', columns=('QueryId','DocumentId'), header=('QueryId','DocumentId'), index=False)