In [1]:
import pandas as pd
import tqdm
import os
import re
import lightgbm as lgbm
import scipy
import numpy as np
import pickle

In [2]:
# Input CSV that is read into X_test.
DATA_FILE = "../test.csv"
# Output CSV that receives the assembled answers.
PREDICTION_FILE = "../pred.csv"

In [3]:
def normalize_answer(text):
    """Return `text` as lower-cased word tokens separated by single spaces.

    Tokens are the maximal runs matched by ``\\w+``; everything between them
    (punctuation, whitespace) is dropped.
    """
    tokens = re.findall(r"\w+", text)
    joined = ' '.join(tokens)
    return joined.lower()


def sentence_to_word(sentences):
    """Turn each sentence into its list of normalized words.

    Every sentence is passed through `normalize_answer` and then split on
    whitespace; the result has one word list per input sentence, in order.
    """
    return [normalize_answer(one_sentence).split() for one_sentence in sentences]


def text_to_sentence(text):
    """Split `text` on '.' and return the non-empty, stripped pieces in order."""
    stripped_pieces = (piece.strip() for piece in text.split("."))
    return [piece for piece in stripped_pieces if piece]


def get_max_match_sentance(data_row):
    """Find the paragraph sentence sharing the most words with the question.

    Parameters
    ----------
    data_row : mapping
        Must provide ``"paragraph"`` and ``"question"`` as strings.

    Returns
    -------
    tuple
        ``(sentence, leftover_words)`` where ``sentence`` is the raw
        (un-normalized) best-matching sentence and ``leftover_words`` is the
        set of its normalized words that do NOT occur in the question.
        When the paragraph contains no sentences at all (empty text or only
        dots) the result is ``("", set())`` instead of a ``TypeError``.

    Ties are resolved in favour of the earliest sentence.
    """
    sentences = text_to_sentence(data_row["paragraph"])
    question_words = set(sentence_to_word([data_row["question"]])[0])

    # Nothing to match against: previously this indexed a list with None.
    if not sentences:
        return "", set()

    sentence_word_sets = [set(words) for words in sentence_to_word(sentences)]
    # Compute each overlap once; it is needed both for ranking and for the
    # leftover-word set of the winner.
    overlaps = [words & question_words for words in sentence_word_sets]

    # max() returns the first maximal element, matching the original
    # "strictly greater than" update rule.
    best_id = max(range(len(sentences)), key=lambda idx: len(overlaps[idx]))

    part_sent = sentence_word_sets[best_id] - overlaps[best_id]
    return sentences[best_id], part_sent


In [4]:
# Load the raw input table from DATA_FILE.
X_test = pd.read_csv(filepath_or_buffer=DATA_FILE)


In [5]:
# Coerce both text columns to str so the later string methods
# (.split, .lower, len) can be applied to every row.
for text_column in ('paragraph', 'question'):
    X_test[text_column] = X_test[text_column].apply(str)

In [6]:
# Baseline answer per row: the sentence with the largest word overlap with
# the question ('full_t_pred') and that sentence's words that are not in the
# question, space-joined ('t_pred').
#
# Values are collected in plain lists and assigned once. The previous version
# seeded 't_pred' with the integer 0 and then wrote strings into it row by row
# via .loc, which forces a dtype upcast on an int column (deprecated in recent
# pandas) and is slow for tens of thousands of rows.
t_pred_values = []
full_t_pred_values = []
for _, data_row in tqdm.tqdm(X_test.iterrows(), total=len(X_test)):
    full_sentance, part_sentance = get_max_match_sentance(data_row)
    t_pred_values.append(' '.join(part_sentance))
    full_t_pred_values.append(str(full_sentance))

X_test['t_pred'] = t_pred_values
X_test['full_t_pred'] = full_t_pred_values


100%|██████████| 40000/40000 [02:43<00:00, 245.10it/s]


In [7]:
# Character counts, taken while the columns still hold the raw strings.
for text_column in ('paragraph', 'question'):
    X_test[text_column + '_len_ch'] = X_test[text_column].apply(len)

# Replace each text with its list of lower-cased, whitespace-separated tokens.
for text_column in ('paragraph', 'question'):
    X_test[text_column] = X_test[text_column].apply(lambda text: text.lower().split())

# Token counts of the lists produced above.
for text_column in ('paragraph', 'question'):
    X_test[text_column + '_len'] = X_test[text_column].apply(len)

# Number of question_id entries per paragraph_id, joined back onto every row.
tmp = X_test.groupby('paragraph_id')['question_id'].count().reset_index()
tmp.columns = ['paragraph_id', 'paragraph_freq']
X_test = X_test.merge(tmp, on=['paragraph_id'], how='left')

In [8]:
rows = []
# Explode the tokenized paragraphs: one [paragraph_id, question_id, word]
# record per paragraph token, in row order and then token order.
# Built directly as a list instead of appending from inside
# DataFrame.apply(axis=1), which constructed a Series per row and a
# throwaway result of None-lists only for the side effect.
rows = [
    [paragraph_id, question_id, token]
    for paragraph_id, question_id, tokens in zip(
        X_test['paragraph_id'], X_test['question_id'], X_test['paragraph']
    )
    for token in tokens
]

In [10]:
# One row per paragraph token; the per-question columns are re-attached by
# joining on the (paragraph_id, question_id) pair.
tmp = pd.DataFrame(rows)
tmp.columns = ['paragraph_id', 'question_id', 'word']
X_test = tmp.merge(X_test, on=['paragraph_id', 'question_id'], how='left')

In [11]:
def f(x, y):
    """Return the result of ``x in y``.

    NOTE(review): when `y` is a string this is a substring test, not a
    whole-word test — confirm that is what the trained model expects.
    """
    is_contained = x in y
    return is_contained

In [12]:
# Flag, per token row, whether the token is contained in the baseline
# answer ('t_pred') and in the best-matching sentence ('full_t_pred').
for feature_name, source_column in (('in_t_pred', 't_pred'),
                                    ('in_full_t_pred', 'full_t_pred')):
    X_test[feature_name] = X_test.apply(
        lambda row, source_column=source_column: f(row['word'], row[source_column]),
        axis=1,
    )

In [13]:
# Keep the identifier and token columns aside (they are needed to assemble
# the answers later) while removing them from the frame.
X_paragraph_id = X_test.pop('paragraph_id')
X_question_id = X_test.pop('question_id')
word = X_test.pop('word')

# Remove the remaining text columns so only the feature columns are left.
for text_column in ('paragraph', 'question', 't_pred', 'full_t_pred'):
    del X_test[text_column]

In [14]:
# Preview the first four rows of the feature frame.
X_test.head(4)

Unnamed: 0,paragraph_len_ch,question_len_ch,paragraph_len,question_len,paragraph_freq,in_t_pred,in_full_t_pred
0,552,64,82,8,5,False,False
1,552,64,82,8,5,False,True
2,552,64,82,8,5,False,True
3,552,64,82,8,5,False,True


In [15]:
# Load the pickled classifier.
# WARNING: pickle.load can execute arbitrary code embedded in the file; only
# load 'lgb.pickle' if it comes from a trusted source.
# The handle is deliberately not called `f`: that name is the membership
# helper defined earlier in this notebook, and rebinding it here would break
# any later re-run of the feature cell.
with open('lgb.pickle', 'rb') as model_file:
    lgb = pickle.load(model_file)

In [16]:
# Keep only column 1 of the predicted class probabilities.
class_probabilities = lgb.predict_proba(X_test)
pred_test = class_probabilities[:, 1]

In [17]:
# Per-token predictions: the raw score, whether it clears the 0.1 cut-off,
# and the identifiers set aside before the feature columns were isolated.
tmp = pd.DataFrame({'bool_pred': pred_test > 0.1, 'pred': pred_test})
tmp['paragraph_id'] = X_paragraph_id
tmp['question_id'] = X_question_id

In [21]:
# Each token gets a trailing space so that string concatenation yields a
# space-separated answer (the final answer therefore ends with a space).
tmp['word'] = word + ' '

# Concatenate, per (paragraph_id, question_id), the tokens that were selected.
# NOTE(review): a question with no selected token gets no row in `result` —
# confirm that missing rows are acceptable in the prediction file.
selected_tokens = tmp[tmp.bool_pred]
result = (
    selected_tokens
    .groupby(['paragraph_id', 'question_id'])['word']
    .apply(lambda words: words.sum())
    .reset_index()
)
result.columns = ['paragraph_id', 'question_id', 'answer']

In [28]:
# Write the answers with a header row and without the index column
# (header=True is the to_csv default).
result.to_csv(PREDICTION_FILE, index=False)

In [29]:
# Read the written file back and display it.
pd.read_csv(filepath_or_buffer=PREDICTION_FILE)

Unnamed: 0,paragraph_id,question_id,answer
0,4,38010,и в и в и в в и в в и для манитобы характерен ...
1,4,39058,по данным министерства окружающей среды и экол...
2,4,46148,по данным министерства окружающей среды и экол...
3,4,54480,по и манитоба занимает первое место по количес...
4,6,7569,по прибытии в италию увлёкся жанровой живопись...
5,6,33361,в и с и стал работой в этом жанре стала италья...
6,6,40187,в и с и в в и а опх и ни ни опх с в в 1829 год...
7,6,46399,прибытии в увлёкся жанровой живописью и наряду...
8,6,69892,по в и с на и на эту в увидев её в современник...
9,9,21639,в в 19 азербайджанский и и и
