In [244]:
import numpy as np
import pandas as pd
import os
import json
import re
import requests
from pymystem3 import Mystem
from gensim.models import FastText, Word2Vec
from bs4 import BeautifulSoup
from inscriptis import get_text
from operator import itemgetter
from tqdm import tqdm_notebook
from six import iteritems
from nltk.corpus import stopwords 
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

In [96]:
# Directory with the raw document files; IndexBuild reads the URL from the first line of each file.
path_data = './data/data'
# Directory where DocIndex.save() writes one JSON index file per document id.
path_index = './data/data_index'

### Utils

In [97]:
def speller(q, timeout=10):
    """Spell-correct a query with the Yandex.Speller ``checkText`` service.

    Args:
        q: Query text.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises; keeps a stalled connection from blocking the notebook.

    Returns:
        ``q`` unchanged when the service reports nothing, or when its only
        report spans a whole multi-word query. Otherwise the tokens of ``q``
        joined by single spaces, with every token that the service flagged
        (and offered a suggestion for) replaced by the first suggestion.

    Raises:
        Whatever ``requests`` raises on network failure, and ``ValueError``
        when the response body is not JSON.
    """
    # Record the real character offset of every token. The service reports
    # positions inside the text it was sent, so offsets must account for
    # leading or repeated whitespace.
    tokens = []
    offsets = []
    for match in re.finditer(r'\S+', q):
        tokens.append(match.group())
        offsets.append(match.start())

    # `params=` percent-encodes the query, so '&', '#', '+' etc. survive.
    response = requests.get(
        'https://speller.yandex.net/services/spellservice.json/checkText',
        params={'text': q},
        timeout=timeout,
    ).json()

    if not response:
        return q

    # A single report covering the entire multi-word query is ignored.
    if len(tokens) > 1 and len(response) == 1 and response[0]['len'] == len(q):
        return q

    token_at = {offset: index for index, offset in enumerate(offsets)}
    for fix in response:
        index = token_at.get(fix['pos'])
        suggestions = fix.get('s') or []
        # Reports that do not start on a token boundary, or that carry no
        # suggestion, leave the original token in place.
        if index is not None and suggestions:
            tokens[index] = suggestions[0]

    return ' '.join(tokens)

In [98]:
class Synonyms:
    """Cached synonym lookup backed by the Yandex.Dictionary ``lookup`` API.

    Attributes:
        cache: Maps every word already looked up to its result list.
        api_key: Key sent with each request.
        timeout: Seconds to wait for each HTTP response.
    """

    API_URL = 'https://dictionary.yandex.net/api/v1/dicservice.json/lookup'
    # NOTE(review): this credential is committed with the notebook. It is kept
    # only as a fallback so existing runs keep working; prefer passing
    # `api_key` or setting the YANDEX_DICT_KEY environment variable.
    DEFAULT_KEY = 'dict.1.1.20190324T123533Z.62c2a8f7b72801a9.3260c2a87968032222ed684baa0b0e9679ef58f0'

    def __init__(self, api_key=None, timeout=10):
        self.cache = dict()
        self.api_key = api_key or os.environ.get('YANDEX_DICT_KEY', self.DEFAULT_KEY)
        self.timeout = timeout

    def get_syn(self, word):
        """Return the synonym list for ``word``, querying the API on a cache miss.

        The list starts with the text of the first translation entry (when it
        has one). If some translation entry carries a ``syn`` list, the texts
        of the first such entry are appended and the result is reduced to
        single-word items, at most five. A response without usable entries
        yields an empty list. The result is stored in ``self.cache``.
        """
        if word in self.cache:
            return self.cache[word]

        # `params=` percent-encodes the word and the key.
        response = requests.get(
            self.API_URL,
            params={'lang': 'ru-ru', 'text': word, 'key': self.api_key},
            timeout=self.timeout,
        ).json()

        result = []
        definitions = response.get('def') if isinstance(response, dict) else None
        # A definition may come without translations; treat that as "no synonyms".
        translations = definitions[0].get('tr') or [] if definitions else []

        if translations:
            first = translations[0]
            if 'text' in first:
                result.append(first['text'])

            # First translation entry that actually lists synonyms, if any.
            with_syn = next((tr for tr in translations if 'syn' in tr), None)
            if with_syn is not None:
                result = result + [w['text'] for w in with_syn['syn']]
                # NOTE(review): the single-word / top-5 filter is only applied
                # on this branch, exactly as before; confirm that is intended.
                result = [w for w in result if len(w.split(' ')) == 1][:5]

        self.cache[word] = result
        return result
        
# synonyms = Synonyms()

In [124]:
# with open(os.path.join('./_synonyms.json'), 'w') as idf:
#     idf.write(json.dumps(synonyms.cache))

In [100]:
class Lemmatizer:
    """Memoising front-end for ``Mystem``: each distinct word is analysed once."""

    def __init__(self):
        self.cache = {}
        self.morph = Mystem()

    def lemmatize(self, word):
        """Return the first item ``Mystem.lemmatize`` yields for ``word``.

        The answer is remembered in ``self.cache`` and served from there on
        later calls with the same word.
        """
        try:
            return self.cache[word]
        except KeyError:
            pass

        lemma = self.morph.lemmatize(word)[0]
        self.cache[word] = lemma
        return lemma

# Shared lemmatizer instance; normalizer() calls get_lemma.lemmatize(word) for every token.
get_lemma = Lemmatizer()

In [101]:
# Original "digit, whitespace, digit" pattern. normalizer() no longer uses it
# (see regex_digit_gap); it is kept so any other code referring to it still works.
regex_num = re.compile(r'([\d])[\s]+([\d])')
# Whitespace standing between two digits. The lookarounds leave the digits out
# of the match, so every gap in a chain such as "1 000 000" is found, not just
# the first one.
regex_digit_gap = re.compile(r'(?<=\d)\s+(?=\d)')
regex_punct = re.compile('[%s]' % re.escape('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}—~–«»-•©№…'))
stopwords_list = stopwords.words('russian') + \
                 stopwords.words('english') + \
                 ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у',
                  'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 
                  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# Constant-time membership test for normalizer(). This is a snapshot taken
# here: later changes to stopwords_list are not reflected.
_stopwords_set = frozenset(stopwords_list)


def normalizer(text):
    """Turn raw text into a list of lemmatized, stop-word-free terms.

    Steps: punctuation characters become spaces, whitespace between digits is
    removed (so "1 000 000" becomes "1000000"), the text is lower-cased, split
    on whitespace, each token is lemmatized through ``get_lemma`` and tokens
    whose lemma is a stop word are dropped.

    Args:
        text: Input string.

    Returns:
        List of lemmas in their original order (duplicates preserved).
    """
    text = regex_punct.sub(' ', text)
    text = regex_digit_gap.sub('', text)
    text = text.lower()

    terms = []
    for word in text.split():
        lemma = get_lemma.lemmatize(word)
        if lemma not in _stopwords_set:
            terms.append(lemma)

    return terms

In [102]:
# Per-zone multipliers that get_frequency() applies to term counts.
# parse() searches the document for a tag named after every key; the 'text'
# entry is then filled with the terms of the whole document text.
weights = { 
    'title': 5, 
    'h1': 4, 
    'h2': 2.5,
    'h3': 2,
    'h4': 1.5,
    'b' : 1.5, 
    'strong': 1.5, 
    'text': 1
}

def parse(path_file):
    """Split an HTML file into normalized term lists, one per zone of ``weights``.

    Args:
        path_file: Path of the file to read.

    Returns:
        Dict mapping zone name to a list of terms. ``'text'`` holds the terms
        of the whole document text; every other zone holds the terms of the
        tags with that name. Zones that were not reached because tag
        extraction failed are absent. ``None`` when the document text itself
        cannot be extracted.

    Raises:
        Whatever ``open`` or ``BeautifulSoup`` raise while reading the file.
    """
    with open(path_file) as opened_file:
        soup = BeautifulSoup(opened_file, 'html.parser')

    zones = {}
    try:
        for zone in weights:
            zones[zone] = []
            if zone == 'text':
                # Pseudo-zone: filled from the full document text below.
                continue
            for item in soup.find_all(zone):
                # `.string` is None for tags with several children; those are skipped.
                if item.string:
                    zones[zone].extend(normalizer(item.string))
    except Exception:
        # Tag zones are best effort: keep whatever was collected and still
        # index the plain text below.
        pass

    try:
        zones['text'] = normalizer(get_text(soup.prettify()))
    except Exception:
        # Without the body text the document cannot be indexed at all.
        return None

    return zones

In [103]:
# url = './data/data/doc.06052.dat'
# extract = parse(url)
# extract

In [104]:
def get_frequency(extract):
    """Collapse per-zone term lists into one weighted frequency per term.

    For every zone the occurrences of each term are counted and multiplied by
    ``weights[zone]``; the products for the same term are then added up across
    zones.

    Args:
        extract: Dict mapping zone name to a list of terms.

    Returns:
        Dict mapping term to its summed weighted frequency.
    """
    # Pass 1: occurrences of every term inside each zone.
    per_zone = {}
    for zone, zone_terms in extract.items():
        counts = per_zone[zone] = {}
        for term in zone_terms:
            counts[term] = counts.get(term, 0) + 1

    # Pass 2: scale each count by its zone weight and fold the zones together.
    combined = {}
    for zone, counts in per_zone.items():
        for term, count in counts.items():
            combined[term] = combined.get(term, 0) + count * weights[zone]

    return combined

### Словари

In [105]:
def spl_queries(src_path='./data/queries.numerate.txt',
                dst_path='./data/queries.numerate.spl.txt'):
    """Write a spell-corrected copy of the numbered query file.

    Every non-blank input line is expected to be ``<query_id>\\t<query>``.
    The query is passed through ``speller``; when that call fails for any
    reason the original query text is written instead, so one bad request
    does not abort the whole run.

    Args:
        src_path: File with the original queries.
        dst_path: File that receives ``<query_id>\\t<corrected query>`` lines.

    Raises:
        ValueError: A non-blank line contains no tab separator.
    """
    with open(src_path) as src, open(dst_path, 'w') as dst:
        for line in src:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no query.
                continue
            q_id, q = line.split('\t', 1)
            try:
                corrected = speller(q)
            except Exception:
                # Best effort: keep the query as typed.
                corrected = q
            dst.write(q_id + '\t' + corrected + '\n')

# spl_queries()

In [106]:
def get_url(text):
    """Read one line from the open stream ``text`` and return it stripped of surrounding whitespace."""
    first_line = text.readline()
    return first_line.strip()

# Lookup tables built from the numbered URL list:
#   hashUrl_docId: hash(url) -> document id
#   docId_urls:    document id -> url
# NOTE(review): hash() of a str is salted per interpreter run, so these keys
# are only meaningful inside the current kernel session; two URLs that collide
# on hash() would silently overwrite each other here.
hashUrl_docId = {}        
docId_urls = {}
with open('./data/urls.numerate.txt') as item:
    for line in item:
        # Each line must split into exactly two tab-separated fields: id and URL.
        doc_id, url = line.strip().split('\t')
        docId_urls[int(doc_id)] = url
        hashUrl_docId[hash(url)] = int(doc_id)
        
# Spell-corrected query text -> query id (file presumably produced by spl_queries()).
query_id = {}
with open('./data/queries.numerate.spl.txt') as item:
    for line in item:
        # Each line must split into exactly two tab-separated fields: id and query.
        q_id, q = line.strip().split('\t')
        query_id[q] = int(q_id)

### LDA

In [107]:
def getCorpus():
    """Build a bag-of-words corpus from every saved document index.

    Each file in ``path_index`` is loaded with ``DocIndex.load``; its term
    list is added to a gensim ``Dictionary`` and converted with ``doc2bow``.
    Files that cannot be loaded are skipped instead of contributing an empty
    document. The corpus is also written to ``./_corpus.txt`` as JSON.

    Returns:
        List of bag-of-words documents, as returned by ``doc2bow``.
    """
    index_files = os.listdir(path_index)
    dictionary = Dictionary([])
    corpus = []

    for item in tqdm_notebook(index_files, total=len(index_files), mininterval=1):
        path_file = os.path.join(path_index, item)

        try:
            # load() opens the file itself; no separate handle is needed.
            doc = DocIndex().load(path_file)
            if doc.error:
                continue
            dictionary.add_documents([doc.terms])
            corpus.append(dictionary.doc2bow(doc.terms))
        except Exception:
            # Best effort: one broken index must not stop the whole pass.
            continue

    with open("./_corpus.txt", 'w') as ifile:
        ifile.write(json.dumps(corpus))

    return corpus

def LDA_model(corpus, num_topics=10, seed=17, model_path="./LDAmodel/model"):
    """Train an LDA model on ``corpus`` and save it to disk.

    Args:
        corpus: Bag-of-words corpus, e.g. the value returned by getCorpus().
        num_topics: Number of topics passed to ``LdaModel``.
        seed: Value given to ``np.random.seed`` before training.
        model_path: Where the trained model is saved; the parent directory is
            created when missing.

    Returns:
        The trained ``LdaModel``.
    """
    np.random.seed(seed)
    LDA = LdaModel(corpus, num_topics=num_topics)

    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    # The original called `lda.save(...)`, an undefined name (NameError).
    LDA.save(model_path)

    return LDA

# corpus = getCorpus()
# corpus = json.loads(open("./_corpus.txt").read())
# LDA = LDA_model(corpus)
# LDA = LdaModel.load("./LDAmodel/model")

### Query

In [108]:
# Candidate table; Query reads its 'QueryId' and 'DocumentId' columns to find
# the documents to rank for a query.
sample_submission = pd.read_csv('./data/sample_submission.txt')

class Query():
    """A search query resolved against ``query_id`` and ``sample_submission``.

    Attributes:
        id: Query id from ``query_id``, or ``None`` when the text is unknown.
        docs_id: Array of candidate document ids listed for this query in
            ``sample_submission``; empty for an unknown query.
        terms: Normalized query terms; empty for an unknown query.
    """

    def __init__(self, query):
        # Always define every attribute, so callers can iterate `docs_id`
        # even when the query text is not in `query_id`.
        self.id = None
        self.docs_id = np.array([], dtype=int)
        self.terms = []

        if query not in query_id:
            return

        self.id = query_id[query]
        self.docs_id = sample_submission[sample_submission['QueryId'] == self.id]['DocumentId'].values
        self.terms = normalizer(query)

### Index

In [109]:
class DocIndex():
    """Per-document index that can be built from a raw file or stored as JSON.

    Attributes:
        id: Document id.
        url: Document URL.
        error: True when building or loading failed.
        doc_len: Number of terms in the document text.
        doc_freqs: Term -> zone-weighted frequency (see get_frequency()).
        positions: Term -> list of 1-based positions in the document text.
        terms: All terms of the document text, in order.
    """

    # Fields that save() writes and load() requires.
    _FIELDS = ('id', 'url', 'error', 'doc_len', 'doc_freqs', 'positions', 'terms')

    def __init__(self):
        self.id = 0
        self.url = ''
        self.error = False
        self.doc_len = 0
        self.doc_freqs = {}
        self.positions = {}
        self.terms = []

    def build_data(self, path_file, hashUrl):
        """Fill the index from a raw document file.

        Args:
            path_file: Path of the raw document.
            hashUrl: Key of the document in ``hashUrl_docId``.

        Returns:
            ``self``; ``self.error`` is True when parse() produced nothing.
        """
        self.id = hashUrl_docId[hashUrl]
        self.url = docId_urls[self.id]

        extract = parse(path_file)

        if not extract:
            self.error = True
        else:
            # All terms of the document.
            self.terms = extract['text']

            # 1-based positions of every term in the document.
            for i, term in enumerate(extract['text']):
                self.positions.setdefault(term, []).append(i + 1)

            # Document length in terms.
            self.doc_len = len(extract['text'])

            # Frequencies merged across zones.
            self.doc_freqs = get_frequency(extract)
        return self

    def save(self, path_index):
        """Write the index as JSON to ``<path_index>/<id>``."""
        with open(os.path.join(path_index, str(self.id)), 'w') as doc:
            doc.write(json.dumps(self.__dict__))

    def load(self, path_index):
        """Load the index from the JSON file at ``path_index``.

        Returns:
            ``self``. When the file is missing, unreadable, not valid JSON or
            lacks a required field, nothing is loaded and ``self.error`` is
            set to True.
        """
        try:
            with open(path_index) as doc:
                params = json.load(doc)
            loaded = {name: params[name] for name in self._FIELDS}
        except (OSError, ValueError, KeyError, TypeError):
            self.error = True
            return self

        for name, value in loaded.items():
            setattr(self, name, value)
        return self

### Build

In [16]:
class IndexBuild():
    """Build (or reuse) the per-document indexes and the corpus-level BM25 statistics.

    Constructing the object runs the whole build: every file of ``path_data``
    is indexed, document frequencies are collected, IDF values are computed
    and the results are written to ``./_params.json`` and ``./_idf.json``.

    Attributes:
        corpus_size: Number of successfully indexed documents.
        docs_len: Sum of ``doc_len`` over those documents.
        avgdl: ``docs_len / corpus_size`` (0 for an empty corpus).
        df: Term -> number of documents containing it.
        idf: Term -> inverse document frequency.
        average_idf: Mean of the raw IDF values (0 for an empty vocabulary).
        err_no_url: Files whose first-line URL is not in ``hashUrl_docId``.
        errors: Files that could not be indexed.
    """

    def __init__(self):
        self.corpus_size = 0
        self.average_idf = 0
        self.avgdl = 0
        self.docs_len = 0
        self.df = {}
        self.idf = {}
        self._build()

    def _build(self):
        """Index every raw document, then compute and persist the statistics."""
        total_load = 0
        total_save = 0
        self.err_no_url = []
        self.errors = []

        # A set gives constant-time "already indexed?" checks per document.
        index_files = set(os.listdir(path_index))
        data_files = os.listdir(path_data)

        for item in tqdm_notebook(data_files, total=len(data_files), mininterval=1):
            path_file = os.path.join(path_data, item)

            try:
                with open(path_file) as opened_file:
                    hashUrl = hash(get_url(opened_file))

                if hashUrl not in hashUrl_docId:
                    self.err_no_url.append(path_file)
                    continue

                doc_name = str(hashUrl_docId[hashUrl])
                doc_index = None

                if doc_name in index_files:
                    cached = DocIndex().load(os.path.join(path_index, doc_name))
                    # A cached file that fails to load is rebuilt below
                    # instead of being counted as an empty document.
                    if not cached.error:
                        doc_index = cached
                        total_load += 1

                if doc_index is None:
                    doc_index = DocIndex().build_data(path_file, hashUrl)
                    if doc_index.error:
                        self.errors.append(path_file)
                        continue
                    doc_index.save(path_index)
                    total_save += 1

                self.docs_len += doc_index.doc_len
                self.corpus_size += 1
                for word in doc_index.doc_freqs.keys():
                    self.df[word] = self.df.get(word, 0) + 1

            except Exception:
                # Best effort: record the file and continue with the next one.
                self.errors.append(path_file)
                continue

        self._compute_idf()
        self._save_params()

        print("Загружено: {0}; Сохранено: {1}; err_no_url: {2}; error: {3};".format(total_load, total_save, len(self.err_no_url), len(self.errors)))
        return self

    def _compute_idf(self):
        """Derive avgdl, per-term IDF and the mean IDF from the collected counts."""
        # Guard the divisions so an empty corpus yields zeros, not ZeroDivisionError.
        self.avgdl = self.docs_len / self.corpus_size if self.corpus_size else 0

        idf_sum = 0
        eps = 0.25
        negative_idfs = []
        for word, freq in self.df.items():
            idf = np.log((self.corpus_size - freq + 0.5) / (freq + 0.5))
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = idf_sum / len(self.idf) if self.idf else 0

        # Terms with a negative IDF get a fraction (eps) of the mean IDF instead.
        for word in negative_idfs:
            self.idf[word] = eps * self.average_idf

    def _save_params(self):
        """Write the corpus statistics and the IDF table as JSON files."""
        with open('./_params.json', 'w') as ifile:
            params = {
                "corpus_size": self.corpus_size,
                "docs_len": self.docs_len,
                "avgdl": self.avgdl,
                "average_idf": self.average_idf
            }
            ifile.write(json.dumps(params))
        with open('./_idf.json', 'w') as idf:
            idf.write(json.dumps(self.idf))
        
# Runs the full build: the IndexBuild constructor calls _build(), which also
# writes ./_params.json and ./_idf.json.
index = IndexBuild()

A Jupyter Widget


Загружено: 0; Сохранено: 74223; err_no_url: 17; error: 18;


In [127]:
def _load_json(path):
    """Parse the JSON file at ``path`` and close the handle afterwards."""
    with open(path) as json_file:
        return json.load(json_file)


# Term -> IDF table and corpus statistics written by IndexBuild, plus the
# saved synonym cache (word -> list of synonyms).
idf = _load_json('./_idf.json')
params = _load_json('./_params.json')
synonyms = _load_json('./_synonyms.json')

In [111]:
def get_phrase_score(phrase_words, phrase_pos, doc_id):
    """Score exact, in-order occurrences of a whole phrase inside one document.

    Args:
        phrase_words: The phrase terms, in order.
        phrase_pos: ``phrase_pos[j]`` is the list of positions of
            ``phrase_words[j]`` in the document.
        doc_id: Not used by the computation.

    Returns:
        0 when the phrase never occurs; otherwise
        ``0.1 * sum(idf of the words) * n / (1 + n)`` with ``n`` the number of
        occurrences.
    """
    phrase_len = len(phrase_words)
    occurrences = 0

    for start in phrase_pos[0]:
        # Walk the remaining words: word j has to sit exactly j slots after `start`.
        expected = start
        matched = 1
        for word_idx in range(1, phrase_len):
            expected += 1
            if expected not in phrase_pos[word_idx]:
                break
            matched += 1

        if matched == phrase_len:
            occurrences += 1

    if occurrences > 0:
        return 0.1 * sum([idf[w] for w in phrase_words]) * (occurrences/(1+occurrences))

    return 0

In [241]:
def _bm25_score(terms, doc_freqs, dl, avgdl, k, b):
    """BM25 sum over the query terms, with a small bonus for a matching synonym."""
    score = 0
    for word in terms:
        if word in doc_freqs:
            f = doc_freqs[word]
            TF = (f * (k + 1)) / (f + k * (1 - b + b * (dl / avgdl)))
            score += idf[word] * TF
        else:
            # Query word missing from the document: try its first stored
            # synonym. Words without an entry in `synonyms` have none.
            for w_syn in synonyms.get(word, [])[:1]:
                if w_syn in doc_freqs:
                    f = doc_freqs[w_syn]
                    score += 0.1 * idf[w_syn] * (f/(1+f))
                    break
    return score


def _pair_score(terms, positions):
    """Bonus for neighbouring query terms that are also adjacent in the document."""
    score = 0
    for first, second in zip(terms, terms[1:]):
        if first in positions and second in positions:
            following = set(positions[second])
            # Number of times `second` directly follows `first`.
            count = sum(1 for p in positions[first] if p + 1 in following)
            score += 0.1 * (idf[first] + idf[second]) * (count/(1+count))
    return score


def ranging(query, top_n=10, k=2, b=0.75):
    """Rank the candidate documents of a query.

    The score of a document is the sum of three parts: BM25 over the
    zone-weighted frequencies (with a synonym bonus), a bonus for adjacent
    pairs of query terms, and - for queries of more than two terms that all
    occur in the document - the phrase score from get_phrase_score().

    Args:
        query: Query text; must be a key of ``query_id`` to yield results.
        top_n: Maximum number of rows returned.
        k: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        Up to ``top_n`` rows ``[score, query id, document id]``, best first.
        An empty list when the query is unknown.
    """
    q = Query(query)

    # Query leaves its attributes unset for unknown text; nothing to rank then.
    docs_id = getattr(q, 'docs_id', None)
    if docs_id is None:
        return []

    avgdl = params['avgdl']
    result = []

    for doc_id in docs_id:
        doc_index = DocIndex().load(os.path.join(path_index, str(doc_id)))
        if doc_index.error:
            continue

        positions = doc_index.positions

        # BM25F
        score_bm25 = _bm25_score(q.terms, doc_index.doc_freqs, doc_index.doc_len, avgdl, k, b)

        # Pair: exact occurrences of two neighbouring query words.
        score_pair = _pair_score(q.terms, positions)

        # Phrase: all query words in a row, for queries longer than two words.
        score_phrase = 0
        phrase_words = [word for word in q.terms if word in positions]
        if len(phrase_words) == len(q.terms) and len(phrase_words) > 2:
            phrase_pos = [positions[word] for word in phrase_words]
            score_phrase = get_phrase_score(phrase_words, phrase_pos, doc_id)

        score = score_bm25 + score_pair + score_phrase
        result.append([score, q.id, doc_id])

    return sorted(result, key=itemgetter(0), reverse=True)[:top_n]

In [242]:
# Rank the candidate documents of every spell-corrected query and collect the rows.
answers = []
with open('./data/queries.numerate.spl.txt') as item:
    for line in item:
        # Each line must split into exactly two tab-separated fields. Only the
        # query text is passed on; ranging() resolves the id again via Query.
        q_id, q = line.strip().split('\t')
        answers.extend(ranging(q))

In [243]:
# Each row of `answers` is [score, query id, document id], as built by ranging().
df = pd.DataFrame(answers, columns=['Score', 'QueryId', 'DocumentId'])
# Only the id columns are exported; rows keep the order in which they were collected.
df[['QueryId', 'DocumentId']].to_csv('./predict.csv', sep=',', index=False)

In [41]:
# id_doc - file_name
docId_fileName = {}

data_files = os.listdir(path_data)
for item in data_files:
    path_file = os.path.join(path_data, item)

    with open(path_file) as opened_file:
        hashUrl = hash(get_url(opened_file))
        if hashUrl not in hashUrl_docId:
            continue
        else:
            docId_fileName[hashUrl_docId[hashUrl]] = item

In [46]:
# Spot check: list the candidate document ids that the sample submission
# holds for one query.
sample = pd.read_csv('./data/sample_submission.txt')
q_id = query_id['беспроводная зарядка для iphone xr']
docs_id = sample[sample['QueryId'] == q_id]['DocumentId'].values 
docs_id

array([9840, 9814, 9808, 9822, 9821, 9805, 9841, 9838, 9809, 9827, 9818,
       9828, 9825, 9802, 9806, 9813, 9835, 9800, 9810, 9826, 9843, 9829,
       9834, 9807, 9816, 9815, 9803, 9830, 9804, 9811, 9819, 9832, 9833,
       9839, 9842, 9801, 9820, 9799, 9836, 9837, 9824, 9812, 9823, 9817,
       9831])

In [48]:
# Every candidate id must resolve to a raw file: a missing id raises KeyError here.
len([docId_fileName[i] for i in docs_id])

45