In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

import re
import os
from functools import reduce

from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool, Lock, Value

# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split

## Получение всех заголовков

In [3]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


In [4]:
doc_to_title

{15731: 'ВАЗ 21213 | Замена подшипников ступицы | Нива',
 14829: 'Ваз 2107 оптом в Сочи. Сравнить цены, купить потребительские товары на Tiu.ru',
 15764: 'Купить ступица Лада калина2. Трансмиссия - переходные ступицы цена, замена, тюнинг.',
 17669: 'Классика 21010 - 21074',
 14852: 'Ступица Нива — замена подшипника своими руками',
 15458: 'ВАЗ 2110',
 14899: 'Обзор подшипников полуоси ВАЗ 2101-07, 2121,2123',
 16879: 'Купить Подшипники и ступицы FAG (Страница 23)',
 16310: 'HorsePowers — автомобильный интернет портал » Отзыв владельца ВАЗ 2121 Нива 2007 года',
 15440: 'Новости и сообщения из официальной группы Вконтакте торговой компании 33 Sport - Магазины - Тольятти',
 16242: 'Инструкция по замене подшипника передней ступицы ивеко дейли через dorognoekam.ru',
 16383: 'Ступицы - OLX.ua - страница 80',
 15580: 'маааленькая проблемка — бортжурнал Автокам 2160 ╬ 1994 года на DRIVE2',
 16840: 'Разгруженные полуоси для Нива (24 шлица 765 мм)',
 17519: 'Прошивки для нива м7.9.7 скачать - Фа

In [5]:
headers = pd.read_csv('data_headers_last.csv', sep=';')
doc_to_headers = dict(zip(headers['1'].values, headers['0']))

### Preprocessing

In [6]:
def standardize_text(text):
    text = text.replace(r"http\S+", "")
    text = text.replace(r"http", "")
    text = text.replace(r"@\S+", "")
    text = text.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    text = text.replace(r"@", "at")
    text = text.lower()
    return text

In [7]:
test_sent = doc_to_title[16473]
test_sent

'Кто такие таты? | New-Best.com Самый простой поиск ответов на наилучшие вопросы'

In [8]:
def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.ASCII)
    return(vTEXT)


print( remove_urls("this is a test https://sdfs.sdfsdf.com/sdfsdf/sdfsdf/sd/sdfsdfs?bob=%20tree&jef=man lets see this too https://sdfsdf.fdf.com/sdf/f end"))
print( remove_urls(test_sent))

this is a test  lets see this too  end
Кто такие таты? | New-Best.com Самый простой поиск ответов на наилучшие вопросы


In [9]:
standardize_text(doc_to_title[6512])

'книга 2. ветер перемен (вячеслав безкрылов) / проза.ру'

In [10]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def to_digit(x):
    try:
        return int(x)
    except ValueError:
        pass
    
def remove_urls (vTEXT):
    regex = re.compile(
    # r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    vTEXT = re.sub(regex, '', vTEXT)
    return(vTEXT)

def preprocess_text(document):
        # Remove urls
        # document = re.sub(url_reg, '', document)
        document = remove_urls(document)
        
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[а-яА-Я]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[а-яА-Я]\s+', ' ', str(document))
        
        # Remove 1-2 digits
#         document = re.sub(r'\b[0-9]\b', ' ', str(document))

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', str(document), flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        # print(tokens)
        # tokens = [stemmer.lemmatize(word) for word in tokens]
        # tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens \
                  if (len(word) > 2)|\
                  (type(to_digit(word)) == int)|\
                  (word.lower() in ['cs','go','vc'])]
        # print(tokens)

        preprocessed_text = ' '.join(list(map(lambda x: morph.parse(x)[0].normal_form, tokens )))

        return preprocessed_text

In [70]:
preprocess_text('Замена и регулировка переднего ступичного подшипника нива | Нива Ремонт').split()

['замена',
 'регулировка',
 'передний',
 'ступичный',
 'подшипник',
 'нива',
 'нива',
 'ремонт']

In [11]:
test_sent = doc_to_title[16473]
test_sent

'Кто такие таты? | New-Best.com Самый простой поиск ответов на наилучшие вопросы'

In [12]:
preprocess_text(test_sent)

'кто такой тат new best com самый простой поиск ответ хороший вопрос'

In [13]:
len(doc_to_title)

28026

In [14]:
len(doc_to_headers)

28024

In [15]:
DF.values.shape

NameError: name 'DF' is not defined

In [103]:
df1 = pd.DataFrame(doc_to_headers.values(), index=doc_to_headers.keys(), columns=['headers']).astype(str)
df2 = pd.DataFrame(doc_to_title.values(), index=doc_to_title.keys(), columns=['titles']).sort_index().astype(str)

DF = pd.concat([df1,df2], axis=1)

doc_to_Titles_and_Headers = dict(zip((DF['headers'] + ' ' + DF['titles']).index , 
                                     (DF['headers'] + ' ' + DF['titles']).values))

In [15]:
# Only titles
docs_titles = list(doc_to_title.values())
final_corpus = [preprocess_text(sentence) for sentence in tqdm(docs_titles) if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in tqdm(final_corpus)]

# ## Titles with headers
# docs_titles_and_headers = list(doc_to_Titles_and_Headers.values())
# final_corpus = [preprocess_text(str(sentence)) for sentence in tqdm(docs_titles_and_headers) if str(sentence).strip() !='']

# word_punctuation_tokenizer = nltk.WordPunctTokenizer()
# word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in tqdm(final_corpus)]

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27994), HTML(value='')))




## Настройка Tf-Idf

In [50]:
Vect = TfidfVectorizer(norm='l1')
Vect.fit(final_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l1', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

## Train part

In [19]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in tqdm(range(len(train_data))):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

HBox(children=(IntProgress(value=0, max=11690), HTML(value='')))




In [20]:
from tqdm import tnrange
from scipy.spatial.distance import cosine, cdist

def get_embedding(vec, group_Matrix, thrshold = 0.03):
    # return sorted((vec * group_Matrix > thrshold).sum(axis=1), reverse=True)[1:25]
    return sorted((vec * group_Matrix).sum(axis=1), reverse=True)[1:25]
#     vec = vec.reshape(1, len(vec))
#     group_dist = cdist(group_Matrix, vec, metric='euclidean')
#     return sorted( group_dist.reshape(len(group_dist),) ,   reverse=True)[1:80]

In [51]:
y_train = []
X_train = []
groups_train = []
for new_group in tqdm(traingroups_titledata):
    docs = traingroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, title, _ in docs]).toarray()
    for k, ((doc_id, title, target_id), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        y_train.append(target_id)
        groups_train.append(new_group)
        X_train.append(get_embedding(vec, group_TfIdf_Matrix))


X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


(11690, 24) (11690,) (11690,)


In [52]:
min(map(len, X_train)), max(map(len, X_train))

(24, 24)

In [53]:
arr = np.zeros((X_train.shape[0], max(map(len, X_train))))
for i, row in enumerate(X_train):
    for j, val in enumerate(row):
        arr[i,j]=val
X_train = np.array(arr)

In [54]:
print (X_train.shape, y_train.shape, groups_train.shape)

(11690, 24) (11690,) (11690,)


In [545]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

In [183]:
pd.Series(y_train).value_counts()

0    8329
1    3361
dtype: int64

## Test part

In [25]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in tqdm(range(test_data.shape[0])):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, pair_id, title))

HBox(children=(IntProgress(value=0, max=16627), HTML(value='')))




In [55]:
X_test = []
pairs_id = []
for new_group in tqdm(testgroups_titledata):
    docs = testgroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, _, title in docs]).toarray()
    for k, ((doc_id, pair_id, title), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        X_test.append(get_embedding(vec, group_TfIdf_Matrix))
        pairs_id.append(pair_id)

X_test = np.array(X_test)
print(X_test.shape)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))


(16627, 24)


In [56]:
X_test.shape[0], len(X_test[0])

(16627, 24)

In [57]:
min(map(len, X_test)), max(map(len, X_test))

(24, 24)

In [58]:
arr = np.zeros((X_test.shape[0], max(map(len, X_test))))
for i, row in enumerate(X_test):
    for j, val in enumerate(row):
        arr[i,j]=val
X_test = np.array(arr)

In [59]:
print(X_test.shape)

(16627, 24)


In [551]:
ss = StandardScaler()
X_test = ss.fit_transform(X_test)

***

## Валидация

In [60]:
from itertools import zip_longest, product

prev_group = 1
prev_index = 0
groups_indices = []
for k, i in zip_longest(range(len(groups_train) + 1),
                        groups_train, fillvalue=-1):
    if prev_group != i:
        groups_indices.append([prev_index, k])
        prev_group = i
        prev_index = k
groups_indices = np.array(groups_indices)

In [32]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold

import random

In [33]:
X_train.shape

(11690, 24)

In [34]:
def predict(clf, trsh, X):
    proba = clf.predict_proba(X)
    return np.array(list(map(lambda x: 1 if x[1] > trsh else 0  , proba)))

In [319]:
X_train[np.where(np.isnan(X_train) == True)[0],
        np.where(np.isnan(X_train) == True)[1]] = 0.5

In [61]:
L1_RATIO = np.arange(0.1, 1.1, 0.1)
THRSHS = np.arange(0.05, 1.0, 0.05)
REGUL_C = np.logspace(-2,1,4) #np.arange(1,241,40) #np.arange(1,201,40)
result = np.zeros(len(list(product(REGUL_C, THRSHS, L1_RATIO))))

for i, (C, trsh, l1_ratio) in tqdm(list(enumerate(product(REGUL_C, THRSHS, L1_RATIO)))):
    f_scores = []
    roc_aucs = []
    kf = KFold(n_splits=6)
    for train_index, val_index in kf.split(groups_indices):
        Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
        Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

        Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
        Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

        clf = LogisticRegression(C=C, penalty='elasticnet', solver='saga', l1_ratio=l1_ratio)
        clf.fit(Train_X, Train_y)
        preds = predict(clf, trsh, Val_X)
        
        f_scores.append(f1_score(Val_y, preds))
#         roc_aucs.append(roc_auc_score(Val_y, preds))
        
        result[i] = np.mean(f_scores)
#         result[i] = np.mean(roc_aucs)
        
best_index = result.argmax()
BEST_T = list(product(REGUL_C, THRSHS, L1_RATIO))[best_index][1]
BEST_REGUL_C = list(product(REGUL_C, THRSHS, L1_RATIO))[best_index][0]
BEST_L1_RATIO = list(product(REGUL_C, THRSHS, L1_RATIO))[best_index][2]
print(BEST_REGUL_C, BEST_T, BEST_L1_RATIO, result[best_index])

HBox(children=(IntProgress(value=0, max=760), HTML(value='')))


10.0 0.25 0.9 0.7030812098313097


In [62]:
best_index = result.argmax()
BEST_T = list(product(REGUL_C, THRSHS))[best_index][1]
BEST_REGUL_C = list(product(REGUL_C, THRSHS))[best_index][0]
print(BEST_REGUL_C, BEST_T, result[best_index])

IndexError: list index out of range

In [342]:
X_test[np.where(np.isnan(X_test) == True)[0],
        np.where(np.isnan(X_test) == True)[1]] = 0.5

In [63]:
Test_Preds = []
f_scores = []
roc_aucs = []

kf = KFold(n_splits=4, shuffle=True)

for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = LogisticRegression(C=BEST_REGUL_C, penalty='elasticnet', solver='saga', l1_ratio=BEST_L1_RATIO)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, BEST_T, Val_X)
    Test_Preds.append(predict(clf, BEST_T, X_test))
    f_scores.append(f1_score(Val_y, preds))
    roc_aucs.append(roc_auc_score(Val_y, preds))
Test_Preds = np.array(Test_Preds)

In [64]:
print('f-score`s : ', np.round(f_scores,3), np.mean(f_scores))
print('ROC-AUC score`s : ', np.round(roc_aucs,3), np.mean(roc_aucs))

f-score`s :  [0.746 0.653 0.806 0.629] 0.7085964474278712
ROC-AUC score`s :  [0.834 0.774 0.848 0.786] 0.8105690897920117


In [39]:
Test_Preds

array([[1, 0, 0, ..., 1, 1, 1],
       [1, 0, 0, ..., 1, 1, 1],
       [1, 0, 0, ..., 1, 1, 1],
       [1, 0, 0, ..., 1, 1, 1]])

In [453]:
weights = f_scores/sum(f_scores)
Predictions = []
for i in Test_Preds[[2,3]].T:
    Predictions.append(int(np.dot(i, weights[[2,3]]).round()))

In [65]:
Predictions = Test_Preds[[0,2]].T.mean(axis=1).round()

In [66]:
SUBMIT = pd.DataFrame(columns=['pair_id', 'target'])
SUBMIT['pair_id'] = pairs_id
SUBMIT['target'] = Predictions.astype(int)

SUBMIT.to_csv('submit_tfidf_LogReg_l1_without_trsh_tfidf_l1.csv', index=0)

In [67]:
SUBMIT

Unnamed: 0,pair_id,target
0,11691,1
1,11692,0
2,11693,0
3,11694,1
4,11695,0
...,...,...
16622,28313,0
16623,28314,1
16624,28315,1
16625,28316,1
