In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

import re
import os
from functools import reduce

from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool, Lock, Value

# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split

## Получение всех заголовков

In [3]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


### Preprocessing

In [4]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def to_digit(x):
    try:
        return int(x)
    except ValueError:
        pass
    
def remove_urls (vTEXT):
    regex = re.compile(
    # r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    vTEXT = re.sub(regex, '', vTEXT)
    return(vTEXT)

def preprocess_text(document):
        # Remove urls
        # document = re.sub(url_reg, '', document)
        document = remove_urls(document)
        
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[а-яА-Я]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[а-яА-Я]\s+', ' ', str(document))
        
        # Remove 1-2 digits
#         document = re.sub(r'\b[0-9]\b', ' ', str(document))

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', str(document), flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        # print(tokens)
        # tokens = [stemmer.lemmatize(word) for word in tokens]
        # tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens \
                  if (len(word) > 2)|\
                  (type(to_digit(word)) == int)|\
                  (word.lower() in ['cs','go','vc'])]
        # print(tokens)

        preprocessed_text = ' '.join(list(map(lambda x: morph.parse(x)[0].normal_form, tokens )))

        return preprocessed_text

In [5]:
# Only titles
docs_titles = list(doc_to_title.values())
final_corpus = [preprocess_text(sentence) for sentence in tqdm(docs_titles) if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in tqdm(final_corpus)]

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27994), HTML(value='')))




## Настройка Tf-Idf

In [6]:
Vect = TfidfVectorizer(norm='l1')
Vect.fit(final_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l1', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
from tqdm import tnrange
from scipy.spatial.distance import cosine, cdist

def get_embedding(vec, group_Matrix, thrshold = 0.03):
    # return sorted((vec * group_Matrix > thrshold).sum(axis=1), reverse=True)[1:25]
    return sorted((vec * group_Matrix).sum(axis=1), reverse=True)[1:25]

## Train part

In [7]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in tqdm(range(len(train_data))):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

HBox(children=(IntProgress(value=0, max=11690), HTML(value='')))




In [9]:
y_train = []
X_train = []
groups_train = []
for new_group in tqdm(traingroups_titledata):
    docs = traingroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, title, _ in docs]).toarray()
    for k, ((doc_id, title, target_id), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        y_train.append(target_id)
        groups_train.append(new_group)
        X_train.append(get_embedding(vec, group_TfIdf_Matrix))


X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


(11690, 24) (11690,) (11690,)


In [10]:
arr = np.zeros((X_train.shape[0], max(map(len, X_train))))
for i, row in enumerate(X_train):
    for j, val in enumerate(row):
        arr[i,j]=val
X_train = np.array(arr)

## Test part

In [11]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in tqdm(range(test_data.shape[0])):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, pair_id, title))

HBox(children=(IntProgress(value=0, max=16627), HTML(value='')))




In [12]:
X_test = []
pairs_id = []
for new_group in tqdm(testgroups_titledata):
    docs = testgroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, _, title in docs]).toarray()
    for k, ((doc_id, pair_id, title), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        X_test.append(get_embedding(vec, group_TfIdf_Matrix))
        pairs_id.append(pair_id)

X_test = np.array(X_test)
print(X_test.shape)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))


(16627, 24)


In [13]:
arr = np.zeros((X_test.shape[0], max(map(len, X_test))))
for i, row in enumerate(X_test):
    for j, val in enumerate(row):
        arr[i,j]=val
X_test = np.array(arr)

## Валидация

In [14]:
from itertools import zip_longest, product

prev_group = 1
prev_index = 0
groups_indices = []
for k, i in zip_longest(range(len(groups_train) + 1),
                        groups_train, fillvalue=-1):
    if prev_group != i:
        groups_indices.append([prev_index, k])
        prev_group = i
        prev_index = k
groups_indices = np.array(groups_indices)

In [21]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold

import random

In [22]:
def predict(clf, trsh, X):
    proba = clf.predict_proba(X)
    return np.array(list(map(lambda x: 1 if x[1] > trsh else 0  , proba)))

In [24]:
PENALTY = ['l2']
LOSS = ['squared_hinge']
THRSHS = np.arange(0.05, 1.0, 0.05)
REGUL_C = np.logspace(-2,1,4) #np.arange(1,241,40) #np.arange(1,201,40)

Params = list(product(REGUL_C, LOSS, PENALTY, THRSHS))

result = np.zeros(len(Params))

for i, (C, loss, penalty, trsh) in tqdm(list(enumerate(Params))):
    f_scores = []
    roc_aucs = []
    kf = KFold(n_splits=6)
    for train_index, val_index in kf.split(groups_indices):
        Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
        Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

        Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
        Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

        clf = SVC(C=C, probability=True)
        clf.fit(Train_X, Train_y)
        preds = predict(clf, trsh, Val_X)
        
        f_scores.append(f1_score(Val_y, preds))
#         roc_aucs.append(roc_auc_score(Val_y, preds))
        
        result[i] = np.mean(f_scores)
#         result[i] = np.mean(roc_aucs)
        
best_index = result.argmax()
BEST_LOSS = Params[best_index][1]
BEST_REGUL_C = Params[best_index][0]
BEST_PENALTY = Params[best_index][2]
BEST_T = Params[best_index][3]
print(BEST_REGUL_C, BEST_LOSS, BEST_PENALTY, result[best_index])



HBox(children=(IntProgress(value=0, max=76), HTML(value='')))

KeyboardInterrupt: 

In [34]:
f_scores = []
roc_aucs = []
kf = KFold(n_splits=6)
for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = SVC(C=0.01, probability=True)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, 0.3, Val_X)

    f_scores.append(f1_score(Val_y, preds))
#         roc_aucs.append(roc_auc_score(Val_y, preds))

    result[i] = np.mean(f_scores)
#         result[i] = np.mean(roc_aucs)

In [35]:
np.mean(f_scores)

0.6940817107109333

In [47]:
Test_Preds = []
f_scores = []
roc_aucs = []

kf = KFold(n_splits=4, shuffle=True)

for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = SVC(C=100, probability=True)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, 0.25, Val_X)
    Test_Preds.append(predict(clf, 0.25, X_test))
    f_scores.append(f1_score(Val_y, preds))
    roc_aucs.append(roc_auc_score(Val_y, preds))
Test_Preds = np.array(Test_Preds)

In [48]:
print('f-score`s : ', np.round(f_scores,3), np.mean(f_scores))
print('ROC-AUC score`s : ', np.round(roc_aucs,3), np.mean(roc_aucs))

f-score`s :  [0.658 0.686 0.685 0.765] 0.69850008184307
ROC-AUC score`s :  [0.795 0.785 0.816 0.798] 0.7985480700938516
