# Соревнование

## Загрузка текстов документов с диска

In [145]:
from tqdm import tqdm
import os
import numpy as np

In [146]:
# Map document id -> document text, read from the files in `compiled/`.
# The id is the part of the file name before the first dot.
doc_to_title = {}

# Iterating the listing directly (no enumerate) lets tqdm report a total and
# drops a loop index that was overwritten on the first line of the body.
for filename in tqdm(os.listdir('compiled')):
    # Hidden entries such as '.DS_Store' carry no numeric id; skip them.
    if filename.startswith('.'):
        continue
    num = filename.split('.', 1)[0]
    # Explicit encoding so the text is decoded identically on every platform
    # instead of depending on the locale default.
    with open(os.path.join('compiled', filename), encoding='utf-8') as f:
        doc_to_title[int(num)] = f.read()

28026it [00:16, 1714.90it/s]


In [147]:
# Sanity check: display the text loaded for document id 1.
doc_to_title[1]

'тип: реферат; размер: . kb.; резюме: в статье систематизированы клинические и лабораторные данные, свидетельствующие об эндокринных расстройствах, приводящих к бесплодию, или сопутствующих ему. даны рекомендации по рационализации и минимизации числа лабораторных анализов на этапе обследования пациентов с бесплодием м. б. аншина центр репродукции и генетики «фертимед», г. москва документы рефераты сочинения гдз м. б. аншина центр репродукции и генетики «фертимед», г. москва\n'

## Составление трейн-тест групп

In [148]:
import pandas as pd

# Group the training documents by group_id:
#   traingroups_titledata[group_id] -> list of (doc_id, text, target),
#   in the row order of train_groups.csv.
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}

# Walk the three columns in lockstep instead of materialising a Series for
# every row with .iloc[i], which is far slower and yields the same values.
for doc_group, doc_id, target in zip(
    train_data['group_id'], train_data['doc_id'], train_data['target']
):
    # Raises KeyError if the document text was not loaded into doc_to_title.
    title = doc_to_title[doc_id]
    traingroups_titledata.setdefault(doc_group, []).append((doc_id, title, target))

In [149]:
# Group the test documents by group_id:
#   testgroups_titledata[group_id] -> list of (doc_id, text, -1),
#   in the row order of test_groups.csv. The -1 fills the target slot so the
#   tuples have the same layout as the training ones.
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}

# Walk the two columns in lockstep instead of materialising a Series for
# every row with .iloc[i], which is far slower and yields the same values.
for doc_group, doc_id in zip(test_data['group_id'], test_data['doc_id']):
    # Raises KeyError if the document text was not loaded into doc_to_title.
    title = doc_to_title[doc_id]
    testgroups_titledata.setdefault(doc_group, []).append((doc_id, title, -1))

In [150]:
from nltk.corpus import stopwords

import ssl
# NOTE(review): this replaces the process-wide default HTTPS context factory
# with the private ssl._create_stdlib_context. Presumably a workaround for
# certificate errors during nltk.download; in CPython that factory appears to
# skip certificate verification -- confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_stdlib_context
import nltk
nltk.download('stopwords')

# Custom stop-word list: one entry per line, surrounding whitespace stripped.
# Explicit encoding so the file is decoded identically on every platform.
with open('stop words.txt', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]
# Keep only the last three characters of the first entry.
# NOTE(review): looks like a workaround for a BOM / junk prefix on the first
# line of the file -- confirm; if so, encoding='utf-8-sig' would be cleaner.
stop_words[0] = stop_words[0][-3:]
stop_words += ['http', 'ru', 'com']


# Extend with NLTK's Russian stop words plus two extra entries.
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(['это', 'нею'])

stop_words += russian_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Dmitry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## SVD по группе, расстояния и статистики в группе

In [151]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

def get_tf_vect(docs):
    """Fit a TF-IDF vectorizer on the texts of one group of documents.

    docs: iterable of (doc_id, text, target) triples; only the text is used.
    Returns the fitted TfidfVectorizer, configured with the module-level
    `stop_words` list.
    """
    corpus = [title for _doc_id, title, _target in docs]
    tfidf = TfidfVectorizer(stop_words=stop_words)
    tfidf.fit(corpus)
    return tfidf

def get_tf_trans(model, text):
    """Vectorize a single text with a fitted model.

    The text is passed to `model.transform` as a one-element batch; the result
    is densified with `.toarray()` and its only row is returned.
    """
    batch = model.transform([text])
    dense = batch.toarray()
    return dense[0]

In [153]:
# Pluggable vectorizer pair used by the feature loops:
#   funcs[0](docs) fits a model on a group, funcs[1](model, text) vectorizes one text.
funcs = [get_tf_vect, get_tf_trans]

In [154]:
from sklearn.decomposition import TruncatedSVD

# Build one feature row per training document.
# Row layout (19 columns when a group has at least 16 documents):
#   [0:15]  the 15 smallest cosine distances to other documents of the group
#   [15]    cosine distance to the group centroid
#   [16]    mean squared cosine distance to all other documents
#   [17]    number of other documents within Euclidean radius `rrad`
#   [18]    mean of the squares of the 5 smallest cosine distances
X_train = []
y_train = []
groups_train = []

for new_group in tqdm(traingroups_titledata):
    docs = traingroups_titledata[new_group]
    model = funcs[0](docs)

    # Dense vector of every document in the group, in `docs` order.
    vectors = np.array([funcs[1](model, text) for _, text, _ in docs])

    # Fixed seed: the default randomized solver is stochastic, so without it
    # the saved features differ from run to run.
    svd = TruncatedSVD(n_components=100, random_state=0)
    vectors = svd.fit_transform(vectors)

    center = np.mean(vectors, axis=0)

    # NOTE(review): np.median without `axis` returns one scalar over all
    # entries, so the reference point below is (m, m, ..., m) rather than a
    # per-component median. Possibly axis=0 was intended; left unchanged here
    # because train and test features must be computed the same way.
    median = np.median(vectors)

    # Squared distance of every document to the reference point. It does not
    # depend on the bisection variable, so compute it once.
    ref_sq_dist = np.sum((median - vectors) ** 2, axis=1)

    lrad = 0
    rrad = 10000
    eps = 0.0001
    ncount = int(len(vectors) * 0.8)

    # Bisection on the radius: narrow [lrad, rrad] around the point where the
    # number of documents strictly inside the ball first exceeds `ncount`.
    while rrad - lrad > eps:
        mid = (lrad + rrad) / 2
        if np.count_nonzero(ref_sq_dist < mid * mid) <= ncount:
            lrad = mid
        else:
            rrad = mid

    for k, (doc_id, text, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)

        vect_text = vectors[k]

        cnt = 0
        dist = []
        for j, j_vect_text in enumerate(vectors):
            if j == k:
                continue
            # cosine() yields NaN (with a RuntimeWarning) for zero vectors.
            dist.append(cosine(vect_text, j_vect_text))

            if np.sum((vect_text - j_vect_text) ** 2) < rrad * rrad:
                cnt += 1

        center_dist = cosine(center, vect_text)
        mean_sq_dist = np.mean(np.array(dist) ** 2)
        # np.sort places NaN last, so the slice keeps the smallest valid distances.
        dist = np.sort(dist)[:15]

        sq_mean5 = np.mean(dist[:5] ** 2)

        X_train.append(np.concatenate((dist, np.array([center_dist, mean_sq_dist, cnt, sq_mean5]))))

X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)

print(X_train.shape, y_train.shape, groups_train.shape)

100%|██████████| 129/129 [02:30<00:00,  1.17s/it]

(11690, 19) (11690,) (11690,)





In [155]:
# Sanity check: display the first training feature row.
X_train[0:1]

array([[0.26155087, 0.28023325, 0.63909363, 0.71518123, 0.72712701,
        0.77126297, 0.78902297, 0.80523784, 0.82530073, 0.83086177,
        0.83689377, 0.83719606, 0.8375959 , 0.84952542, 0.85811398,
        0.58131501, 0.87830123, 2.        , 0.31911562]])

In [156]:
# Build one feature row per test document.
# Row layout (19 columns when a group has at least 16 documents):
#   [0:15]  the 15 smallest cosine distances to other documents of the group
#   [15]    cosine distance to the group centroid
#   [16]    mean squared cosine distance to all other documents
#   [17]    number of other documents within Euclidean radius `rrad`
#   [18]    mean of the squares of the 5 smallest cosine distances
X_test = []
groups_test = []

for new_group in tqdm(testgroups_titledata):
    docs = testgroups_titledata[new_group]
    model = funcs[0](docs)

    # Dense vector of every document in the group, in `docs` order.
    vectors = np.array([funcs[1](model, text) for _, text, _ in docs])

    # Fixed seed: the default randomized solver is stochastic, so without it
    # the saved features differ from run to run.
    svd = TruncatedSVD(n_components=100, random_state=0)
    vectors = svd.fit_transform(vectors)

    center = np.mean(vectors, axis=0)

    # NOTE(review): np.median without `axis` returns one scalar over all
    # entries, so the reference point below is (m, m, ..., m) rather than a
    # per-component median. Possibly axis=0 was intended; left unchanged here
    # because train and test features must be computed the same way.
    median = np.median(vectors)

    # Squared distance of every document to the reference point. It does not
    # depend on the bisection variable, so compute it once.
    ref_sq_dist = np.sum((median - vectors) ** 2, axis=1)

    lrad = 0
    rrad = 10000
    eps = 0.0001
    ncount = int(len(vectors) * 0.8)

    # Bisection on the radius: narrow [lrad, rrad] around the point where the
    # number of documents strictly inside the ball first exceeds `ncount`.
    while rrad - lrad > eps:
        mid = (lrad + rrad) / 2
        if np.count_nonzero(ref_sq_dist < mid * mid) <= ncount:
            lrad = mid
        else:
            rrad = mid

    for k in range(len(docs)):
        groups_test.append(new_group)

        vect_text = vectors[k]

        cnt = 0
        dist = []
        for j, j_vect_text in enumerate(vectors):
            if j == k:
                continue
            # cosine() yields NaN (with a RuntimeWarning) for zero vectors.
            dist.append(cosine(vect_text, j_vect_text))

            if np.sum((vect_text - j_vect_text) ** 2) < rrad * rrad:
                cnt += 1

        center_dist = cosine(center, vect_text)
        mean_sq_dist = np.mean(np.array(dist) ** 2)
        # np.sort places NaN last, so the slice keeps the smallest valid distances.
        dist = np.sort(dist)[:15]

        sq_mean5 = np.mean(dist[:5] ** 2)

        X_test.append(np.concatenate((dist, np.array([center_dist, mean_sq_dist, cnt, sq_mean5]))))

X_test = np.array(X_test)
groups_test = np.array(groups_test)

  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 180/180 [03:18<00:00,  1.10s/it]


In [157]:
# Sanity check: (number of test documents, number of features).
X_test.shape

(16627, 19)

In [158]:
# Sanity check: display the first test feature row.
X_test[0:1]

array([[0.56311887, 0.57032456, 0.5871184 , 0.61859738, 0.652734  ,
        0.66479215, 0.67216785, 0.72397733, 0.7455059 , 0.74803881,
        0.74967039, 0.80758432, 0.81902166, 0.84141607, 0.84660421,
        0.52691717, 0.87988677, 0.        , 0.35916107]])

In [159]:
# Overwrite NaN entries with 1, in place, in both feature matrices.
# NOTE(review): presumably 1 stands in for an undefined cosine distance
# (e.g. a zero vector) -- confirm this is the intended placeholder.
np.putmask(X_train, np.isnan(X_train), 1)
np.putmask(X_test, np.isnan(X_test), 1)

# Persist the feature matrices and the training targets for later cells/notebooks.
np.save('x_train24.npy', X_train)
np.save('y_train24.npy', y_train)
np.save('x_test24.npy', X_test)