In [32]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from multiprocessing import Pool
from tqdm.notebook import tqdm
import glob
import codecs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.preprocessing import StandardScaler
import re
from sklearn import decomposition, manifold
from sklearn.cluster import AgglomerativeClustering, DBSCAN

In [2]:
import pymorphy2

In [3]:
morph = pymorphy2.MorphAnalyzer()

# Подготовка данных

In [36]:
path = 'content.tar/content/'
filename = '1115.dat'
desc = ''
keywords = ''
with codecs.open(path + filename, 'r', 'utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')
    title = soup.title.text
    meta = soup.find_all('meta')
    ref = [i.text.strip() for i in soup.find_all('a')]
    for tag in meta:
        if 'name' in tag.attrs:
            if tag.attrs['name'] == 'keywords':
                keywords = tag.attrs['content'] if 'content' in tag.attrs else ''
            if tag.attrs['name'] == 'description':
                desc = tag.attrs['content'] if 'content' in tag.attrs else ''
    text = soup.get_text()
    print('title:{0}\nkeywords:{1}\ndesc:{2}\nref:{3}'.format(title, keywords, desc, ref))

title:Пятиэтажки сносимых серий — Комплекс градостроительной политики и строительства города Москвы
keywords:снос пятиэтажек
desc:
ref:['', '', '', 'Комплекс градостроительной политики и строительства города Москвы', '', '', '', '', '', '✕', 'Метро', 'Дороги', 'Новая Москва', 'МЦК', 'Карта строек', 'Округа', 'Госпрограммы', 'О Стройкомплексе', 'Жителям', 'Градостроителям', 'Застройщикам', 'СМИ', 'Контакт-центр', 'Деятельность', 'Структура', 'Справочник организаций', 'Контакты', 'Законы, постановления, распоряжения, указы', 'Проекты правовых нормативных актов', 'Решения об утверждении проектной документации', 'Государственные программы', 'Строительство в округах, районах', 'Архитектурные конкурсы', 'Все стройки Москвы', 'Карты развития дорожно-транспортной инфраструктуры', 'Строительство жилья', 'Снос пятиэтажек', 'Строительство поликлиник в Москве', 'Строительство детских садов', 'Строительство школ и БНК', 'Стадионы Москвы', 'Реновация промзон', 'Долевое строительство', 'Уникальная ар

In [37]:
#функция, которая использовалась для извлечения информации на первом этапе
def process_page(page):
    info = {}
    ident = re.search(r'[0-9]+', page)
    info['id'] = ident.group(0)
    with codecs.open(page, 'r', 'utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
        title = soup.title.text if soup.title else ''
        info['title'] = title
        ref = [i.text.strip() for i in soup.find_all('a')]
        info['refs'] = ';'.join(ref)
        meta = soup.find_all('meta')
        info['desc'] = ''
        info['keywords'] = ''
        for tag in meta:
            if 'name' in tag.attrs:
                if tag.attrs['name'] == 'keywords':
                    keywords = tag.attrs['content'] if 'content' in tag.attrs else ''
                    info['keywords'] = keywords
                if tag.attrs['name'] == 'description':
                    desc = tag.attrs['content'] if 'content' in tag.attrs else ''
                    info['desc'] = desc
    return info
        
    

In [139]:
#улучшенная функция, которая использовалась для извлечения информации (с нормализацией текста)
def russian_words(s):
    new_s = ''
    s = re.sub('\s+', ' ', s)
    for w in re.findall("[А-я]+", s):
        if len(w) >= 3:
            new_s = new_s + ' ' + morph.parse(w.lower())[0].normal_form
    #[morph.parse(word)[0].normal_form for word in new_s.split() if len(word) >= 3]
    return new_s

def smart_process_page(page):
    info = {}
    ident = re.search(r'[0-9]+', page)
    info['id'] = ident.group(0)
    with codecs.open(page, 'r', 'utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
        title = soup.title.text if soup.title else ''
        info['title'] = russian_words(title)
        ref = [russian_words(i.text.strip()) for i in soup.find_all('a')]
        info['refs'] = ';'.join(list(filter(lambda x: x != '', ref)))
        meta = soup.find_all('meta')
        info['desc'] = ''
        info['keywords'] = ''
        for tag in meta:
            if 'name' in tag.attrs:
                if tag.attrs['name'] == 'keywords':
                    keywords = tag.attrs['content'] if 'content' in tag.attrs else ''
                    info['keywords'] = russian_words(keywords)
                if tag.attrs['name'] == 'description':
                    desc = tag.attrs['content'] if 'content' in tag.attrs else ''
                    info['desc'] = russian_words(desc)
    return info


In [38]:
result = list(map(process_page, tqdm(glob.glob('content.tar/content/*'))))
df = pd.DataFrame(result)

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




In [88]:
result = list(map(smart_process_page, tqdm(glob.glob('content.tar/content/*'))))
df = pd.DataFrame(result)

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




In [140]:
result = list(map(smart_process_page, tqdm(glob.glob('content.tar/content/*'))))
df = pd.DataFrame(result)

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




In [141]:
df.head()

Unnamed: 0,id,title,refs,desc,keywords
0,1,аншина центр репродукция генетик фертимед москва,скачать; аншина центр репродукция генетик фер...,тип реферат размер резюме статья систематизир...,аншина центр репродукция генетик фертимед мос...
1,10,нужный помощь доска объявление заработать ден...,главный; регистрация; вход; главный страница;...,,
2,100,курс валюта бобруйск хороший курс сегодня кур...,белорусский магазин дефицит видеокарта майнер...,курс валюта банка бобруйск обновление каждый ...,курс валюта бобруйск курс валюта курс валюта ...
3,1000,как пользоваться компас леса рекомендация,жизнь; экономика; наука; авто; отдых; хай теч...,если собраться лес обязательно захватить себя...,как пользоваться компас леса как научиться по...
4,10000,как удалить аккаунт,дробный; можно список ниже; войти; войти; пор...,подписываться канал ставить лайк писать позит...,


In [142]:
df.title.values

array([' аншина центр репродукция генетик фертимед москва',
       ' нужный помощь доска объявление заработать деньга денежный помощь кредитный помощь',
       ' курс валюта бобруйск хороший курс сегодня курс конверсия валюта',
       ...,
       ' сосед заливать мой балкон про остекление жилищный коммунальный хозяйство конференция юрклуб',
       ' мамочка санкт петербург группа страна мама',
       ' апноэ остановка сон причина академия вселенная счастие'],
      dtype=object)

In [63]:
df.loc[:,'refs'] = list(map(lambda x: x.encode('utf-8', 'replace').decode('utf-8'), df.loc[:,'refs']))

In [64]:
df.to_csv('pages_info.tsv', sep = '\t', index = False)

In [92]:
df.to_csv('clean_pages_info.tsv', sep = '\t', index = False)

In [143]:
df.to_csv('normal_form_pages_info.tsv', sep = '\t', index = False)

## Генерим признаки

In [4]:
df = pd.read_csv('normal_form_pages_info.tsv', sep = '\t')

In [5]:
df.fillna('', inplace=True)

In [285]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28026 entries, 0 to 28025
Data columns (total 5 columns):
id          28026 non-null int64
title       28026 non-null object
refs        28026 non-null object
desc        28026 non-null object
keywords    28026 non-null object
dtypes: int64(1), object(4)
memory usage: 1.1+ MB


In [147]:
df.head()

Unnamed: 0,id,title,refs,desc,keywords
0,1,аншина центр репродукция генетик фертимед москва,скачать; аншина центр репродукция генетик фер...,тип реферат размер резюме статья систематизир...,аншина центр репродукция генетик фертимед мос...
1,10,нужный помощь доска объявление заработать ден...,главный; регистрация; вход; главный страница;...,,
2,100,курс валюта бобруйск хороший курс сегодня кур...,белорусский магазин дефицит видеокарта майнер...,курс валюта банка бобруйск обновление каждый ...,курс валюта бобруйск курс валюта курс валюта ...
3,1000,как пользоваться компас леса рекомендация,жизнь; экономика; наука; авто; отдых; хай теч...,если собраться лес обязательно захватить себя...,как пользоваться компас леса как научиться по...
4,10000,как удалить аккаунт,дробный; можно список ниже; войти; войти; пор...,подписываться канал ставить лайк писать позит...,


Здесь будут идеи про то, какие признаки использовать.
- Число общих слов в заголовках в группе
- Стандартное отклонение от (1)
- Число общих слов в refs, keywords, desc
- Косинусное расстояние между заголовками, refs, desc (среднее внутри группы)
- Сложить всё в кучу и посчитать Жаккара
- Лучше удалять знаки препинания и прочий мусор (мб регулярные выражения?) done
- И вообще текст лучше нормализовать (чтобы окончания удалялись) done
- является ли страничка выбросом с точки зрения dbscan

Отбор признаков: считать корреляции с таргетом

In [6]:
train_data = pd.read_csv('train_groups.csv')
traingroups_data = {}
for i in tqdm(range(len(train_data))):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = df[df.id == doc_id].title.values
    refs = df[df.id == doc_id].refs.values
    keywords = df[df.id == doc_id].keywords.values
    desc = df[df.id == doc_id].desc.values
    #print(doc_id, title, refs, keywords, desc, target)
    if doc_group not in traingroups_data:
        traingroups_data[doc_group] = []
    traingroups_data[doc_group].append((doc_id, title[0], refs[0], keywords[0], desc[0], target))

HBox(children=(IntProgress(value=0, max=11690), HTML(value='')))




In [7]:
train_data = pd.read_csv('train_groups.csv')
traingroups = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    if doc_group not in traingroups:
        traingroups[doc_group] = []
    traingroups[doc_group].append([doc_id,target])

# Песочница

Далее идут пробы разных идей, можно не смотреть

In [292]:
indices = np.asarray(traingroups[1])[:, 0]

In [293]:
vectorizer = TfidfVectorizer(max_df=0.4)

In [515]:
tfidf_mtx = vectorizer.fit_transform(df[df.id.isin(indices)].title.values)

In [193]:
tfidf_mtx_theme= vectorizer.fit_transform(df[df.id.isin(theme)].title.values)

In [175]:
hvectorizer = HashingVectorizer()
hashing_mtx = hvectorizer.fit_transform(df[df.id.isin(indices)].title.values)

In [509]:
tfidf_mtx

<102x475 sparse matrix of type '<class 'numpy.float64'>'
	with 849 stored elements in Compressed Sparse Row format>

In [465]:
from sklearn import decomposition, manifold
from sklearn.cluster import AgglomerativeClustering, DBSCAN

In [522]:
pca_machine = decomposition.TruncatedSVD(n_components=50)

In [523]:
X_pca = pca_machine.fit_transform(tfidf_mtx)

In [524]:
pca_machine.explained_variance_

array([0.03866454, 0.05067422, 0.04737904, 0.02871258, 0.02603472,
       0.02292828, 0.02280408, 0.0201574 , 0.01756046, 0.01692816,
       0.01665631, 0.01618035, 0.01521724, 0.01462557, 0.0144876 ,
       0.01425048, 0.01374659, 0.01371677, 0.01344916, 0.01318795,
       0.0130209 , 0.01265593, 0.01224448, 0.01175181, 0.01158525,
       0.0114405 , 0.01096649, 0.01079836, 0.01047736, 0.01026153,
       0.01003651, 0.0100155 , 0.00974265, 0.00979137, 0.00962768,
       0.00977693, 0.00956313, 0.00968058, 0.00952478, 0.00941317,
       0.00918494, 0.00903565, 0.00900045, 0.00892914, 0.00861795,
       0.00833066, 0.00805761, 0.00805653, 0.00778934, 0.00770143])

In [463]:
clustering = AgglomerativeClustering(n_clusters=None, affinity='cosine', linkage='complete', distance_threshold=0.85).fit(X_pca)

In [525]:
clustering = DBSCAN(eps=0.8, min_samples=2, metric='cosine').fit(X_pca)

In [526]:
clustering.labels_[same_theme_indices]

array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [535]:
np.asarray(clustering.labels_ != -1, dtype = int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1])

In [361]:
dist_inside = pairwise_distances(X_pca, metric= 'cosine')

In [362]:
np.mean(dist_inside)

0.9325404388768355

In [301]:
np.sort(dist_inside, axis = 1)

array([[0.00000000e+00, 9.63701717e-02, 1.65413213e-01, ...,
        1.03769992e+00, 1.04601676e+00, 1.05050590e+00],
       [0.00000000e+00, 3.34074536e-01, 3.34074536e-01, ...,
        1.02772963e+00, 1.03194306e+00, 1.04141075e+00],
       [0.00000000e+00, 3.71558454e-01, 5.63847098e-01, ...,
        1.05096676e+00, 1.06504298e+00, 1.13861478e+00],
       ...,
       [0.00000000e+00, 9.70880918e-01, 9.77676504e-01, ...,
        1.02818470e+00, 1.03237017e+00, 1.23519531e+00],
       [0.00000000e+00, 2.22044605e-16, 7.13807366e-02, ...,
        1.02040012e+00, 1.02390340e+00, 1.02798458e+00],
       [0.00000000e+00, 4.40214577e-01, 4.99175372e-01, ...,
        1.05561723e+00, 1.07211250e+00, 1.07468742e+00]])

In [382]:
same_theme_indices = np.where(np.asarray(traingroups[1])[:, 1])

In [364]:
same_theme_id = np.asarray(traingroups[1])[same_theme_indices][:,0]

In [383]:
same_theme_indices

(array([28, 34, 38, 41, 63, 82, 84, 98], dtype=int64),)

In [369]:
np.mean(dist_inside[same_theme_indices[0], :][:,same_theme_indices[0]], axis = 1)

array([0.84829034, 0.85253285, 0.86014829, 0.86410612, 0.84967683,
       0.85582527, 0.83632201, 0.85058701])

In [371]:
np.mean(dist_inside[same_theme_indices[0], :], axis = 1)

array([0.86156318, 0.9039839 , 0.97774935, 0.95611611, 0.94305625,
       0.96962838, 0.96552611, 0.97548428])

In [368]:
np.mean(dist_inside, axis = 1)

array([0.86266679, 0.89361704, 0.97247139, 0.89369979, 0.97518434,
       0.96351423, 0.975296  , 0.88739875, 0.87301329, 0.9526289 ,
       0.92737703, 0.97074684, 0.88403513, 0.91203252, 0.96284435,
       0.90198509, 0.86104272, 0.8960462 , 0.93850382, 0.95314068,
       0.93310311, 0.93820364, 0.91269946, 0.90316921, 0.91903961,
       0.9414762 , 0.94810858, 0.91342309, 0.86156318, 0.97305285,
       0.92804509, 0.95139299, 0.86741677, 0.88888789, 0.9039839 ,
       0.97532251, 0.9063953 , 0.9075783 , 0.97774935, 0.98475879,
       0.94871724, 0.95611611, 0.89615101, 0.95332946, 0.85264233,
       0.95360734, 0.89662382, 0.95251703, 0.85899877, 0.93601977,
       0.91435413, 0.95421151, 0.97664041, 0.92008978, 0.95954485,
       0.9803891 , 0.96943491, 0.92301604, 0.85574101, 0.91571631,
       0.9720689 , 0.9240197 , 0.93137361, 0.94305625, 0.93624256,
       0.98309934, 0.9414576 , 0.92972871, 0.93613242, 0.95369368,
       0.9651184 , 0.92375165, 0.88459262, 0.97246227, 0.93607

In [332]:
same_theme_indices

(array([28, 34, 38, 41, 63, 82, 84, 98], dtype=int64),)

In [312]:
df[df.id.isin(same_theme_id)]

Unnamed: 0,id,title,refs,desc,keywords
6110,15498,простой разборка ступица нива ремонт,ваза; войти; авто; бизнес; дом; интернет; мед...,,
6239,15613,снятие установка подшипник ступица нива нива,главный; ваза; ремонт; тюнинг; тест драйв; ха...,подшипник ступица лада амортизатор шаровой оп...,подвеска колесо нива пыльник рычаг амортизато...
6405,15763,замена регулировка передний ступичный подшипн...,перейти содержимое; главный; тот раздел; двиг...,симптом последствие звук выйти строй подшипни...,
6727,16052,как снять ступица ваза нива,дробный; можно список ниже; вадик чер; войти;...,снимать ступица ваза нива,
7329,16595,как поменять подшипник ступица нива шеврол видео,ремонт автомобиль; видео; выбор; клип; хороши...,как поменять подшипник ступица нива шеврол видео,
7577,16818,замена подшипник передний ступица нива,весь подшипник; справочник; использование; ст...,замена подшипник передний ступица нива являть...,
7836,17050,ваза замена подшипник ступица,общий сведение; диагностика неисправность; дв...,,
7975,17176,как заменить подшипник ступица передний колес...,проверка регулировка зазор подшипник ступица;...,как заменить подшипник ступица передний колес...,как заменить подшипник ступица передний колес...


In [None]:
#можно для каждой группы посчитать такую штуку, потом взять n минимальных расстояний 
#только желательно ещё как-нибудь графически проверить, что это работает

In [None]:
#надо как-то снизить размерность в этой матрице, а потом посчитать расстояния между векторами
#внутри пространства меньшей размерности

In [176]:
hashing_mtx

<90x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 588 stored elements in Compressed Sparse Row format>

In [187]:
tfidf_mtx_theme

<9x16 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [176]:
from sklearn.metrics import pairwise_distances
dist_inside = pairwise_distances(tfidf_mtx, metric= 'cosine')
dist_inside

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [238]:
dist_inside[15, 20]

0.994144981916966

In [177]:
dist_inside.mean()

0.9744234883487349

In [179]:
np.where(np.asarray(traingroups[19])[:, 1])

(array([ 4, 15, 20, 37, 84, 86, 89], dtype=int64),)

In [180]:
dist_inside[dist_inside > 0].min()

0.25215142803799906

In [23]:
theme = np.asarray(traingroups[5])[np.where(np.asarray(traingroups[5])[:, 1])][:, 0]

In [37]:
dist_inside.mean(axis=1)

array([0.96666667, 0.96666667, 0.96467697, 0.95193392, 0.92133704,
       0.96666667, 0.96666667, 0.96666667, 0.96276687, 0.96438714,
       0.96666667, 0.95994812, 0.96438702, 0.96666667, 0.96666667,
       0.96252717, 0.96666667, 0.91763767, 0.92497215, 0.96666667,
       0.96666667, 0.95762439, 0.96109081, 0.95310484, 0.96666667,
       0.96276687, 0.96666667, 0.96438702, 0.96666667, 0.95368406])

In [None]:
##End of Песочница

In [567]:
tfidf_mtx.shape

(91, 19770)

# Чистовик

In [33]:
pca_machine = decomposition.TruncatedSVD(n_components=40)

In [34]:
vectorizer = TfidfVectorizer(max_df=0.4)

In [40]:
isoutlier = {}
f1_list = []
for group in tqdm(traingroups):
    indices = np.asarray(traingroups[group])[:, 0]
    tfidf_mtx = vectorizer.fit_transform(df[df.id.isin(indices)].title.values)
    pca_machine = decomposition.TruncatedSVD(n_components=int(np.sqrt(tfidf_mtx.shape[1])))#
    X_pca = pca_machine.fit_transform(tfidf_mtx)
    clustering = DBSCAN(eps=0.8, min_samples=2, metric='cosine').fit(X_pca)
    isout = list(np.asarray(clustering.labels_ != -1, dtype = int))
    f1 = f1_score(np.asarray(traingroups[group])[:, 1], np.asarray(isout))
    f1_list.append(f1)
    ind_list = list(indices)
    for i in range(len(ind_list)):
        isoutlier[(group, ind_list[i])] = isout[i]

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))




In [41]:
np.mean(f1_list)

0.3985839482657143

In [None]:
f1_score(np.asarray(traingroups[1])[:, 1], )

In [None]:
np.asarray(traingroups[group])[:, 0]

In [10]:
def jaccar(s1, s2):
    u = len(s1.union(s2))
    if u > 0:
        return len(s1.intersection(s2))/u
    else:
        return 0

In [8]:
def bag_of_words(s):
    #temp = s.strip(' \t\n!()#*?:;,.').split()
    #return [morph.parse(word)[0].normal_form for word in s.split() if len(word) >= 3]
    #return [i.strip('.,:').lower() for i in temp if len(i) >= 3]
    return s.strip().split()

In [42]:
%%time
y_train = []
X_train = []
groups_train = []
for new_group in tqdm(traingroups_data):
    docs = traingroups_data[new_group]
    for k, (doc_id, title, refs, keywords, desc, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        dist = {'title':[], 'keywords':[], 'desc':[], 'title_jaccar':[], 'refs_jaccar':[]}
        #title_dist = []
        #jaccars = []
        #extra_dist = []
        words = set(bag_of_words(title))
        wdesc = set(bag_of_words(desc))
        wkey = set(bag_of_words(keywords))
        temp = [i.strip().split() for i in refs.split(';') if len(i) >= 3]
        wrefs = set([item.strip() for sublist in temp for item in sublist if len(item.strip()) >= 3])
        #wrefs = set(temp)
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, refs_j, keywords_j, desc_j, target_j = docs[j]
            words_j = set(bag_of_words(title_j))
            dist['title'].append(len(words.intersection(words_j)))
            ###
            wdesc_j = set(bag_of_words(desc_j))
            wkey_j = set(bag_of_words(keywords_j))
            temp = [i.strip().split() for i in refs_j.split(';') if len(i) >= 3]
            #wrefs_j = set(temp)
            wrefs_j = set([item.strip() for sublist in temp for item in sublist if len(item.strip()) >= 3])
            #jaccars.extend([jaccar(wdesc_j, wdesc), jaccar(wrefs_j, wrefs), jaccar(words, words_j)])
            #all_dist.extend([len(wkey.intersection(wkey_j)), len(wdesc.intersection(wdesc_j))])
            dist['keywords'].append(len(wkey.intersection(wkey_j)))
            dist['desc'].append(len(wdesc.intersection(wdesc_j)))
            dist['title_jaccar'].append(jaccar(words, words_j))
            dist['refs_jaccar'].append(jaccar(wrefs_j, wrefs))
        features = []
        for key in ['title', 'keywords', 'desc', 'title_jaccar', 'refs_jaccar']:
            if (key == 'keywords') | (key == 'desc'):
                features.extend(sorted(dist[key], reverse=True)[0:4])
            else:
                features.extend(sorted(dist[key], reverse=True)[0:15])
        features.append(isoutlier[(new_group, doc_id)])
        #features.append(isoutlier_refs[(new_group, doc_id)])
        #features = sorted(all_dist, reverse=True)[0:15]
        #features.extend(sorted(jaccars, reverse = True)[0:10])
        X_train.append(features)
        
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))


(11690, 54) (11690,) (11690,)
Wall time: 10min 27s


In [655]:
X_train

array([[5.        , 4.        , 4.        , ..., 0.12264151, 0.11818182,
        1.        ],
       [6.        , 6.        , 6.        , ..., 0.15454545, 0.15419501,
        1.        ],
       [3.        , 3.        , 3.        , ..., 0.08812261, 0.08695652,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.02479339, 0.02424242,
        1.        ],
       [4.        , 1.        , 1.        , ..., 0.05027933, 0.04977376,
        1.        ],
       [1.        , 1.        , 1.        , ..., 0.04712939, 0.04585153,
        1.        ]])

# Обучение моделей, подбор параметров

In [43]:
scaler = StandardScaler()

In [44]:
dbs = X_train[:, -1]
dbs

array([1., 1., 1., ..., 1., 1., 1.])

In [45]:
np.unique(dbs)

array([1.])

In [659]:
X_train[:,:-1]

array([[5.        , 4.        , 4.        , ..., 0.12301587, 0.12264151,
        0.11818182],
       [6.        , 6.        , 6.        , ..., 0.17236842, 0.15454545,
        0.15419501],
       [3.        , 3.        , 3.        , ..., 0.09236948, 0.08812261,
        0.08695652],
       ...,
       [0.        , 0.        , 0.        , ..., 0.02491103, 0.02479339,
        0.02424242],
       [4.        , 1.        , 1.        , ..., 0.05111821, 0.05027933,
        0.04977376],
       [1.        , 1.        , 1.        , ..., 0.04979253, 0.04712939,
        0.04585153]])

In [46]:
#если добавляем признак от dbscan 
X_train = scaler.fit_transform(X_train[:,:-1])
X_train = np.hstack([X_train, dbs.reshape(-1,1)])

In [13]:
#без dbscan
X_train = scaler.fit_transform(X_train)

In [14]:
X_train

array([[ 1.06940584,  0.91559146,  1.08864629, ...,  0.83060644,
         0.87650367,  0.85084107],
       [ 1.54456183,  1.98490204,  2.21208159, ...,  1.66093309,
         1.42517507,  1.48486643],
       [ 0.11909385,  0.38093618,  0.52692864, ...,  0.31499945,
         0.28286125,  0.30110838],
       ...,
       [-1.30637414, -1.22302969, -1.15822432, ..., -0.81994794,
        -0.8062495 , -0.8029961 ],
       [ 0.59424984, -0.6883744 , -0.59650667, ..., -0.37902807,
        -0.36795239, -0.35350773],
       [-0.83121814, -0.6883744 , -0.59650667, ..., -0.40133183,
        -0.42212378, -0.42255997]])

## Регрессия

In [15]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import f1_score, fbeta_score

In [698]:
parameters_f1 = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    for C in np.linspace(0.001, 0.5, 10):
        #[0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]
        clf = LogisticRegression(C = C, solver='lbfgs')
        clf.fit(X_train_temp, y_train_temp)
        prediction = clf.predict(X_val)
        f1 = f1_score(y_val, prediction)
        fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
        fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
        if C not in parameters_f1:
            parameters_f1[C] = []
        parameters_f1[C].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))








In [25]:
np.linspace(0.001, 0.5, 10)

array([0.001     , 0.05644444, 0.11188889, 0.16733333, 0.22277778,
       0.27822222, 0.33366667, 0.38911111, 0.44455556, 0.5       ])

In [699]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]]})
    temp['C'] = key
    f_df = f_df.append(temp)

In [701]:
f_df.groupby('C').mean()

Unnamed: 0_level_0,f1,f_beta_001,f_beta_100
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.001,0.593599,0.69609,0.525109
0.056444,0.602881,0.690197,0.542585
0.111889,0.603463,0.68818,0.545244
0.167333,0.603383,0.686174,0.546436
0.222778,0.603845,0.686263,0.547173
0.278222,0.60308,0.685879,0.546255
0.333667,0.602848,0.685197,0.54636
0.389111,0.602013,0.684557,0.545332
0.444556,0.602059,0.6837,0.545841
0.5,0.602199,0.683445,0.546402


In [58]:
parameters_f1 = {}
thresholds = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    #for C in np.linspace(0.001, 0.5, 10):
        #[0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]
    clf = LinearRegression()
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict(X_val)
    for threshold in np.linspace(0.2, 0.4, 5):
    #[0.25, 0.3, 0.4, 0.5, 0.6, 0.7]:
        prediction_t = np.asarray(prediction >= threshold, dtype = int)
        f1 = f1_score(y_val, prediction_t)
        fbeta_1 = fbeta_score(y_val, prediction_t, beta = 0.001)
        fbeta_2 = fbeta_score(y_val, prediction_t, beta = 100)
        if threshold not in thresholds:
            thresholds[threshold] = []
        thresholds[threshold].append((f1, fbeta_1, fbeta_2))
    

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [49]:
prediction

array([0.70969072, 0.60329002, 0.33830763, ..., 0.15215779, 0.3859822 ,
       0.50616722])

f_beta001 >> precision Cbest = 0.001
f_bea100 >> recall Cbest = 0.11
f1 Cbest = 0.11

In [57]:
thresholds = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    clf = LogisticRegression(C = 0.11, solver='lbfgs')
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict_proba(X_val)[:, 1]
    for threshold in np.linspace(0.2, 0.4, 5):
    #[0.25, 0.3, 0.4, 0.5, 0.6, 0.7]:
        prediction_t = np.asarray(prediction >= threshold, dtype = int)
        f1 = f1_score(y_val, prediction_t)
        fbeta_1 = fbeta_score(y_val, prediction_t, beta = 0.001)
        fbeta_2 = fbeta_score(y_val, prediction_t, beta = 100)
        if threshold not in thresholds:
            thresholds[threshold] = []
        thresholds[threshold].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))






In [59]:
f_df = pd.DataFrame()
for key in thresholds:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in thresholds[key]], 
                                'f_beta_001': [i[1] for i in thresholds[key]], 
                                'f_beta_100': [i[2] for i in thresholds[key]]})
    temp['t'] = key
    f_df = f_df.append(temp)

In [60]:
f_df.groupby('t').f1.mean()

t
0.20    0.620410
0.25    0.638207
0.30    0.646114
0.35    0.650938
0.40    0.638985
Name: f1, dtype: float64

threshold = 0.35

In [53]:
f_df.groupby('t').f_beta_001.mean()

t
0.30    0.562189
0.35    0.603824
0.40    0.638875
0.45    0.665975
0.50    0.686761
Name: f_beta_001, dtype: float64

In [None]:
f1 t_best = 0.3
f_beta_001 0.7
fbeta_100 0.25 0.3

In [44]:
np.linspace(0.3, 0.5, 5)

array([0.3 , 0.35, 0.4 , 0.45, 0.5 ])

In [212]:
#thresholds = {}
parameters_f1 = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    clf = LogisticRegression(C = 0.11, solver='lbfgs')
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict(X_val)
    #prediction_t = np.asarray(prediction >= 0.4, dtype = int)
    f1 = f1_score(y_val, prediction)
    fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
    fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
    if i not in parameters_f1:
        parameters_f1[i] = []
    parameters_f1[i].append((f1, fbeta_1, fbeta_2))
    #print(f1, fbeta_1, fbeta_2)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))






In [213]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]]})
    f_df = f_df.append(temp)

In [203]:
f_df.describe()

Unnamed: 0,f1,f_beta_001,f_beta_100
count,9.0,9.0,9.0
mean,0.587135,0.697783,0.511549
std,0.095719,0.110881,0.096934
min,0.366102,0.5,0.288782
25%,0.576832,0.638743,0.508033
50%,0.584958,0.664335,0.525871
75%,0.605634,0.796392,0.575856
max,0.700272,0.845394,0.610686


log_reg: 
f1 = 0.536412
f_001 = 0.696774
f_100 = 0.467504    

n_features = 75
0.534247
0.648760
0.486080

модифицированные фичи
0.584958
0.664335
0.525871

In [215]:
np.where(clf.coef_ > 0)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([ 0,  1,  2,  3,  6,  7,  9, 11, 14, 17, 19, 20, 22, 23, 24, 25, 27,
        28, 29, 30, 31, 32, 33, 34, 37, 40, 41, 43, 45, 46, 49, 51],
       dtype=int64))

In [216]:
clf.coef_

array([[ 0.21594231,  0.07217693,  0.06750532,  0.16115087, -0.06121074,
        -0.01484309,  0.00686427,  0.13741699, -0.11407353,  0.03433764,
        -0.28833621,  0.16122478, -0.12715123, -0.63655428,  0.4784934 ,
        -0.24546121, -0.02337509,  0.31265803, -0.36687017,  0.05286761,
         0.05373897, -0.19614837,  0.39603255,  0.05389279,  0.23336239,
         0.05229998, -0.17880639,  0.3953796 ,  0.13572783,  0.20215368,
         0.41303241,  0.32493809,  0.21669494,  0.16054196,  0.065898  ,
        -0.19789605, -0.46777195,  0.28844181, -0.03582818, -0.00794133,
         0.03944739,  0.02435624, -0.30201212,  0.37881475, -0.48120549,
         0.07068799,  0.23552486, -0.16372671, -0.32021719,  0.45777862,
        -0.00820402,  0.35736528, -0.20031004]])

## RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
parameters_f1 = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    for m_depth in range(2, 11):
    #[5, 10, 15, 20, 25, 50, 100]:
        #[0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]
        for m_features in range(2,8):
        #range(1, 10, 2):
            clf = RandomForestClassifier(max_depth=m_depth, max_features = m_features, n_estimators = 20, random_state=0)
            clf.fit(X_train_temp, y_train_temp)
            prediction = clf.predict(X_val)
            f1 = f1_score(y_val, prediction)
            fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
            fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
            if (m_depth, m_features) not in parameters_f1:
                parameters_f1[(m_depth, m_features)] = []
            parameters_f1[(m_depth, m_features)].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [84]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]]})
    temp['max_depth'] = key[0]
    temp['max_features'] = key[1]
    f_df = f_df.append(temp)

In [85]:
f_df.groupby(['max_depth', 'max_features']).f1.mean()

max_depth  max_features
2          2               0.606690
           3               0.608113
           4               0.614854
           5               0.609334
           6               0.612641
           7               0.608581
3          2               0.611555
           3               0.610934
           4               0.612026
           5               0.623253
           6               0.622693
           7               0.626111
4          2               0.616708
           3               0.620500
           4               0.618155
           5               0.636206
           6               0.640560
           7               0.635501
5          2               0.632411
           3               0.631837
           4               0.631695
           5               0.643441
           6               0.638967
           7               0.641540
6          2               0.626132
           3               0.639935
           4               0.636573
    

max_features = 4 
max_depth = 6


In [None]:
#попробуем улучшить качество за счёт комбинации моделей: дадим СЛ вероятность, предсказанную логрегрессией

In [62]:
parameters_f1 = {}
new_groups = []
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    logreg = LogisticRegression(C = 0.11, solver='lbfgs')
    logreg.fit(X_train_temp, y_train_temp)
    prediction = logreg.predict_proba(X_val)[:, 1]
    prediction = (prediction - np.mean(prediction))/np.std(prediction)
    print(prediction.shape, X_train_temp.shape)
    linreg = LinearRegression()
    linreg.fit(X_train_temp, y_train_temp)
    #prediction = linreg.predict(X_val)
    prediction_t = np.asarray(linreg.predict(X_val) >= 0.35, dtype = int)
    X_val = np.hstack([X_val, prediction.reshape(-1,1)])
    X_val = np.hstack([X_val, prediction_t.reshape(-1,1)])
    if i == 1:
        X_new_train = X_val
        y_new_train = y_val
    else:
        X_new_train = np.concatenate([X_new_train, X_val])
        y_new_train = np.concatenate([y_new_train, y_val])
    new_groups.extend(groups_train[np.where((groups_train < i + 13) & (groups_train >= i))])
new_groups = np.asarray(new_groups)
print(X_new_train.shape, X_train.shape, new_groups.shape, y_new_train.shape)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

(1070,) (10620, 53)




(1208,) (10482, 53)




(1248,) (10442, 53)
(1163,) (10527, 53)




(1155,) (10535, 53)




(1219,) (10471, 53)




(1058,) (10632, 53)
(1292,) (10398, 53)




(1239,) (10451, 53)

(10652, 55) (11690, 53) (10652,) (10652,)


In [135]:
max(new_groups)

117

In [68]:
parameters_f1 = {}
#m_depth = 9
#m_features = 9
for i in tqdm(range(1, max(new_groups) - 13, 13)):
    X_train_temp = X_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    y_train_temp = y_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    X_val = X_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    y_val = y_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    for m_depth in range(8, 15):
        for m_features in range(8,15):
            clf = RandomForestClassifier(max_depth=m_depth, max_features = m_features, n_estimators = 50, random_state=0)
            clf.fit(X_train_temp, y_train_temp)
            prediction = clf.predict(X_val)
            f1 = f1_score(y_val, prediction)
            fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
            fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
            if (m_depth, m_features) not in parameters_f1:
                parameters_f1[(m_depth, m_features)] = []
            parameters_f1[(m_depth, m_features)].append((f1, fbeta_1, fbeta_2))
    #if i not in parameters_f1:
        #parameters_f1[i] = []
    #parameters_f1[i].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [69]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]],
                                'depth': key[0], 'features': key[1]})
    f_df = f_df.append(temp)

max_depth = 9
max_features = 9
Для модели с обеими регрессиями: 
max_depth = 10
features = 8

In [71]:
f_df.groupby(['depth', 'features']).f1.mean()

depth  features
8      8           0.643813
       9           0.637187
       10          0.643284
       11          0.641022
       12          0.636477
       13          0.638436
       14          0.637765
9      8           0.643946
       9           0.638547
       10          0.641216
       11          0.637104
       12          0.636180
       13          0.635672
       14          0.627386
10     8           0.649657
       9           0.632968
       10          0.636930
       11          0.638275
       12          0.640627
       13          0.640476
       14          0.636151
11     8           0.648004
       9           0.642812
       10          0.638876
       11          0.632249
       12          0.642055
       13          0.634267
       14          0.641224
12     8           0.642297
       9           0.641385
       10          0.638163
       11          0.634383
       12          0.632304
       13          0.631773
       14          0.639476
13  

RF_plus_logreg: 
f1 = 0.556453 
f_001 = 0.669921 
f_100 = 0.506657

n_features = 75
0.543851
0.650065
0.489387

модифицированные фичи
0.612736
0.691176
0.540276

с нормализацией: 
0.627775	
0.674735	
0.592245

с dbscan:
0.627503	
0.688778	
0.585291


с dbscan, убирая ненужные фичи
0.632101	
0.693050	
0.594130

Кажется, финальное:
0.632405	
0.695853	
0.593312

In [66]:
clf.feature_importances_

array([0.00719565, 0.00690649, 0.0337205 , 0.06424077, 0.02772767,
       0.01398973, 0.0177307 , 0.00576894, 0.00389543, 0.00414922,
       0.00161034, 0.00157841, 0.00198815, 0.00210286, 0.00259267,
       0.00451139, 0.0045441 , 0.00404249, 0.00348913, 0.00705629,
       0.00733547, 0.00627251, 0.00724933, 0.01123955, 0.01620462,
       0.04334572, 0.10315461, 0.06134109, 0.06488339, 0.05021611,
       0.0373135 , 0.02016186, 0.02528253, 0.01671422, 0.0190518 ,
       0.0189992 , 0.01381047, 0.02821467, 0.01395126, 0.01229206,
       0.01145668, 0.01040584, 0.01127642, 0.00956985, 0.01128578,
       0.01050416, 0.00917577, 0.0106528 , 0.0096914 , 0.01120926,
       0.01081618, 0.01109828, 0.01119753, 0.02708185, 0.0387033 ])

In [168]:
np.where(clf.feature_importances_ > 0.005)

(array([ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 14, 24, 25, 26, 27, 28, 29,
        30, 31, 32, 33, 34, 35, 36, 37, 40, 44, 46, 47, 48, 49, 50, 51, 53],
       dtype=int64),)

In [None]:
'title' 0-14
'keywords' 15-18
'desc' 19-22
'title_jaccar' 23-38
'refs_jaccar' 38-52

In [217]:
parameters_f1 = {}
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    clf = RandomForestClassifier(max_depth=6, max_features = 4, n_estimators = 50, random_state=0)
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict(X_val)
    f1 = f1_score(y_val, prediction)
    fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
    fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
    if (m_depth, m_features) not in parameters_f1:
        parameters_f1[(m_depth, m_features)] = []
    parameters_f1[(m_depth, m_features)].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [218]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]]})
    f_df = f_df.append(temp)

In [219]:
f_df.describe()

Unnamed: 0,f1,f_beta_001,f_beta_100
count,9.0,9.0,9.0
mean,0.628841,0.679399,0.590772
std,0.088443,0.096628,0.097371
min,0.455128,0.568,0.379692
25%,0.611268,0.607143,0.55091
50%,0.622449,0.645833,0.586209
75%,0.626996,0.725564,0.668722
max,0.763124,0.849003,0.693685


RF:
f1 = 0.588800
f_001 = 0.624161
f_100 = 0.569663

n_features = 75
0.558036
0.637097
0.538706


0.622449
0.645833
0.586209

In [220]:
np.where(clf.feature_importances_ > 0.01)

(array([ 4,  5,  6,  7, 10, 12, 14, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 36, 37], dtype=int64),)

'title' 0-15
'keywords' 15-30
'desc' 30-45
'title_jaccar' 45-60
'refs_jaccar' 60-75

In [None]:
#попробуем наоборот: в логрегрессию добавить фичу из RF


In [221]:
new_groups = []
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    rf = RandomForestClassifier(max_depth=6, max_features = 4, n_estimators = 50, random_state=0)
    #LogisticRegression(C = 0.11, solver='lbfgs')
    rf.fit(X_train_temp, y_train_temp)
    prediction = rf.predict_proba(X_val)[:, 1]
    prediction = (prediction - np.mean(prediction))/np.std(prediction)
    #print(prediction.shape, X_train_temp.shape)
    X_val = np.hstack([X_val, prediction.reshape(-1,1)])
    if i == 1:
        X_new_train = X_val
        y_new_train = y_val
    else:
        X_new_train = np.concatenate([X_new_train, X_val])
        y_new_train = np.concatenate([y_new_train, y_val])
    new_groups.extend(groups_train[np.where((groups_train < i + 13) & (groups_train >= i))])
new_groups = np.asarray(new_groups)
print(X_new_train.shape, X_train.shape, new_groups.shape, y_new_train.shape)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


(10652, 54) (11690, 53) (10652,) (10652,)


In [157]:
rf.feature_importances_

array([0.02126564, 0.00886717, 0.01657372, 0.01294358, 0.01975668,
       0.01839428, 0.01475759, 0.00622943, 0.0175407 , 0.02316212,
       0.00240566, 0.00227449, 0.00216672, 0.00188952, 0.00211137,
       0.00232909, 0.0012974 , 0.00527419, 0.00230982, 0.00134805,
       0.00443018, 0.00890003, 0.00342641, 0.00788249, 0.00229498,
       0.00851338, 0.00213468, 0.00274811, 0.0073893 , 0.00315202,
       0.03362576, 0.07061407, 0.05947146, 0.07742325, 0.07775566,
       0.06153821, 0.09769653, 0.1095822 , 0.054124  , 0.01746112,
       0.00381174, 0.0059545 , 0.01014695, 0.00602419, 0.00648585,
       0.00980972, 0.01704717, 0.01857936, 0.00951828, 0.01956118])

In [222]:
parameters_f1 = {}
for i in tqdm(range(1, max(new_groups) - 13, 13)):
    X_train_temp = X_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    y_train_temp = y_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    X_val = X_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    y_val = y_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    clf = LogisticRegression(C = 0.11, solver='lbfgs')
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict(X_val)
    f1 = f1_score(y_val, prediction)
    fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
    fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
    if (m_depth, m_features) not in parameters_f1:
        parameters_f1[(m_depth, m_features)] = []
    parameters_f1[(m_depth, m_features)].append((f1, fbeta_1, fbeta_2))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))






In [223]:
f_df = pd.DataFrame()
for key in parameters_f1:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in parameters_f1[key]], 
                                'f_beta_001': [i[1] for i in parameters_f1[key]], 
                                'f_beta_100': [i[2] for i in parameters_f1[key]]})
    f_df = f_df.append(temp)

In [224]:
f_df.describe()

Unnamed: 0,f1,f_beta_001,f_beta_100
count,8.0,8.0,8.0
mean,0.582924,0.710532,0.49833
std,0.084814,0.123239,0.078079
min,0.396104,0.504132,0.326215
25%,0.576107,0.637052,0.47662
50%,0.585508,0.701686,0.53754
75%,0.631601,0.810476,0.545326
max,0.672389,0.873605,0.555353


logreg plus RF:
f1 = 0.557222
f_001 = 0.664319
f_100 = 0.481038  

logreg_plus_RF:
0.585508
0.701686
0.537540

In [None]:
#А теперь в линрегрессию добавим RF и logreg

In [110]:
new_groups = []
for i in tqdm(range(1, len(traingroups_data) - 13, 13)):
    X_train_temp = X_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    y_train_temp = y_train[np.where((groups_train >= i + 13) | (groups_train < i))]
    X_val = X_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    y_val = y_train[np.where((groups_train < i + 13) & (groups_train >= i))]
    rf = RandomForestClassifier(max_depth=8, max_features = 4, n_estimators = 100, random_state=0)
    #LogisticRegression(C = 0.11, solver='lbfgs')
    rf.fit(X_train_temp, y_train_temp)
    prediction = rf.predict_proba(X_val)[:, 1]
    prediction = (prediction - np.mean(prediction))/np.std(prediction)
    #print(prediction.shape, X_train_temp.shape)
    logreg = LogisticRegression(C = 0.11, solver='lbfgs')
    logreg.fit(X_train_temp, y_train_temp)
    lr_prediction = logreg.predict_proba(X_val)[:, 1]
    lr_prediction = (lr_prediction - np.mean(lr_prediction))/np.std(lr_prediction)
    X_val = np.hstack([X_val, prediction.reshape(-1,1)])
    X_val = np.hstack([X_val, lr_prediction.reshape(-1,1)])
    if i == 1:
        X_new_train = X_val
        y_new_train = y_val
    else:
        X_new_train = np.concatenate([X_new_train, X_val])
        y_new_train = np.concatenate([y_new_train, y_val])
    new_groups.extend(groups_train[np.where((groups_train < i + 13) & (groups_train >= i))])
new_groups = np.asarray(new_groups)
print(X_new_train.shape, X_train.shape, new_groups.shape, y_new_train.shape)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




(10652, 55) (11690, 53) (10652,) (10652,)


In [111]:
thresholds = {}
for i in tqdm(range(1, max(new_groups) - 13, 13)):
    X_train_temp = X_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    y_train_temp = y_new_train[np.where((new_groups >= i + 13) | (new_groups < i))]
    X_val = X_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    y_val = y_new_train[np.where((new_groups < i + 13) & (new_groups >= i))]
    clf = LinearRegression()
    clf.fit(X_train_temp, y_train_temp)
    prediction = clf.predict(X_val)
    for threshold in np.linspace(0.2, 0.4, 5):
    #[0.25, 0.3, 0.4, 0.5, 0.6, 0.7]:
        prediction_t = np.asarray(prediction >= threshold, dtype = int)
        f1 = f1_score(y_val, prediction_t)
        fbeta_1 = fbeta_score(y_val, prediction_t, beta = 0.001)
        fbeta_2 = fbeta_score(y_val, prediction_t, beta = 100)
        if threshold not in thresholds:
            thresholds[threshold] = []
        thresholds[threshold].append((f1, fbeta_1, fbeta_2))
    

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [112]:
f_df = pd.DataFrame()
for key in thresholds:
    temp = pd.DataFrame(data = {'f1': [i[0] for i in thresholds[key]], 
                                'f_beta_001': [i[1] for i in thresholds[key]], 
                                'f_beta_100': [i[2] for i in thresholds[key]]})
    temp['t'] = key
    f_df = f_df.append(temp)

In [113]:
f_df.groupby('t').f1.mean()

t
0.20    0.636001
0.25    0.652704
0.30    0.666479
0.35    0.673385
0.40    0.672568
Name: f1, dtype: float64

In [None]:
t = 0.375

In [225]:
clf.coef_

array([[ 0.22790709,  0.10321245,  0.08623716,  0.36014026, -0.2674196 ,
        -0.08509554, -0.04558981,  0.02453484,  0.02690781, -0.00126455,
        -0.24397046,  0.11793768, -0.31272112, -0.56976581,  0.33142466,
        -0.15858302, -0.11953984,  0.22349863, -0.20873341,  0.07636935,
         0.00418748, -0.0961881 ,  0.28118025,  0.02161557,  0.22598039,
         0.04898137, -0.24923682,  0.21212416,  0.18962795,  0.22171734,
         0.53637036,  0.30481721,  0.04398986,  0.10775096,  0.09157603,
        -0.04886054, -0.2913942 ,  0.26113382, -0.0339621 ,  0.01805931,
        -0.00327944, -0.01492795, -0.28974463,  0.64893381, -0.57970492,
         0.00509042,  0.09168318, -0.0816365 , -0.21084195,  0.39186683,
         0.09175215, -0.03366557,  0.0645571 ,  0.32080901]])

# Предсказание

In [None]:
#RandomForest c добавлением фичи "вероятность 1, предсказанная логрегрессией"

In [114]:
test_data = pd.read_csv('test_groups.csv')
testgroups_data = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = df[df.id == doc_id].title.values
    refs = df[df.id == doc_id].refs.values
    keywords = df[df.id == doc_id].keywords.values
    desc = df[df.id == doc_id].desc.values
    if doc_group not in testgroups_data:
        testgroups_data[doc_group] = []
    testgroups_data[doc_group].append((doc_id, title[0], refs[0], keywords[0], desc[0]))

In [707]:
isoutlier_t = {}
f1_list = []
for group in tqdm(list(test_data.group_id.drop_duplicates())):
    indices = np.asarray(test_data[test_data.group_id == group].doc_id)
    tfidf_mtx = vectorizer.fit_transform(df[df.id.isin(indices)].title.values)
    pca_machine = decomposition.TruncatedSVD(n_components=int(np.sqrt(tfidf_mtx.shape[1])))#
    X_pca = pca_machine.fit_transform(tfidf_mtx)
    clustering = DBSCAN(eps=0.9, min_samples=2, metric='cosine').fit(X_pca)
    isout = list(np.asarray(clustering.labels_ != -1, dtype = int))
    ind_list = list(indices)
    for i in range(len(ind_list)):
        isoutlier_t[(group, ind_list[i])] = isout[i]



HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [115]:

X_test = []

for new_group in tqdm(testgroups_data):
    docs = testgroups_data[new_group]
    for k, (doc_id, title, refs, keywords, desc) in enumerate(docs):
        dist = {'title':[], 'keywords':[], 'desc':[], 'title_jaccar':[], 'refs_jaccar':[]}
        words = set(bag_of_words(title))
        wdesc = set(bag_of_words(desc))
        wkey = set(bag_of_words(keywords))
        temp = [i.strip().split() for i in refs.split(';') if len(i) > 4]
        wrefs = set([item.strip() for sublist in temp for item in sublist if len(item.strip()) > 3])
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, refs_j, keywords_j, desc_j = docs[j]
            words_j = set(bag_of_words(title_j))
            dist['title'].append(len(words.intersection(words_j)))
            wdesc_j = set(bag_of_words(desc_j))
            wkey_j = set(bag_of_words(keywords_j))
            temp = [i.strip().split() for i in refs_j.split(';') if len(i) > 4]
            wrefs_j = set([item.strip() for sublist in temp for item in sublist if len(item.strip()) > 3])
            dist['keywords'].append(len(wkey.intersection(wkey_j)))
            dist['desc'].append(len(wdesc.intersection(wdesc_j)))
            dist['title_jaccar'].append(jaccar(words, words_j))
            dist['refs_jaccar'].append(jaccar(wrefs_j, wrefs))
        features = []
        for key in ['title', 'keywords', 'desc', 'title_jaccar', 'refs_jaccar']:
            if (key == 'keywords') | (key == 'desc'):
                features.extend(sorted(dist[key], reverse=True)[0:4])
            else:
                features.extend(sorted(dist[key], reverse=True)[0:15])
        #features.append(isoutlier_t[(new_group, doc_id)])
        X_test.append(features)
        
X_test = np.array(X_test)

print (X_test.shape)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))


(16627, 53)


In [116]:
scaler.mean_

array([2.74935843, 2.28751069, 2.06193328, 1.90872541, 1.79110351,
       1.69332763, 1.60402053, 1.53182207, 1.47031651, 1.41471343,
       1.36338751, 1.31950385, 1.27536356, 1.23977759, 1.20325064,
       2.09384089, 1.56347305, 1.32497861, 1.15697177, 3.52882806,
       2.66834902, 2.27673225, 2.05611634, 0.36641613, 0.28867971,
       0.25250163, 0.22689557, 0.20742181, 0.19286504, 0.18067785,
       0.17045521, 0.16236647, 0.15440535, 0.14737262, 0.14154812,
       0.13587097, 0.13099963, 0.12641718, 0.24498966, 0.16365994,
       0.13623564, 0.12020569, 0.11038912, 0.10173871, 0.09620581,
       0.09074958, 0.0872766 , 0.08288553, 0.08018009, 0.07705643,
       0.0736467 , 0.07167489, 0.0698533 ])

In [117]:
#dbs = X_test[:,-1]
X_test = scaler.transform(X_test)
#X_test = np.hstack([X_test, dbs.reshape(-1,1)])

In [118]:
X_test

array([[ 0.11909385,  0.38093618,  0.52692864, ...,  0.30919608,
         0.33078213,  0.3706948 ],
       [ 0.11909385,  0.38093618,  0.52692864, ...,  0.67591061,
         0.71760732,  0.76037873],
       [ 1.06940584,  0.38093618,  0.52692864, ...,  0.39437527,
         0.39553911,  0.42071003],
       ...,
       [ 0.11909385,  0.38093618,  0.52692864, ...,  0.20302988,
         0.21295928,  0.14942923],
       [ 0.59424984,  0.91559146,  1.08864629, ..., -0.04678112,
        -0.01390856,  0.01783139],
       [ 3.44518581,  4.12352319,  4.45895219, ...,  0.13125161,
         0.15907721,  0.17302501]])

In [119]:
logreg = LogisticRegression(C = 0.11, solver='lbfgs')
logreg.fit(X_train, y_train)
prediction = logreg.predict_proba(X_test)[:, 1]
prediction = (prediction - np.mean(prediction))/np.std(prediction)
rf = RandomForestClassifier(max_depth=8, max_features = 4, n_estimators = 100, random_state=0)
rf.fit(X_train, y_train)
rf_prediction = rf.predict_proba(X_test)[:, 1]
rf_prediction = (rf_prediction - np.mean(rf_prediction))/np.std(rf_prediction)
X_test = np.hstack([X_test, prediction.reshape(-1,1)])
X_test = np.hstack([X_test, rf_prediction.reshape(-1,1)])



In [713]:
X_train[:, :-1].shape

(11690, 53)

In [238]:
X_train.shape[0]/2

5845.0

In [241]:
y_train[:int(X_train.shape[0]/2)]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [242]:
a = np.asarray([1,2,3])
a[0:1]

array([1])

In [120]:
logreg = LogisticRegression(C = 0.11, solver='lbfgs')
x1 = X_train[:int(X_train.shape[0]/2), :]
y1 = y_train[:int(X_train.shape[0]/2)]
x2 = X_train[int(X_train.shape[0]/2):, :]
y2 = y_train[int(X_train.shape[0]/2):]
logreg.fit(x1, y1)
pred1 = logreg.predict_proba(x2)[:, 1]
#pred1 = (pred1 - np.mean(pred1))/np.std(pred1)
logreg.fit(x2, y2)
pred2 = logreg.predict_proba(x1)[:, 1]
#pred2 = (pred2 - np.mean(pred2))/np.std(pred2)
pred = np.concatenate([pred2, pred1])
pred = (pred - np.mean(pred))/np.std(pred)

In [121]:
X_train = np.hstack([X_train, pred.reshape(-1,1)])

In [122]:
rf = RandomForestClassifier(max_depth=8, max_features = 4, n_estimators = 100, random_state=0)
x1 = X_train[:int(X_train.shape[0]/2), :]
y1 = y_train[:int(X_train.shape[0]/2)]
x2 = X_train[int(X_train.shape[0]/2):, :]
y2 = y_train[int(X_train.shape[0]/2):]
rf.fit(x1, y1)
pred1 = rf.predict_proba(x2)[:, 1]
#pred1 = (pred1 - np.mean(pred1))/np.std(pred1)
rf.fit(x2, y2)
pred2 = rf.predict_proba(x1)[:, 1]
#pred2 = (pred2 - np.mean(pred2))/np.std(pred2)
pred = np.concatenate([pred2, pred1])
pred = (pred - np.mean(pred))/np.std(pred)

In [123]:
X_train = np.hstack([X_train, pred.reshape(-1,1)])

In [715]:
pred.shape

(11690,)

In [124]:
X_train.shape

(11690, 55)

In [125]:
X_test.shape

(16627, 55)

In [126]:
clf = LinearRegression()


In [720]:
#clf = RandomForestClassifier(max_depth=6, max_features = 4, n_estimators = 50, random_state=0)
#clf = RandomForestClassifier(max_depth=9, max_features = 9, n_estimators = 50, random_state=0)

In [127]:
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [128]:
finalpred = clf.predict(X_test)
finalpred_t = np.asarray(finalpred >= 0.375, dtype = int)


In [129]:
mypred = pd.DataFrame(data = {'pair_id':test_data.pair_id, 'target': finalpred_t})

In [131]:
len(finalpred_t[finalpred_t == 1])

5389

In [132]:
mypred.to_csv('linear_rf_log__comb_submission.сsv', index = False)

In [258]:
#А если просто RandomForest
clf = RandomForestClassifier(max_depth=6, max_features = 4, n_estimators = 50, random_state=0)

In [259]:
clf.fit(X_train[:,:-1], y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [260]:
finalpred = clf.predict(X_test[:,:-1])

In [261]:
mypred = pd.DataFrame(data = {'pair_id':test_data.pair_id, 'target': finalpred})

In [263]:
mypred.to_csv('rf_submission.сsv', index = False)