In [None]:
import codecs
import glob
import re
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pymorphy2
from bs4 import BeautifulSoup
from sklearn import decomposition, manifold
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.metrics import f1_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

In [None]:
# Shared pymorphy2 analyzer; russian_words() calls morph.parse() to obtain each word's normal form.
morph = pymorphy2.MorphAnalyzer()

# Подготовка данных

In [None]:
# Page preprocessing: we use the title, meta keywords, meta description and link texts,
# keep only Russian words of at least 3 letters and reduce every word to its normal form.
def russian_words(s, min_len=3, analyzer=None):
    """Extract Russian words from ``s`` and return them in normal form.

    Args:
        s: Arbitrary text.
        min_len: Minimum number of letters a word needs in order to be kept.
        analyzer: Object with a pymorphy2-style ``parse(word)`` method whose first
            result exposes ``normal_form``. Defaults to the module-level ``morph``.

    Returns:
        A string in which every kept word is preceded by one space (so a non-empty
        result starts with a space), or '' when no word is kept.
    """
    if analyzer is None:
        analyzer = morph
    # The range А-я does not contain Ё/ё (they lie outside it in Unicode), so they
    # are listed explicitly; otherwise words such as "ещё" would be cut into pieces.
    words = re.findall("[А-Яа-яЁё]+", s)
    return ''.join(
        ' ' + analyzer.parse(w.lower())[0].normal_form
        for w in words
        if len(w) >= min_len
    )

def smart_process_page(page):
    """Parse one saved HTML page into normalised text fields.

    Args:
        page: Path (str) to a UTF-8 encoded HTML file. The document id is the first
            run of digits in the file name; if the file name has no digits, the
            first run of digits anywhere in the path is used.

    Returns:
        dict with keys 'id' (digit string), 'title', 'refs' (non-empty link texts
        joined with ';'), 'desc' and 'keywords'. All text fields are passed
        through russian_words().

    Raises:
        ValueError: if the path contains no digits to use as the id.
    """
    # Look at the file name first so digits in a directory name are not taken for the id.
    file_name = re.split(r'[\\/]', page)[-1]
    ident = re.search(r'[0-9]+', file_name) or re.search(r'[0-9]+', page)
    if ident is None:
        raise ValueError('no numeric document id found in path: %r' % (page,))

    info = {}
    info['id'] = ident.group(0)

    # BeautifulSoup reads the whole file while constructing the tree, so the handle
    # can be released before the fields are extracted.
    with codecs.open(page, 'r', 'utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    title = soup.title.text if soup.title else ''
    info['title'] = russian_words(title)

    link_texts = (russian_words(link.text.strip()) for link in soup.find_all('a'))
    info['refs'] = ';'.join(text for text in link_texts if text != '')

    info['desc'] = ''
    info['keywords'] = ''
    for tag in soup.find_all('meta'):
        meta_name = tag.attrs.get('name')
        # If a page repeats a meta tag, the last occurrence wins.
        if meta_name == 'keywords':
            info['keywords'] = russian_words(tag.attrs.get('content', ''))
        elif meta_name == 'description':
            info['desc'] = russian_words(tag.attrs.get('content', ''))
    return info

In [None]:
# Persist the per-page fields as a tab-separated file so later cells can reload them.
# NOTE(review): `df` is not created by any cell visible above this one — presumably a
# removed cell built it from smart_process_page() results; confirm and restore it so
# that Restart & Run All works.
df.to_csv('normal_form_pages_info.tsv', sep = '\t', index = False)

# Генерация признаков

In [None]:
# Reload the preprocessed page fields from the tab-separated file written earlier.
df = pd.read_csv('normal_form_pages_info.tsv', sep = '\t')

In [None]:
# Replace missing values with empty strings so the later .split() calls on the
# text fields work; rebinding (instead of inplace=True) keeps the cell re-runnable.
df = df.fillna('')

In [None]:
# Group the training documents: group_id -> list of
# (doc_id, title, refs, keywords, desc, target) tuples, in file order.
train_data = pd.read_csv('train_groups.csv')

# Build the id -> text-fields lookup in a single pass over `df` instead of scanning
# the whole frame four times per training row. setdefault keeps the first row for a
# repeated id, which matches taking element [0] of a filtered selection.
page_fields = {}
for page_id, page_title, page_refs, page_keywords, page_desc in zip(
        df.id, df.title, df.refs, df.keywords, df.desc):
    page_fields.setdefault(page_id, (page_title, page_refs, page_keywords, page_desc))

traingroups_data = {}
train_rows = zip(train_data['group_id'], train_data['doc_id'], train_data['target'])
for doc_group, doc_id, target in tqdm(train_rows, total=len(train_data)):
    # A doc_id that is absent from `df` raises KeyError naming the missing id.
    title, refs, keywords, desc = page_fields[doc_id]
    traingroups_data.setdefault(doc_group, []).append(
        (doc_id, title, refs, keywords, desc, target))

In [None]:
# Compact view of the training groups: group_id -> list of [doc_id, target] pairs.
train_data = pd.read_csv('train_groups.csv')
traingroups = {}
for row_idx in range(len(train_data)):
    row = train_data.iloc[row_idx]
    # setdefault creates the list the first time a group id is seen
    traingroups.setdefault(row['group_id'], []).append([row['doc_id'], row['target']])

In [None]:
def jaccar(s1, s2):
    """Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

    Returns 0 when both sets are empty (the union has no elements).
    """
    union_size = len(s1.union(s2))
    if not union_size > 0:
        return 0
    common_size = len(s1.intersection(s2))
    return common_size / union_size

In [None]:
def bag_of_words(s):
    """Split ``s`` on runs of whitespace and return the tokens as a list.

    Leading and trailing whitespace produce no empty tokens.
    """
    tokens = s.split()
    return tokens

In [None]:
%%time
# Build one feature row per training document: for each similarity measure, the
# largest similarities between the document and the other documents of its group.
y_train = []
X_train = []
groups_train = []

# (measure name, number of largest values kept); the order fixes the column layout.
feature_top_k = [('title', 15), ('keywords', 4), ('desc', 4),
                 ('title_jaccar', 15), ('refs_jaccar', 15)]

for new_group in tqdm(traingroups_data):
    docs = traingroups_data[new_group]

    # Tokenise every document of the group once rather than once per pair.
    doc_tokens = []
    for doc_id, title, refs, keywords, desc, target_id in docs:
        # Link texts are ';'-separated; keep link strings and tokens of length >= 3.
        wrefs = {token
                 for ref in refs.split(';') if len(ref) >= 3
                 for token in ref.split() if len(token) >= 3}
        doc_tokens.append((set(bag_of_words(title)),
                           set(bag_of_words(keywords)),
                           set(bag_of_words(desc)),
                           wrefs))

    for k, (doc_id, title, refs, keywords, desc, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        words, wkey, wdesc, wrefs = doc_tokens[k]
        dist = {name: [] for name, _ in feature_top_k}
        for j, (words_j, wkey_j, wdesc_j, wrefs_j) in enumerate(doc_tokens):
            if k == j:
                continue
            dist['title'].append(len(words.intersection(words_j)))
            dist['keywords'].append(len(wkey.intersection(wkey_j)))
            dist['desc'].append(len(wdesc.intersection(wdesc_j)))
            dist['title_jaccar'].append(jaccar(words, words_j))
            dist['refs_jaccar'].append(jaccar(wrefs_j, wrefs))

        features = []
        for name, top_k in feature_top_k:
            best = sorted(dist[name], reverse=True)[:top_k]
            # Pad with zeros so every row has the same width even when the group
            # has fewer than top_k other documents.
            best.extend([0] * (top_k - len(best)))
            features.extend(best)
        X_train.append(features)

X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

# Обучение моделей, подбор параметров

In [None]:
# Feature scaler: fitted on the training features, then reused to transform X_test.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and standardise them in one step.
# NOTE: X_train is overwritten, so re-running this cell refits the scaler on
# already-scaled data; run it once per kernel session.
X_train = scaler.fit_transform(X_train)

In [None]:
# Helper for trying out different models with group-wise cross-validation.
def cross_validation(traingroups_data, X_train, y_train, groups_train, model='logreg', C=0.11,
                     m_depth=6, m_features=4, ridge=False, threshold=0.5, fold_size=13):
    """Evaluate one model configuration on consecutive blocks of group ids.

    Each fold validates on the rows whose group id lies in [i, i + fold_size) and
    trains on all other rows, for i = 1, 1 + fold_size, ...

    Args:
        traingroups_data: Mapping of training groups; only its length is used.
        X_train: 2-D feature array.
        y_train: 1-D array of 0/1 targets, row-aligned with X_train.
        groups_train: 1-D array of numeric group ids, row-aligned with X_train.
        model: 'logreg', 'linreg' or 'rf'.
        C: Passed as ``C`` to LogisticRegression; reused as ``alpha`` for Ridge.
        m_depth: ``max_depth`` of the random forest.
        m_features: ``max_features`` of the random forest.
        ridge: With model='linreg', use Ridge instead of LinearRegression.
        threshold: Cut-off that turns 'linreg' outputs into 0/1 labels.
        fold_size: Number of consecutive group ids per validation fold.

    Returns:
        dict mapping (C, m_depth, m_features, ridge, threshold) to a list with one
        (f1, fbeta_1, fbeta_2) tuple per fold; empty dict if no fold was run.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    if model not in ('logreg', 'linreg', 'rf'):
        raise ValueError("model must be 'logreg', 'linreg' or 'rf', got %r" % (model,))

    def make_model():
        # A fresh estimator for every fold
        if model == 'logreg':
            return LogisticRegression(C = C, solver='lbfgs')
        if model == 'linreg':
            return Ridge(alpha=C) if ridge else LinearRegression()
        return RandomForestClassifier(max_depth=m_depth, max_features = m_features,
                                      n_estimators = 20, random_state=0)

    params_key = (C, m_depth, m_features, ridge, threshold)
    parameters_f1 = {}
    # NOTE(review): fold starts assume group ids begin at 1, and the range stops at
    # len(traingroups_data) - fold_size, so the highest group ids may never be used
    # for validation — confirm this is intended.
    for i in tqdm(range(1, len(traingroups_data) - fold_size, fold_size)):
        val_mask = (groups_train >= i) & (groups_train < i + fold_size)
        X_train_temp, y_train_temp = X_train[~val_mask], y_train[~val_mask]
        X_val, y_val = X_train[val_mask], y_train[val_mask]

        clf = make_model()
        clf.fit(X_train_temp, y_train_temp)
        prediction = clf.predict(X_val)
        if model == 'linreg':
            # Regression output is real-valued; binarise it with the cut-off.
            prediction = np.asarray(prediction >= threshold, dtype = int)

        f1 = f1_score(y_val, prediction)
        # A very small beta weights precision almost exclusively, a very large one recall.
        fbeta_1 = fbeta_score(y_val, prediction, beta = 0.001)
        fbeta_2 = fbeta_score(y_val, prediction, beta = 100)
        parameters_f1.setdefault(params_key, []).append((f1, fbeta_1, fbeta_2))
    return parameters_f1

В качестве финальной модели для предсказания была выбрана линейная регрессия с добавлением признаков "вероятность, предсказанная логрегрессией" и "вероятность, предсказанная случайным лесом".

# Предсказание

In [None]:
# Group the test documents: group_id -> list of
# (doc_id, title, refs, keywords, desc) tuples, in file order.
test_data = pd.read_csv('test_groups.csv')

# Build the id -> text-fields lookup in a single pass over `df` instead of scanning
# the whole frame four times per test row. setdefault keeps the first row for a
# repeated id, which matches taking element [0] of a filtered selection.
page_fields = {}
for page_id, page_title, page_refs, page_keywords, page_desc in zip(
        df.id, df.title, df.refs, df.keywords, df.desc):
    page_fields.setdefault(page_id, (page_title, page_refs, page_keywords, page_desc))

testgroups_data = {}
for doc_group, doc_id in zip(test_data['group_id'], test_data['doc_id']):
    # A doc_id that is absent from `df` raises KeyError naming the missing id.
    title, refs, keywords, desc = page_fields[doc_id]
    testgroups_data.setdefault(doc_group, []).append(
        (doc_id, title, refs, keywords, desc))

In [None]:
# Build one feature row per test document, using exactly the same recipe as for the
# training rows so that the fitted scaler and models see comparable columns.
X_test = []

# (measure name, number of largest values kept); the order fixes the column layout.
feature_top_k = [('title', 15), ('keywords', 4), ('desc', 4),
                 ('title_jaccar', 15), ('refs_jaccar', 15)]

for new_group in tqdm(testgroups_data):
    docs = testgroups_data[new_group]

    # Tokenise every document of the group once rather than once per pair.
    doc_tokens = []
    for doc_id, title, refs, keywords, desc in docs:
        # The length filters must match the training-feature cell (>= 3 for both the
        # link string and its tokens); stricter limits here would make refs_jaccar
        # incomparable between train and test.
        wrefs = {token
                 for ref in refs.split(';') if len(ref) >= 3
                 for token in ref.split() if len(token) >= 3}
        doc_tokens.append((set(bag_of_words(title)),
                           set(bag_of_words(keywords)),
                           set(bag_of_words(desc)),
                           wrefs))

    for k, (doc_id, title, refs, keywords, desc) in enumerate(docs):
        words, wkey, wdesc, wrefs = doc_tokens[k]
        dist = {name: [] for name, _ in feature_top_k}
        for j, (words_j, wkey_j, wdesc_j, wrefs_j) in enumerate(doc_tokens):
            if k == j:
                continue
            dist['title'].append(len(words.intersection(words_j)))
            dist['keywords'].append(len(wkey.intersection(wkey_j)))
            dist['desc'].append(len(wdesc.intersection(wdesc_j)))
            dist['title_jaccar'].append(jaccar(words, words_j))
            dist['refs_jaccar'].append(jaccar(wrefs_j, wrefs))

        features = []
        for name, top_k in feature_top_k:
            best = sorted(dist[name], reverse=True)[:top_k]
            # Pad with zeros so every row has the same width even when the group
            # has fewer than top_k other documents.
            best.extend([0] * (top_k - len(best)))
            features.extend(best)
        X_test.append(features)

X_test = np.array(X_test)

print (X_test.shape)

In [None]:
# Apply the scaler fitted on the training features (transform only, no refit).
# NOTE: X_test is overwritten, so running this cell twice scales the data twice.
X_test = scaler.transform(X_test)

In [None]:
# First-level models fitted on the full training matrix. Their positive-class
# probabilities for the test rows are standardised and appended to X_test as two
# extra columns (logistic regression first, random forest second).
logreg = LogisticRegression(C=0.11, solver='lbfgs')
logreg.fit(X_train, y_train)
prediction = logreg.predict_proba(X_test)[:, 1]
prediction = (prediction - prediction.mean()) / prediction.std()

rf = RandomForestClassifier(max_depth=8, max_features=4, n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
rf_prediction = rf.predict_proba(X_test)[:, 1]
rf_prediction = (rf_prediction - rf_prediction.mean()) / rf_prediction.std()

# Both probability vectors were computed before X_test is widened.
X_test = np.hstack([X_test, prediction.reshape(-1, 1), rf_prediction.reshape(-1, 1)])

In [None]:
# Two-fold out-of-fold logistic-regression probabilities for the training rows:
# each half is scored by a model fitted on the other half.
n_first_half = int(X_train.shape[0] / 2)
x1, x2 = X_train[:n_first_half, :], X_train[n_first_half:, :]
y1, y2 = y_train[:n_first_half], y_train[n_first_half:]

logreg = LogisticRegression(C=0.11, solver='lbfgs')
pred1 = logreg.fit(x1, y1).predict_proba(x2)[:, 1]  # scores for the second half
pred2 = logreg.fit(x2, y2).predict_proba(x1)[:, 1]  # scores for the first half

# Restore the original row order (first half, then second half) and standardise.
pred = np.concatenate([pred2, pred1])
pred = (pred - np.mean(pred)) / np.std(pred)

In [None]:
# X_test carries two stacked columns (logistic-regression and random-forest
# probabilities), so X_train needs the same two columns in the same order.
# The random-forest column is produced out-of-fold on the same halves (x1/y1, x2/y2)
# as the logistic-regression column `pred`, with the forest settings used for X_test.
rf_oof = RandomForestClassifier(max_depth=8, max_features=4, n_estimators=100, random_state=0)
rf_pred1 = rf_oof.fit(x1, y1).predict_proba(x2)[:, 1]  # scores for the second half
rf_pred2 = rf_oof.fit(x2, y2).predict_proba(x1)[:, 1]  # scores for the first half
rf_pred = np.concatenate([rf_pred2, rf_pred1])
rf_pred = (rf_pred - np.mean(rf_pred)) / np.std(rf_pred)

X_train = np.hstack([X_train, pred.reshape(-1, 1), rf_pred.reshape(-1, 1)])
# Guards against running this cell twice or out of order.
assert X_train.shape[1] == X_test.shape[1], (
    'X_train and X_test must have the same number of columns: %d vs %d'
    % (X_train.shape[1], X_test.shape[1]))

In [None]:
# Final model: ordinary least-squares regression; its real-valued output is thresholded later.
clf = LinearRegression()

In [None]:
# Fit on the training matrix; X_test must have the same number of columns when predicting.
clf.fit(X_train, y_train)

In [None]:
# Real-valued scores from the final model, then 0/1 labels: 1 where the score is at
# least 0.375, 0 otherwise.
finalpred = clf.predict(X_test)
finalpred_t = (finalpred >= 0.375).astype(int)

In [None]:
# Submission table: one row per test pair with its predicted 0/1 target.
# Relies on finalpred_t being in the same row order as test_data.
mypred = pd.DataFrame(data = {'pair_id':test_data.pair_id, 'target': finalpred_t})

In [None]:
# Write the submission without the index column. The extension is spelled in plain
# ASCII ('.csv'); a Cyrillic look-alike letter here would produce a file that tools
# and upload forms do not recognise as CSV.
mypred.to_csv('linear_rf_log__comb_submission.csv', index = False)