In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

import re
import os
from functools import reduce

from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool, Lock, Value

# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split

## Получение всех заголовков

In [4]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


### Preprocessing

In [6]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def to_digit(x):
    try:
        return int(x)
    except ValueError:
        pass
    
def remove_urls (vTEXT):
    regex = re.compile(
    # r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    vTEXT = re.sub(regex, '', vTEXT)
    return(vTEXT)

def preprocess_text(document):
        # Remove urls
        # document = re.sub(url_reg, '', document)
        document = remove_urls(document)
        
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[а-яА-Я]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[а-яА-Я]\s+', ' ', str(document))
        
        # Remove 1-2 digits
#         document = re.sub(r'\b[0-9]\b', ' ', str(document))

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', str(document), flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        # print(tokens)
        # tokens = [stemmer.lemmatize(word) for word in tokens]
        # tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens \
                  if (len(word) > 2)|\
                  (type(to_digit(word)) == int)|\
                  (word.lower() in ['cs','go','vc'])]
        # print(tokens)

        preprocessed_text = ' '.join(list(map(lambda x: morph.parse(x)[0].normal_form, tokens )))

        return preprocessed_text

In [7]:
docs_titles = list(doc_to_title.values())
final_corpus = [preprocess_text(sentence) for sentence in docs_titles if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

## Настройка Tf-Idf

In [8]:
Vect = TfidfVectorizer()
Vect.fit(final_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

## Train part

In [9]:
def get_embedding(vec, group_Matrix, thrshold = 0.1):
    return sorted((vec * group_Matrix > thrshold).sum(axis=1), reverse=True)[1:80]

In [10]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [11]:
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, title, _ in docs]).toarray()
    for k, ((doc_id, title, target_id), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        y_train.append(target_id)
        groups_train.append(new_group)
        X_train.append(get_embedding(vec, group_TfIdf_Matrix))
    print(new_group, 'group')

X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)

arr = np.zeros((X_train.shape[0], len(X_train[0])))
for i, row in enumerate(X_train):
    for j, val in enumerate(row):
        arr[i,j]=val
X_train = np.array(arr)

print (X_train.shape, y_train.shape, groups_train.shape)

1 group
2 group
3 group
4 group
5 group
6 group
7 group
8 group
9 group
10 group
11 group
12 group
13 group
14 group
15 group
16 group
17 group
18 group
19 group
20 group
21 group
22 group
23 group
24 group
25 group
26 group
27 group
28 group
29 group
30 group
31 group
32 group
33 group
34 group
35 group
36 group
37 group
38 group
39 group
40 group
41 group
42 group
43 group
44 group
45 group
46 group
47 group
48 group
49 group
50 group
51 group
52 group
53 group
54 group
55 group
56 group
57 group
58 group
59 group
60 group
61 group
62 group
63 group
64 group
65 group
66 group
67 group
68 group
69 group
70 group
71 group
72 group
73 group
74 group
75 group
76 group
77 group
78 group
79 group
80 group
81 group
82 group
83 group
84 group
85 group
86 group
87 group
88 group
89 group
90 group
91 group
92 group
93 group
94 group
95 group
96 group
97 group
98 group
99 group
100 group
101 group
102 group
103 group
104 group
105 group
106 group
107 group
108 group
109 group
110 group
111 grou

## Test part

In [13]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(test_data.shape[0]):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, pair_id, title))

In [14]:
X_test = []
pairs_id = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    group_TfIdf_Matrix = Vect.transform([title for _, _, title in docs]).toarray()
    for k, ((doc_id, pair_id, title), vec) in enumerate(zip(docs, group_TfIdf_Matrix)):
        X_test.append(get_embedding(vec, group_TfIdf_Matrix))
        pairs_id.append(pair_id)
    print(new_group, 'group')
X_test = np.array(X_test)

arr = np.zeros((X_test.shape[0], len(X_test[0])))
for i, row in enumerate(X_test):
    for j, val in enumerate(row):
        arr[i,j]=val
X_test = np.array(arr)

print(X_test.shape)

130 group
131 group
132 group
133 group
134 group
135 group
136 group
137 group
138 group
139 group
140 group
141 group
142 group
143 group
144 group
145 group
146 group
147 group
148 group
149 group
150 group
151 group
152 group
153 group
154 group
155 group
156 group
157 group
158 group
159 group
160 group
161 group
162 group
163 group
164 group
165 group
166 group
167 group
168 group
169 group
170 group
171 group
172 group
173 group
174 group
175 group
176 group
177 group
178 group
179 group
180 group
181 group
182 group
183 group
184 group
185 group
186 group
187 group
188 group
189 group
190 group
191 group
192 group
193 group
194 group
195 group
196 group
197 group
198 group
199 group
200 group
201 group
202 group
203 group
204 group
205 group
206 group
207 group
208 group
209 group
210 group
211 group
212 group
213 group
214 group
215 group
216 group
217 group
218 group
219 group
220 group
221 group
222 group
223 group
224 group
225 group
226 group
227 group
228 group
229 group


## Валидация

In [15]:
from itertools import zip_longest, product

prev_group = 1
prev_index = 0
groups_indices = []
for k, i in zip_longest(range(len(groups_train) + 1),
                        groups_train, fillvalue=-1):
    if prev_group != i:
        groups_indices.append([prev_index, k])
        prev_group = i
        prev_index = k
groups_indices = np.array(groups_indices)

In [16]:
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold

import random

In [17]:
def predict(clf, trsh, X):
    proba = clf.predict_proba(X)
    return np.array(list(map(lambda x: 1 if x[1] > trsh else 0  , proba)))

In [None]:
AdaBoostClassifier()

In [21]:
THRSHS = np.arange(0.05, 1.0, 0.05)
ESTIMATORS = np.array([10,20,50,100,150])
LEARN_RATES = np.array([0.1, 0.2, 0.5, 1.0, 1.1, 1.2, 1.5])
result = np.zeros(len(list(product(ESTIMATORS, LEARN_RATES, THRSHS))))

for i, (estimators, learning_rate, trsh) in enumerate(product(ESTIMATORS, LEARN_RATES, THRSHS)):
    f_scores = []
    kf = KFold(n_splits=6)
    for train_index, val_index in kf.split(groups_indices):
        Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
        Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

        Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
        Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

        clf = AdaBoostClassifier(n_estimators=estimators, learning_rate=learning_rate, random_state=42)
        clf.fit(Train_X, Train_y)
        preds = predict(clf, trsh, Val_X)
        
        f_scores.append(f1_score(Val_y, preds))
        
        print((i, estimators, learning_rate, trsh),' iteration')
        result[i] = np.mean(f_scores)
        
best_index = result.argmax()
BEST_T = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][2]
BEST_EST = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][0]
BEST_LR = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][1]
print(BEST_T, BEST_EST, BEST_LR, result[best_index])

(0, 10, 0.1, 0.05)  iteration
(0, 10, 0.1, 0.05)  iteration
(0, 10, 0.1, 0.05)  iteration
(0, 10, 0.1, 0.05)  iteration
(0, 10, 0.1, 0.05)  iteration
(0, 10, 0.1, 0.05)  iteration
(1, 10, 0.1, 0.1)  iteration
(1, 10, 0.1, 0.1)  iteration
(1, 10, 0.1, 0.1)  iteration
(1, 10, 0.1, 0.1)  iteration
(1, 10, 0.1, 0.1)  iteration
(1, 10, 0.1, 0.1)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(2, 10, 0.1, 0.15000000000000002)  iteration
(3, 10, 0.1, 0.2)  iteration
(3, 10, 0.1, 0.2)  iteration
(3, 10, 0.1, 0.2)  iteration
(3, 10, 0.1, 0.2)  iteration
(3, 10, 0.1, 0.2)  iteration
(3, 10, 0.1, 0.2)  iteration
(4, 10, 0.1, 0.25)  iteration
(4, 10, 0.1, 0.25)  iteration
(4, 10, 0.1, 0.25)  iteration
(4, 10, 0.1, 0.25)  iteration
(4, 10, 0.1, 0.25)  iteration
(4, 10, 0.1, 0.25)  iteration
(5, 10, 0.1, 0.3)  ite

(37, 10, 0.2, 0.9500000000000001)  iteration
(37, 10, 0.2, 0.9500000000000001)  iteration
(37, 10, 0.2, 0.9500000000000001)  iteration
(37, 10, 0.2, 0.9500000000000001)  iteration
(37, 10, 0.2, 0.9500000000000001)  iteration
(37, 10, 0.2, 0.9500000000000001)  iteration
(38, 10, 0.5, 0.05)  iteration
(38, 10, 0.5, 0.05)  iteration
(38, 10, 0.5, 0.05)  iteration
(38, 10, 0.5, 0.05)  iteration
(38, 10, 0.5, 0.05)  iteration
(38, 10, 0.5, 0.05)  iteration
(39, 10, 0.5, 0.1)  iteration
(39, 10, 0.5, 0.1)  iteration
(39, 10, 0.5, 0.1)  iteration
(39, 10, 0.5, 0.1)  iteration
(39, 10, 0.5, 0.1)  iteration
(39, 10, 0.5, 0.1)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(40, 10, 0.5, 0.15000000000000002)  iteration
(41, 10, 0.5, 0.2)  iteration
(41, 10, 0.5, 0.2)  iteration
(41, 10, 0.5, 0.2)  iteratio

(74, 10, 1.0, 0.9000000000000001)  iteration
(74, 10, 1.0, 0.9000000000000001)  iteration
(74, 10, 1.0, 0.9000000000000001)  iteration
(74, 10, 1.0, 0.9000000000000001)  iteration
(74, 10, 1.0, 0.9000000000000001)  iteration
(74, 10, 1.0, 0.9000000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(75, 10, 1.0, 0.9500000000000001)  iteration
(76, 10, 1.1, 0.05)  iteration
(76, 10, 1.1, 0.05)  iteration
(76, 10, 1.1, 0.05)  iteration
(76, 10, 1.1, 0.05)  iteration
(76, 10, 1.1, 0.05)  iteration
(76, 10, 1.1, 0.05)  iteration
(77, 10, 1.1, 0.1)  iteration
(77, 10, 1.1, 0.1)  iteration
(77, 10, 1.1, 0.1)  iteration
(77, 10, 1.1, 0.1)  iteration
(77, 10, 1.1, 0.1)  iteration
(77, 10, 1.1, 0.1)  iteration
(78, 10, 1.1, 0.15000000000000002)  iteration
(78, 10, 1.1, 0.15000000000000002)  iteration
(7

(110, 10, 1.2, 0.8)  iteration
(110, 10, 1.2, 0.8)  iteration
(110, 10, 1.2, 0.8)  iteration
(110, 10, 1.2, 0.8)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(111, 10, 1.2, 0.8500000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(112, 10, 1.2, 0.9000000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(113, 10, 1.2, 0.9500000000000001)  iteration
(114, 10, 1.5, 0.05)  iteration
(114, 10, 1.5, 0

(146, 20, 0.1, 0.7000000000000001)  iteration
(146, 20, 0.1, 0.7000000000000001)  iteration
(146, 20, 0.1, 0.7000000000000001)  iteration
(146, 20, 0.1, 0.7000000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(147, 20, 0.1, 0.7500000000000001)  iteration
(148, 20, 0.1, 0.8)  iteration
(148, 20, 0.1, 0.8)  iteration
(148, 20, 0.1, 0.8)  iteration
(148, 20, 0.1, 0.8)  iteration
(148, 20, 0.1, 0.8)  iteration
(148, 20, 0.1, 0.8)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(149, 20, 0.1, 0.8500000000000001)  iteration
(150, 20, 0.1, 0.9000000000000001)  iteration
(150, 20, 0.1, 0.900000000000000

(182, 20, 0.5, 0.6000000000000001)  iteration
(182, 20, 0.5, 0.6000000000000001)  iteration
(182, 20, 0.5, 0.6000000000000001)  iteration
(182, 20, 0.5, 0.6000000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(183, 20, 0.5, 0.6500000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(184, 20, 0.5, 0.7000000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)  iteration
(185, 20, 0.5, 0.7500000000000001)

(217, 20, 1.1, 0.45)  iteration
(217, 20, 1.1, 0.45)  iteration
(217, 20, 1.1, 0.45)  iteration
(218, 20, 1.1, 0.5)  iteration
(218, 20, 1.1, 0.5)  iteration
(218, 20, 1.1, 0.5)  iteration
(218, 20, 1.1, 0.5)  iteration
(218, 20, 1.1, 0.5)  iteration
(218, 20, 1.1, 0.5)  iteration
(219, 20, 1.1, 0.55)  iteration
(219, 20, 1.1, 0.55)  iteration
(219, 20, 1.1, 0.55)  iteration
(219, 20, 1.1, 0.55)  iteration
(219, 20, 1.1, 0.55)  iteration
(219, 20, 1.1, 0.55)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(220, 20, 1.1, 0.6000000000000001)  iteration
(221, 20, 1.1, 0.6500000000000001)  iteration
(221, 20, 1.1, 0.6500000000000001)  iteration
(221, 20, 1.1, 0.6500000000000001)  iteration
(221, 20, 1.1, 0.6500000000000001)  iteration
(221, 20, 1.1, 0.6500000000000001)  iteration
(221, 20, 1.1, 0.650

(253, 20, 1.5, 0.35000000000000003)  iteration
(253, 20, 1.5, 0.35000000000000003)  iteration
(253, 20, 1.5, 0.35000000000000003)  iteration
(253, 20, 1.5, 0.35000000000000003)  iteration
(253, 20, 1.5, 0.35000000000000003)  iteration
(253, 20, 1.5, 0.35000000000000003)  iteration
(254, 20, 1.5, 0.4)  iteration
(254, 20, 1.5, 0.4)  iteration
(254, 20, 1.5, 0.4)  iteration
(254, 20, 1.5, 0.4)  iteration
(254, 20, 1.5, 0.4)  iteration
(254, 20, 1.5, 0.4)  iteration
(255, 20, 1.5, 0.45)  iteration
(255, 20, 1.5, 0.45)  iteration
(255, 20, 1.5, 0.45)  iteration
(255, 20, 1.5, 0.45)  iteration
(255, 20, 1.5, 0.45)  iteration
(255, 20, 1.5, 0.45)  iteration
(256, 20, 1.5, 0.5)  iteration
(256, 20, 1.5, 0.5)  iteration
(256, 20, 1.5, 0.5)  iteration
(256, 20, 1.5, 0.5)  iteration
(256, 20, 1.5, 0.5)  iteration
(256, 20, 1.5, 0.5)  iteration
(257, 20, 1.5, 0.55)  iteration
(257, 20, 1.5, 0.55)  iteration
(257, 20, 1.5, 0.55)  iteration
(257, 20, 1.5, 0.55)  iteration
(257, 20, 1.5, 0.55)  iter

(288, 50, 0.2, 0.2)  iteration
(288, 50, 0.2, 0.2)  iteration
(288, 50, 0.2, 0.2)  iteration
(288, 50, 0.2, 0.2)  iteration
(288, 50, 0.2, 0.2)  iteration
(288, 50, 0.2, 0.2)  iteration
(289, 50, 0.2, 0.25)  iteration
(289, 50, 0.2, 0.25)  iteration
(289, 50, 0.2, 0.25)  iteration
(289, 50, 0.2, 0.25)  iteration
(289, 50, 0.2, 0.25)  iteration
(289, 50, 0.2, 0.25)  iteration
(290, 50, 0.2, 0.3)  iteration
(290, 50, 0.2, 0.3)  iteration
(290, 50, 0.2, 0.3)  iteration
(290, 50, 0.2, 0.3)  iteration
(290, 50, 0.2, 0.3)  iteration
(290, 50, 0.2, 0.3)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(291, 50, 0.2, 0.35000000000000003)  iteration
(292, 50, 0.2, 0.4)  iteration
(292, 50, 0.2, 0.4)  iteration
(292, 50, 0.2, 0.4)  iteration
(292, 50, 0.2, 0.4)  iteration
(292, 50, 0.2, 0.4)  iteration

(323, 50, 1.0, 0.05)  iteration
(323, 50, 1.0, 0.05)  iteration
(323, 50, 1.0, 0.05)  iteration
(324, 50, 1.0, 0.1)  iteration
(324, 50, 1.0, 0.1)  iteration
(324, 50, 1.0, 0.1)  iteration
(324, 50, 1.0, 0.1)  iteration
(324, 50, 1.0, 0.1)  iteration
(324, 50, 1.0, 0.1)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(325, 50, 1.0, 0.15000000000000002)  iteration
(326, 50, 1.0, 0.2)  iteration
(326, 50, 1.0, 0.2)  iteration
(326, 50, 1.0, 0.2)  iteration
(326, 50, 1.0, 0.2)  iteration
(326, 50, 1.0, 0.2)  iteration
(326, 50, 1.0, 0.2)  iteration
(327, 50, 1.0, 0.25)  iteration
(327, 50, 1.0, 0.25)  iteration
(327, 50, 1.0, 0.25)  iteration
(327, 50, 1.0, 0.25)  iteration
(327, 50, 1.0, 0.25)  iteration
(327, 50, 1.0, 0.25)  iteration
(328, 50, 1.0, 0.3)  iteration
(328, 50, 1.0, 0.3)  iterat

(359, 50, 1.1, 0.9000000000000001)  iteration
(359, 50, 1.1, 0.9000000000000001)  iteration
(359, 50, 1.1, 0.9000000000000001)  iteration
(359, 50, 1.1, 0.9000000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(360, 50, 1.1, 0.9500000000000001)  iteration
(361, 50, 1.2, 0.05)  iteration
(361, 50, 1.2, 0.05)  iteration
(361, 50, 1.2, 0.05)  iteration
(361, 50, 1.2, 0.05)  iteration
(361, 50, 1.2, 0.05)  iteration
(361, 50, 1.2, 0.05)  iteration
(362, 50, 1.2, 0.1)  iteration
(362, 50, 1.2, 0.1)  iteration
(362, 50, 1.2, 0.1)  iteration
(362, 50, 1.2, 0.1)  iteration
(362, 50, 1.2, 0.1)  iteration
(362, 50, 1.2, 0.1)  iteration
(363, 50, 1.2, 0.15000000000000002)  iteration
(363, 50, 1.2, 0.15000000000000002)  iteration
(363, 50, 1.2, 0.15000000000000002)  iteration
(363, 50, 1.2, 0.1500

(395, 50, 1.5, 0.8)  iteration
(395, 50, 1.5, 0.8)  iteration
(395, 50, 1.5, 0.8)  iteration
(395, 50, 1.5, 0.8)  iteration
(395, 50, 1.5, 0.8)  iteration
(395, 50, 1.5, 0.8)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(396, 50, 1.5, 0.8500000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(397, 50, 1.5, 0.9000000000000001)  iteration
(398, 50, 1.5, 0.9500000000000001)  iteration
(398, 50, 1.5, 0.9500000000000001)  iteration
(398, 50, 1.5, 0.9500000000000001)  iteration
(398, 50, 1.5, 0.9500000000000001)  iteration
(398, 50, 1.5, 0.9500000000000001)  iteration
(398, 50, 1.5, 0.950000000000000

(430, 100, 0.2, 0.6500000000000001)  iteration
(430, 100, 0.2, 0.6500000000000001)  iteration
(430, 100, 0.2, 0.6500000000000001)  iteration
(430, 100, 0.2, 0.6500000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(431, 100, 0.2, 0.7000000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(432, 100, 0.2, 0.7500000000000001)  iteration
(433, 100, 0.2, 0.8)  iteration
(433, 100, 0.2, 0.8)  iteration
(433, 100, 0.2, 0.8)  iteration
(433, 100, 0.2, 0.8)  iteration
(433, 100, 0.2, 0.8)  iteration
(433, 100, 0.2, 0.8)  iteration
(434, 100, 0.2, 0.8500000000000001)  iteration
(434, 100

(465, 100, 1.0, 0.5)  iteration
(465, 100, 1.0, 0.5)  iteration
(465, 100, 1.0, 0.5)  iteration
(465, 100, 1.0, 0.5)  iteration
(465, 100, 1.0, 0.5)  iteration
(465, 100, 1.0, 0.5)  iteration
(466, 100, 1.0, 0.55)  iteration
(466, 100, 1.0, 0.55)  iteration
(466, 100, 1.0, 0.55)  iteration
(466, 100, 1.0, 0.55)  iteration
(466, 100, 1.0, 0.55)  iteration
(466, 100, 1.0, 0.55)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(467, 100, 1.0, 0.6000000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(468, 100, 1.0, 0.6500000000000001)  iteration
(469, 100, 1.0, 0.7000000000000001)  iteration

(499, 100, 1.2, 0.3)  iteration
(499, 100, 1.2, 0.3)  iteration
(499, 100, 1.2, 0.3)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(500, 100, 1.2, 0.35000000000000003)  iteration
(501, 100, 1.2, 0.4)  iteration
(501, 100, 1.2, 0.4)  iteration
(501, 100, 1.2, 0.4)  iteration
(501, 100, 1.2, 0.4)  iteration
(501, 100, 1.2, 0.4)  iteration
(501, 100, 1.2, 0.4)  iteration
(502, 100, 1.2, 0.45)  iteration
(502, 100, 1.2, 0.45)  iteration
(502, 100, 1.2, 0.45)  iteration
(502, 100, 1.2, 0.45)  iteration
(502, 100, 1.2, 0.45)  iteration
(502, 100, 1.2, 0.45)  iteration
(503, 100, 1.2, 0.5)  iteration
(503, 100, 1.2, 0.5)  iteration
(503, 100, 1.2, 0.5)  iteration
(503, 100, 1.2, 0.5)  iteration
(503, 100, 1.2, 0.5)  iteration
(503, 100, 1.2, 0.5)  iteration
(504, 100, 1.2, 0.55)  iteration
(

(534, 150, 0.1, 0.15000000000000002)  iteration
(534, 150, 0.1, 0.15000000000000002)  iteration
(534, 150, 0.1, 0.15000000000000002)  iteration
(534, 150, 0.1, 0.15000000000000002)  iteration
(534, 150, 0.1, 0.15000000000000002)  iteration
(534, 150, 0.1, 0.15000000000000002)  iteration
(535, 150, 0.1, 0.2)  iteration
(535, 150, 0.1, 0.2)  iteration
(535, 150, 0.1, 0.2)  iteration
(535, 150, 0.1, 0.2)  iteration
(535, 150, 0.1, 0.2)  iteration
(535, 150, 0.1, 0.2)  iteration
(536, 150, 0.1, 0.25)  iteration
(536, 150, 0.1, 0.25)  iteration
(536, 150, 0.1, 0.25)  iteration
(536, 150, 0.1, 0.25)  iteration
(536, 150, 0.1, 0.25)  iteration
(536, 150, 0.1, 0.25)  iteration
(537, 150, 0.1, 0.3)  iteration
(537, 150, 0.1, 0.3)  iteration
(537, 150, 0.1, 0.3)  iteration
(537, 150, 0.1, 0.3)  iteration
(537, 150, 0.1, 0.3)  iteration
(537, 150, 0.1, 0.3)  iteration
(538, 150, 0.1, 0.35000000000000003)  iteration
(538, 150, 0.1, 0.35000000000000003)  iteration
(538, 150, 0.1, 0.3500000000000000

(568, 150, 0.2, 0.9000000000000001)  iteration
(568, 150, 0.2, 0.9000000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(569, 150, 0.2, 0.9500000000000001)  iteration
(570, 150, 0.5, 0.05)  iteration
(570, 150, 0.5, 0.05)  iteration
(570, 150, 0.5, 0.05)  iteration
(570, 150, 0.5, 0.05)  iteration
(570, 150, 0.5, 0.05)  iteration
(570, 150, 0.5, 0.05)  iteration
(571, 150, 0.5, 0.1)  iteration
(571, 150, 0.5, 0.1)  iteration
(571, 150, 0.5, 0.1)  iteration
(571, 150, 0.5, 0.1)  iteration
(571, 150, 0.5, 0.1)  iteration
(571, 150, 0.5, 0.1)  iteration
(572, 150, 0.5, 0.15000000000000002)  iteration
(572, 150, 0.5, 0.15000000000000002)  iteration
(572, 150, 0.5, 0.15000000000000002)  iteration
(572, 150, 0.5, 0.15000000000000002)  iteration
(572, 150, 0.5, 0.15000000000000002)  iter

(603, 150, 1.0, 0.7500000000000001)  iteration
(603, 150, 1.0, 0.7500000000000001)  iteration
(604, 150, 1.0, 0.8)  iteration
(604, 150, 1.0, 0.8)  iteration
(604, 150, 1.0, 0.8)  iteration
(604, 150, 1.0, 0.8)  iteration
(604, 150, 1.0, 0.8)  iteration
(604, 150, 1.0, 0.8)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(605, 150, 1.0, 0.8500000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(606, 150, 1.0, 0.9000000000000001)  iteration
(607, 150, 1.0, 0.9500000000000001)  iteration
(607, 150, 1.0, 0.9500000000000001)  iteration
(607, 150, 1.0, 0.9500000000000001)  iteration
(607, 150

(639, 150, 1.2, 0.6500000000000001)  iteration
(639, 150, 1.2, 0.6500000000000001)  iteration
(639, 150, 1.2, 0.6500000000000001)  iteration
(639, 150, 1.2, 0.6500000000000001)  iteration
(639, 150, 1.2, 0.6500000000000001)  iteration
(639, 150, 1.2, 0.6500000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(640, 150, 1.2, 0.7000000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(641, 150, 1.2, 0.7500000000000001)  iteration
(642, 150, 1.2, 0.8)  iteration
(642, 150, 1.2, 0.8)  iteration
(642, 150, 1.2, 0.8)  iteration
(642, 150, 1.2, 0.8)  iteration
(642, 150, 1.2, 0.8)  iter

In [24]:
best_index = result.argmax()
BEST_T = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][2]
BEST_EST = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][0]
BEST_LR = list(product(ESTIMATORS, LEARN_RATES, THRSHS))[best_index][1]
print('Best threshold : {}\nBest N estimators : {}\nBest learning rate : {}\nBest F-score : {}'\
      .format(BEST_T, BEST_EST, BEST_LR, result[best_index]))

Best threshold : 0.4
Best N estimators : 10
Best learning rate : 0.2
Best F-score : 0.7184959977662141


In [43]:
Test_Preds = []
f_scores = []
kf = KFold(n_splits=6, shuffle=True)

for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = AdaBoostClassifier(n_estimators=BEST_EST, learning_rate=BEST_LR)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, BEST_T, Val_X)
    Test_Preds.append(predict(clf, BEST_T, X_test))
    f_scores.append(f1_score(Val_y, preds))
Test_Preds = np.array(Test_Preds)

In [44]:
print(np.round(f_scores,3), np.mean(f_scores))

[0.657 0.728 0.693 0.791 0.716 0.772] 0.7263407789464779


In [41]:
weights = f_scores/sum(f_scores)
Predictions = []
for i in Test_Preds.T:
    Predictions.append(int(np.dot(i, weights).round()))

In [42]:
SUBMIT = pd.DataFrame(columns=['pair_id', 'target'])
SUBMIT['pair_id'] = pairs_id
SUBMIT['target'] = Predictions

SUBMIT.to_csv('submit_tfidf_AdaBoost.csv', index=0)