_Fast Text Tutorial_
https://stackabuse.com/python-for-nlp-working-with-facebook-fasttext-library/

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

from sklearn.preprocessing import StandardScaler

import wikipedia
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('russian'))

%matplotlib inline

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /Users/pus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/pus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
en_stop = set(nltk.corpus.stopwords.words('russian'))

## Scraping Wikipedia Articles

To scrape a Wikipedia `page`, we can use the page method from the `wikipedia` module. The name of the page that you want to scrap is passed as a parameter to the `page` method. The method returns`WikipediaPage` object, which you can then use to retrieve the page contents via the `content` attribute, as shown in the above script.

The scraped content from the four Wikipedia pages are then tokenized into sentences using the `sent_tokenize` method. The `sent_tokenize` method returns list of sentences. The sentences for the four pages are tokenized separately. Finally, sentences from the four articles are joined together via the `extend` method.

In [3]:
artificial_intelligence = wikipedia.page("Artificial Intelligence").content
machine_learning = wikipedia.page("Machine Learning").content
deep_learning = wikipedia.page("Deep Learning").content
neural_network = wikipedia.page("Neural Network").content

artificial_intelligence = sent_tokenize(artificial_intelligence)
machine_learning = sent_tokenize(machine_learning)
deep_learning = sent_tokenize(deep_learning)
neural_network = sent_tokenize(neural_network)

artificial_intelligence.extend(machine_learning)
artificial_intelligence.extend(deep_learning)
artificial_intelligence.extend(neural_network)

## Data Preprocessing

In [4]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def to_digit(x):
    try:
        return int(x)
    except ValueError:
        pass
    
def remove_urls (vTEXT):
    regex = re.compile(
    # r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    vTEXT = re.sub(regex, '', vTEXT)
    return(vTEXT)

def preprocess_text(document):
        # Renove urls
        document = re.sub(url_reg, '', document)
        
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[а-яА-Я]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[а-яА-Я]\s+', ' ', str(document))
        
        # Remove 1-2 digits
#         document = re.sub(r'\b[0-9]\b', ' ', str(document))

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', str(document), flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        # print(tokens)
        # tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens \
                  if (len(word) > 2)|\
                  (type(to_digit(word)) == int)|\
                  (word.lower() in ['cs','go','vc'])]
        # print(tokens)
        
        tokens = list(map(lambda x: morph.parse(x)[0].normal_form not in en_stop, tokens ))
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [5]:
docs_titles = list(doc_to_title.values())
final_corpus = [preprocess_text(sentence) for sentence in docs_titles if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

NameError: name 'doc_to_title' is not defined

## Creating Words Representation

- Here `embedding_siz` is the size of the embedding vector. In other words, each word in our corpus will be represented as a 60-dimensional vector. 
- The `window_size` is the size of the number of words occurring before and after the word based on which the word representations will be learned for the word. This might sound tricky, however in the skip-gram model we input a word to the algorithm and the output is the context words. If the window size is 40, for each input there will be 80 outputs: 40 words that occur before the input word and 40 words that occur after the input word. The word embeddings for the input word are learned using these 80 output words.
- The next hyper-parameter is the `min_word`, which specifies the minimum frequency of a word in the corpus for which the word representations will be generated. Finally, the most frequently occurring word will be down-sampled by a number specified by the `down_sampling` attribute.

In [12]:
embedding_size = 60
window_size = 40
min_word = 5
down_sampling = 1e-2

In [13]:
ft_model = FastText(word_tokenized_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

# The sg parameter defines the type of model that we want to create.
# A value of 1 specifies that we want to create skip-gram model.
# Whereas zero specifies the bag of words model, which is the default value as well.

In [14]:
print(ft_model.wv['artificial'])

[-0.010052    0.13214384 -0.4272143  -0.1452809   0.06146226  0.4963034
 -0.556517    0.5758822  -0.06464335  0.10455821  0.22314543  0.14673352
  0.00474362 -0.14722382 -0.041366    0.2610917  -0.13483426 -0.05320083
  0.56306475 -0.02367714  0.02820591 -0.11981182 -0.36316746 -0.32711077
 -0.19725317  0.26250145  0.39921963 -0.08543238  0.24469097 -0.47006655
  0.01140171 -0.22262837  0.03067547 -0.32319245 -0.13651682 -0.11120653
  0.18618856  0.13662049  0.2326443   0.41894165 -0.21702096  0.5213076
  0.1453292   0.34450504  0.3092899  -0.28326142  0.10446052  0.6193946
  0.4019543  -0.07616904  0.525276    0.04189179 -0.19804813 -0.02871146
  0.42058858 -0.2515648   0.37351826  0.15123446  0.15210412  0.15749191]


In [15]:
semantically_similar_words = {words: [item[0] for item in ft_model.wv.most_similar([words], topn=5)]
                  for words in ['artificial', 'intelligence', 'machine', 'network', 'recurrent', 'deep']}

for k,v in semantically_similar_words.items():
    print(k+":"+str(v))

artificial:['intelligence', 'inspired', 'simulate', 'technology', 'simulation']
intelligence:['artificial', 'intelligent', 'simulate', 'simulation', 'simulated']
machine:['ethic', 'learning', 'concerned', 'intelligence', 'previously']
network:['neural', 'neuron', 'feedforward', 'biological', 'recurrent']
recurrent:['hopfield', 'short', 'rnns', 'network', 'deep']
deep:['speech', 'generative', 'convolutional', 'cnns', 'learning']


# Применение к нашей задаче

## Получение всех заголовков

In [4]:
doc_to_title = {}
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


## Preprocessing

In [9]:
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def to_digit(x):
    try:
        return int(x)
    except ValueError:
        pass
    
def remove_urls (vTEXT):
    regex = re.compile(
    # r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    vTEXT = re.sub(regex, '', vTEXT)
    return(vTEXT)

def preprocess_text(document):
        # Renove urls
        document = remove_urls(document)
        
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[а-яА-Я]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[а-яА-Я]\s+', ' ', str(document))
        
        # Remove 1-2 digits
#         document = re.sub(r'\b[0-9]\b', ' ', str(document))

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', str(document), flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        # print(tokens)
        # tokens = [stemmer.lemmatize(word) for word in tokens]
        # tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens \
                  if (len(word) > 2)|\
                  (type(to_digit(word)) == int)|\
                  (word.lower() in ['cs','go','vc'])]
        # print(tokens)
        
        tokens = list(map(lambda x: morph.parse(x)[0].normal_form , tokens ))
        tokens = [word for word in tokens if word not in en_stop]
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [10]:
docs_titles = list(doc_to_title.values())

In [12]:
sent = preprocess_text("Artificial intelligence, is the most advanced technology of the present era")
print(sent)


final_corpus = [preprocess_text(sentence) for sentence in docs_titles if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

artificial intelligence the most advanced technology the present era


In [11]:
embedding_size = 10
window_size = 4
min_word = 2
down_sampling = 1e-3

In [23]:
ft_model = FastText(word_tokenized_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

## Train part

In [25]:
train_data = pd.read_csv('train_groups.csv')
traingoups_fasttext_data = {}
for i in range(train_data.shape[0]):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    tokenized_sent = preprocess_text(doc_to_title[doc_id]).split(' ')
    vect = np.array(sum([ft_model.wv[word] for word in tokenized_sent]))
    sent_embending = vect
    if doc_group not in traingoups_fasttext_data:
        traingoups_fasttext_data[doc_group] = []
    traingoups_fasttext_data[doc_group].append((doc_id, sent_embending, target))

In [101]:
import numpy as np
from scipy.spatial.distance import cosine

y_train = []
X_train = []
groups_train = []
for new_group in traingoups_fasttext_data:
    docs = traingoups_fasttext_data[new_group]
    for k, (doc_id, sent_embeding, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_sims = []
        # words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            _, sent_embeding_j, _ = docs[j]
            # words_j = set(title_j.strip().split())
            all_sims.append(1-cosine(sent_embeding, sent_embeding_j))
        X_train.append(sorted(all_sims, reverse=True)[0:30])

# X_train = np.array(X_train)
a = np.zeros((len(X_train), 30))
for i, row in enumerate(X_train):
    for j, val in enumerate(row):
        a[i,j] = val
X_train = a

y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)



  dist = 1.0 - uv / np.sqrt(uu * vv)


(11690, 30) (11690,) (11690,)


In [149]:
X_train = np.nan_to_num(np.array(X_train))
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

## Test part

In [115]:
test_data = pd.read_csv('test_groups.csv')
testgroups_fasttext_data = {}
for i in range(test_data.shape[0]):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    tokenized_sent = preprocess_text(doc_to_title[doc_id]).split(' ')
    vect = np.array(sum([ft_model.wv[word] for word in tokenized_sent]))
    sent_embending = vect
    if doc_group not in testgroups_fasttext_data:
        testgroups_fasttext_data[doc_group] = []
    testgroups_fasttext_data[doc_group].append((doc_id, sent_embending, pair_id))

In [119]:
X_test = []
pairs_id = []
for new_group in testgroups_fasttext_data:
    docs = testgroups_fasttext_data[new_group]
    for k, (doc_id, embending, pair_id) in enumerate(docs):
        pairs_id.append(pair_id)
        all_sims = []
        for j in range(0, len(docs)):
            if k == j:
                continue
            _, sent_embeding_j, _ = docs[j]
            all_sims.append(1-cosine(sent_embeding, sent_embeding_j))
        X_test.append(sorted(all_sims, reverse=True)[0:30])

a = np.zeros((len(X_test), 30))
for i, row in enumerate(X_test):
    for j, val in enumerate(row):
        a[i,j] = val
X_test = a

pairs_id = np.array(pairs_id)

print (X_test.shape, pairs_id.shape)

  dist = 1.0 - uv / np.sqrt(uu * vv)


(16627, 30) (16627,)


In [150]:
X_test = np.nan_to_num(X_test)
ss = StandardScaler()
X_test = ss.fit_transform(X_test)

***
## Валидация

In [151]:
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold

import random

from functools import reduce

In [152]:
from itertools import zip_longest

prev_group = 1
prev_index = 0
groups_indices = []
for k, i in zip_longest(range(len(groups_train) + 1),
                        groups_train, fillvalue=-1):
    if prev_group != i:
        groups_indices.append([prev_index, k])
        prev_group = i
        prev_index = k
groups_indices = np.array(groups_indices)

In [153]:
def predict(clf, trsh, X):
    proba = clf.predict_proba(X)
    return np.array(list(map(lambda x: 1 if x[1] > trsh else 0  , proba)))

In [154]:
from itertools import product

In [155]:
THRSHS = np.arange(0.05, 1.0, 0.05)
REGUL_C = np.logspace(0, 2.5, 4)

result = np.zeros(len(list(product(REGUL_C, THRSHS))))
for i, (C, trsh) in enumerate(product(REGUL_C, THRSHS)):
    f_scores = []
    kf = KFold(n_splits=4)
    for train_index, val_index in kf.split(groups_indices):
        Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
        Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

        Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
        Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

        clf = LogisticRegression(C=C, solver='lbfgs')
        clf.fit(Train_X, Train_y)
        preds = predict(clf, trsh, Val_X)
        
        f_scores.append(f1_score(Val_y, preds))
        
        print((i, C, trsh),' iteration')
        result[i] = np.mean(f_scores)
        
best_index = result.argmax()
BEST_T = list(product(REGUL_C, THRSHS))[best_index][1]
BEST_REGUL_C = list(product(REGUL_C, THRSHS))[best_index][0]
print(BEST_REGUL_C, BEST_T, result[best_index])



(0, 1.0, 0.05)  iteration
(0, 1.0, 0.05)  iteration
(0, 1.0, 0.05)  iteration




(0, 1.0, 0.05)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(2, 1.0, 0.15000000000000002)  iteration




(2, 1.0, 0.15000000000000002)  iteration
(2, 1.0, 0.15000000000000002)  iteration
(2, 1.0, 0.15000000000000002)  iteration




(3, 1.0, 0.2)  iteration
(3, 1.0, 0.2)  iteration
(3, 1.0, 0.2)  iteration




(3, 1.0, 0.2)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration




(5, 1.0, 0.3)  iteration
(5, 1.0, 0.3)  iteration
(5, 1.0, 0.3)  iteration




(5, 1.0, 0.3)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(7, 1.0, 0.4)  iteration




(7, 1.0, 0.4)  iteration
(7, 1.0, 0.4)  iteration
(7, 1.0, 0.4)  iteration




(8, 1.0, 0.45)  iteration
(8, 1.0, 0.45)  iteration
(8, 1.0, 0.45)  iteration




(8, 1.0, 0.45)  iteration
(9, 1.0, 0.5)  iteration
(9, 1.0, 0.5)  iteration
(9, 1.0, 0.5)  iteration
(9, 1.0, 0.5)  iteration
(10, 1.0, 0.55)  iteration




(10, 1.0, 0.55)  iteration
(10, 1.0, 0.55)  iteration
(10, 1.0, 0.55)  iteration




(11, 1.0, 0.6000000000000001)  iteration
(11, 1.0, 0.6000000000000001)  iteration
(11, 1.0, 0.6000000000000001)  iteration




(11, 1.0, 0.6000000000000001)  iteration
(12, 1.0, 0.6500000000000001)  iteration
(12, 1.0, 0.6500000000000001)  iteration
(12, 1.0, 0.6500000000000001)  iteration
(12, 1.0, 0.6500000000000001)  iteration
(13, 1.0, 0.7000000000000001)  iteration




(13, 1.0, 0.7000000000000001)  iteration
(13, 1.0, 0.7000000000000001)  iteration
(13, 1.0, 0.7000000000000001)  iteration




(14, 1.0, 0.7500000000000001)  iteration
(14, 1.0, 0.7500000000000001)  iteration
(14, 1.0, 0.7500000000000001)  iteration




(14, 1.0, 0.7500000000000001)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration
(16, 1.0, 0.8500000000000001)  iteration




(16, 1.0, 0.8500000000000001)  iteration
(16, 1.0, 0.8500000000000001)  iteration
(16, 1.0, 0.8500000000000001)  iteration




(17, 1.0, 0.9000000000000001)  iteration
(17, 1.0, 0.9000000000000001)  iteration
(17, 1.0, 0.9000000000000001)  iteration


  'precision', 'predicted', average, warn_for)


(17, 1.0, 0.9000000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(19, 6.812920690579613, 0.05)  iteration




(19, 6.812920690579613, 0.05)  iteration
(19, 6.812920690579613, 0.05)  iteration
(19, 6.812920690579613, 0.05)  iteration




(20, 6.812920690579613, 0.1)  iteration
(20, 6.812920690579613, 0.1)  iteration
(20, 6.812920690579613, 0.1)  iteration




(20, 6.812920690579613, 0.1)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration




(21, 6.812920690579613, 0.15000000000000002)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration
(22, 6.812920690579613, 0.2)  iteration
(22, 6.812920690579613, 0.2)  iteration
(22, 6.812920690579613, 0.2)  iteration
(22, 6.812920690579613, 0.2)  iteration




(23, 6.812920690579613, 0.25)  iteration
(23, 6.812920690579613, 0.25)  iteration
(23, 6.812920690579613, 0.25)  iteration




(23, 6.812920690579613, 0.25)  iteration
(24, 6.812920690579613, 0.3)  iteration
(24, 6.812920690579613, 0.3)  iteration




(24, 6.812920690579613, 0.3)  iteration
(24, 6.812920690579613, 0.3)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration




(26, 6.812920690579613, 0.4)  iteration
(26, 6.812920690579613, 0.4)  iteration
(26, 6.812920690579613, 0.4)  iteration




(26, 6.812920690579613, 0.4)  iteration
(27, 6.812920690579613, 0.45)  iteration
(27, 6.812920690579613, 0.45)  iteration




(27, 6.812920690579613, 0.45)  iteration
(27, 6.812920690579613, 0.45)  iteration
(28, 6.812920690579613, 0.5)  iteration
(28, 6.812920690579613, 0.5)  iteration
(28, 6.812920690579613, 0.5)  iteration
(28, 6.812920690579613, 0.5)  iteration




(29, 6.812920690579613, 0.55)  iteration
(29, 6.812920690579613, 0.55)  iteration
(29, 6.812920690579613, 0.55)  iteration




(29, 6.812920690579613, 0.55)  iteration
(30, 6.812920690579613, 0.6000000000000001)  iteration
(30, 6.812920690579613, 0.6000000000000001)  iteration




(30, 6.812920690579613, 0.6000000000000001)  iteration
(30, 6.812920690579613, 0.6000000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration




(32, 6.812920690579613, 0.7000000000000001)  iteration
(32, 6.812920690579613, 0.7000000000000001)  iteration
(32, 6.812920690579613, 0.7000000000000001)  iteration




(32, 6.812920690579613, 0.7000000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration




(33, 6.812920690579613, 0.7500000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration
(34, 6.812920690579613, 0.8)  iteration
(34, 6.812920690579613, 0.8)  iteration
(34, 6.812920690579613, 0.8)  iteration
(34, 6.812920690579613, 0.8)  iteration




(35, 6.812920690579613, 0.8500000000000001)  iteration
(35, 6.812920690579613, 0.8500000000000001)  iteration
(35, 6.812920690579613, 0.8500000000000001)  iteration




(35, 6.812920690579613, 0.8500000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration




(36, 6.812920690579613, 0.9000000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration


  'precision', 'predicted', average, warn_for)


(37, 6.812920690579613, 0.9500000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration




(38, 46.4158883361278, 0.05)  iteration
(38, 46.4158883361278, 0.05)  iteration
(38, 46.4158883361278, 0.05)  iteration




(38, 46.4158883361278, 0.05)  iteration
(39, 46.4158883361278, 0.1)  iteration
(39, 46.4158883361278, 0.1)  iteration
(39, 46.4158883361278, 0.1)  iteration
(39, 46.4158883361278, 0.1)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration




(40, 46.4158883361278, 0.15000000000000002)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration




(41, 46.4158883361278, 0.2)  iteration
(41, 46.4158883361278, 0.2)  iteration
(41, 46.4158883361278, 0.2)  iteration




(41, 46.4158883361278, 0.2)  iteration
(42, 46.4158883361278, 0.25)  iteration
(42, 46.4158883361278, 0.25)  iteration
(42, 46.4158883361278, 0.25)  iteration
(42, 46.4158883361278, 0.25)  iteration
(43, 46.4158883361278, 0.3)  iteration




(43, 46.4158883361278, 0.3)  iteration
(43, 46.4158883361278, 0.3)  iteration
(43, 46.4158883361278, 0.3)  iteration




(44, 46.4158883361278, 0.35000000000000003)  iteration
(44, 46.4158883361278, 0.35000000000000003)  iteration
(44, 46.4158883361278, 0.35000000000000003)  iteration




(44, 46.4158883361278, 0.35000000000000003)  iteration
(45, 46.4158883361278, 0.4)  iteration
(45, 46.4158883361278, 0.4)  iteration
(45, 46.4158883361278, 0.4)  iteration
(45, 46.4158883361278, 0.4)  iteration
(46, 46.4158883361278, 0.45)  iteration




(46, 46.4158883361278, 0.45)  iteration
(46, 46.4158883361278, 0.45)  iteration
(46, 46.4158883361278, 0.45)  iteration




(47, 46.4158883361278, 0.5)  iteration
(47, 46.4158883361278, 0.5)  iteration
(47, 46.4158883361278, 0.5)  iteration




(47, 46.4158883361278, 0.5)  iteration
(48, 46.4158883361278, 0.55)  iteration
(48, 46.4158883361278, 0.55)  iteration
(48, 46.4158883361278, 0.55)  iteration




(48, 46.4158883361278, 0.55)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration




(50, 46.4158883361278, 0.6500000000000001)  iteration
(50, 46.4158883361278, 0.6500000000000001)  iteration
(50, 46.4158883361278, 0.6500000000000001)  iteration




(50, 46.4158883361278, 0.6500000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration




(52, 46.4158883361278, 0.7500000000000001)  iteration
(52, 46.4158883361278, 0.7500000000000001)  iteration
(52, 46.4158883361278, 0.7500000000000001)  iteration
(52, 46.4158883361278, 0.7500000000000001)  iteration




(53, 46.4158883361278, 0.8)  iteration
(53, 46.4158883361278, 0.8)  iteration
(53, 46.4158883361278, 0.8)  iteration




(53, 46.4158883361278, 0.8)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration
(55, 46.4158883361278, 0.9000000000000001)  iteration




(55, 46.4158883361278, 0.9000000000000001)  iteration
(55, 46.4158883361278, 0.9000000000000001)  iteration
(55, 46.4158883361278, 0.9000000000000001)  iteration


  'precision', 'predicted', average, warn_for)


(56, 46.4158883361278, 0.9500000000000001)  iteration
(56, 46.4158883361278, 0.9500000000000001)  iteration
(56, 46.4158883361278, 0.9500000000000001)  iteration




(56, 46.4158883361278, 0.9500000000000001)  iteration
(57, 316.22776601683796, 0.05)  iteration
(57, 316.22776601683796, 0.05)  iteration




(57, 316.22776601683796, 0.05)  iteration
(57, 316.22776601683796, 0.05)  iteration
(58, 316.22776601683796, 0.1)  iteration
(58, 316.22776601683796, 0.1)  iteration
(58, 316.22776601683796, 0.1)  iteration
(58, 316.22776601683796, 0.1)  iteration




(59, 316.22776601683796, 0.15000000000000002)  iteration
(59, 316.22776601683796, 0.15000000000000002)  iteration
(59, 316.22776601683796, 0.15000000000000002)  iteration




(59, 316.22776601683796, 0.15000000000000002)  iteration
(60, 316.22776601683796, 0.2)  iteration
(60, 316.22776601683796, 0.2)  iteration




(60, 316.22776601683796, 0.2)  iteration
(60, 316.22776601683796, 0.2)  iteration
(61, 316.22776601683796, 0.25)  iteration
(61, 316.22776601683796, 0.25)  iteration
(61, 316.22776601683796, 0.25)  iteration
(61, 316.22776601683796, 0.25)  iteration




(62, 316.22776601683796, 0.3)  iteration
(62, 316.22776601683796, 0.3)  iteration
(62, 316.22776601683796, 0.3)  iteration




(62, 316.22776601683796, 0.3)  iteration
(63, 316.22776601683796, 0.35000000000000003)  iteration
(63, 316.22776601683796, 0.35000000000000003)  iteration




(63, 316.22776601683796, 0.35000000000000003)  iteration
(63, 316.22776601683796, 0.35000000000000003)  iteration
(64, 316.22776601683796, 0.4)  iteration
(64, 316.22776601683796, 0.4)  iteration
(64, 316.22776601683796, 0.4)  iteration
(64, 316.22776601683796, 0.4)  iteration




(65, 316.22776601683796, 0.45)  iteration
(65, 316.22776601683796, 0.45)  iteration
(65, 316.22776601683796, 0.45)  iteration




(65, 316.22776601683796, 0.45)  iteration
(66, 316.22776601683796, 0.5)  iteration
(66, 316.22776601683796, 0.5)  iteration




(66, 316.22776601683796, 0.5)  iteration
(66, 316.22776601683796, 0.5)  iteration
(67, 316.22776601683796, 0.55)  iteration
(67, 316.22776601683796, 0.55)  iteration
(67, 316.22776601683796, 0.55)  iteration
(67, 316.22776601683796, 0.55)  iteration




(68, 316.22776601683796, 0.6000000000000001)  iteration
(68, 316.22776601683796, 0.6000000000000001)  iteration
(68, 316.22776601683796, 0.6000000000000001)  iteration




(68, 316.22776601683796, 0.6000000000000001)  iteration
(69, 316.22776601683796, 0.6500000000000001)  iteration
(69, 316.22776601683796, 0.6500000000000001)  iteration




(69, 316.22776601683796, 0.6500000000000001)  iteration
(69, 316.22776601683796, 0.6500000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration




(71, 316.22776601683796, 0.7500000000000001)  iteration
(71, 316.22776601683796, 0.7500000000000001)  iteration
(71, 316.22776601683796, 0.7500000000000001)  iteration




(71, 316.22776601683796, 0.7500000000000001)  iteration
(72, 316.22776601683796, 0.8)  iteration
(72, 316.22776601683796, 0.8)  iteration




(72, 316.22776601683796, 0.8)  iteration
(72, 316.22776601683796, 0.8)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration




(74, 316.22776601683796, 0.9000000000000001)  iteration
(74, 316.22776601683796, 0.9000000000000001)  iteration
(74, 316.22776601683796, 0.9000000000000001)  iteration




(74, 316.22776601683796, 0.9000000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
1.0 0.3 0.6999233964025862


  'precision', 'predicted', average, warn_for)


In [156]:
Test_Preds = []
f_scores = []
kf = KFold(n_splits=5, shuffle=True)

for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = LogisticRegression(C=1*10**0)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, 0.3, Val_X)
    Test_Preds.append(predict(clf, 0.3, X_test))
    f_scores.append(f1_score(Val_y, preds))
Test_Preds = np.array(Test_Preds)



In [157]:
print(np.round(f_scores,3), np.mean(f_scores))

[0.671 0.722 0.722 0.722 0.634] 0.694054418687069


# SUBMIT

In [158]:
weights = f_scores/sum(f_scores)
Predictions = []
for i in Test_Preds.T:
    Predictions.append(int(np.dot(i, weights).round()))

In [159]:
SUBMIT = pd.DataFrame(columns=['pair_id', 'target'])
SUBMIT['pair_id'] = pairs_id
SUBMIT['target'] = Predictions

SUBMIT.to_csv('submit_FastText_LogReg.csv', index=0) #  0.11997 LB score (FUCK)

***

# Train part

In [13]:
docs_titles = list(doc_to_title.values())
final_corpus = [preprocess_text(sentence) for sentence in docs_titles if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

In [33]:
embedding_size = 20
window_size = 1

In [61]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [119]:
import numpy as np
from scipy.spatial.distance import cosine
word_punctuation_tokenizer = nltk.WordPunctTokenizer()

y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    group_titles = [title for _,title,_ in docs]
    group_corpus = [preprocess_text(sentence) for sentence in group_titles if sentence.strip() !='']
    word_tokenized_group_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in group_corpus]
    ft_model = FastText(word_tokenized_group_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=0,
                      iter=50, workers=6)
    print(new_group,' step')
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_sims = []
        # words = set(title.strip().split())
        sent_embeding = np.array(sum([ft_model.wv[word] for word in title.split()]))
        for j, (_, title_j, _) in enumerate(docs):
            if k == j:
                continue
            sent_embeding_j = np.array(sum([ft_model.wv[word] for word in title_j.split()]))
            # words_j = set(title_j.strip().split())
            all_sims.append(1-cosine(sent_embeding, sent_embeding_j))
        X_train.append(sorted(all_sims, reverse=True)[0:30])

# X_train = np.array(X_train)
a = np.zeros((len(X_train), 30))
for i, row in enumerate(X_train):
    for j, val in enumerate(row):
        a[i,j] = val
X_train = a

y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

1  step
2  step
3  step
4  step
5  step
6  step


  dist = 1.0 - uv / np.sqrt(uu * vv)


7  step
8  step
9  step
10  step
11  step
12  step
13  step
14  step
15  step
16  step
17  step
18  step
19  step
20  step
21  step
22  step
23  step
24  step
25  step
26  step
27  step
28  step
29  step
30  step
31  step
32  step
33  step
34  step
35  step
36  step
37  step
38  step
39  step
40  step
41  step
42  step
43  step
44  step
45  step
46  step
47  step
48  step
49  step
50  step
51  step
52  step
53  step
54  step
55  step
56  step
57  step
58  step
59  step
60  step
61  step
62  step
63  step
64  step
65  step
66  step
67  step
68  step
69  step
70  step
71  step
72  step
73  step
74  step
75  step
76  step
77  step
78  step
79  step
80  step
81  step
82  step
83  step
84  step
85  step
86  step
87  step
88  step
89  step
90  step
91  step
92  step
93  step
94  step
95  step
96  step
97  step
98  step
99  step
100  step
101  step
102  step
103  step
104  step
105  step
106  step
107  step
108  step
109  step
110  step
111  step
112  step
113  step
114  step
115  step
116  s

In [106]:
ss = StandardScaler()
X_train = np.nan_to_num(X_train)
X_train = ss.fit_transform(X_train)

In [132]:
X_train = np.nan_to_num(X_train)

## Test part 

In [53]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(test_data.shape[0]):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    title = preprocess_text(doc_to_title[doc_id])
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title, pair_id))

In [172]:
word_punctuation_tokenizer = nltk.WordPunctTokenizer()

X_test = []
groups_test = []
pairs_id = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    group_titles = [title for _, title, _ in docs]
    group_corpus = [preprocess_text(sentence) for sentence in group_titles if sentence.strip() !='']
    word_tokenized_group_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in group_corpus]
    ft_model = FastText(word_tokenized_group_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=0,
                      iter=50, workers=10)
    print(new_group,' step')
    for k, (doc_id, title, pair_id) in enumerate(docs):
        groups_test.append(new_group)
        all_sims = []
        # words = set(title.strip().split())
        sent_embeding = np.array(sum([ft_model.wv[word] for word in title.split()]))
        for j, (_, title_j, _) in enumerate(docs):
            if k == j:
                continue
            sent_embeding_j = np.array(sum([ft_model.wv[word] for word in title_j.split()]))
            # words_j = set(title_j.strip().split())
            all_sims.append(1-cosine(sent_embeding, sent_embeding_j))
        X_test.append(sorted(all_sims, reverse=True)[0:30])
        pairs_id.append(pair_id)

# X_train = np.array(X_train)
a = np.zeros((len(X_test), 30))
for i, row in enumerate(X_test):
    for j, val in enumerate(row):
        a[i,j] = val
X_test = a

groups_test = np.array(groups_test)
print (X_test.shape, groups_test.shape)

130  step
131  step
132  step
133  step
134  step
135  step
136  step


  dist = 1.0 - uv / np.sqrt(uu * vv)


137  step
138  step
139  step
140  step
141  step
142  step
143  step
144  step
145  step
146  step
147  step
148  step
149  step
150  step
151  step
152  step
153  step
154  step
155  step
156  step
157  step
158  step
159  step
160  step
161  step
162  step
163  step
164  step
165  step
166  step
167  step
168  step
169  step
170  step
171  step
172  step
173  step
174  step
175  step
176  step
177  step
178  step
179  step
180  step
181  step
182  step
183  step
184  step
185  step
186  step
187  step
188  step
189  step
190  step
191  step
192  step
193  step
194  step
195  step
196  step
197  step
198  step
199  step
200  step
201  step
202  step
203  step
204  step
205  step
206  step
207  step
208  step
209  step
210  step
211  step
212  step
213  step
214  step
215  step
216  step
217  step
218  step
219  step
220  step
221  step
222  step
223  step
224  step
225  step
226  step
227  step
228  step
229  step
230  step
231  step
232  step
233  step
234  step
235  step
236  step


In [173]:
X_test = np.nan_to_num(X_test)

***
## Валидация

In [202]:
group_titles = [title for _,title,_ in traingroups_titledata[1]]

In [204]:
group_corpus = [preprocess_text(sentence) for sentence in group_titles if sentence.strip() !='']
word_tokenized_group_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in group_corpus]
ft_model = FastText(word_tokenized_group_corpus,
                  size=embedding_size,
                  window=window_size,
                  min_count=min_word,
                  sample=down_sampling,
                  sg=0,
                  iter=100)

In [210]:
group_vectors = []
for sent in word_tokenized_group_corpus:
    group_vectors.append(np.array(sum([ft_model.wv[word] for word in sent])))

In [227]:
import codecs

In [233]:
print(codecs.decode(codecs.encode('ваза 21213 замена подшипник ступица нива')))

ваза 21213 замена подшипник ступица нива


In [226]:
group_corpus

['ваза 21213 замена подшипник ступица нива',
 'ваза 2107 оптом сочи сравнить цена купить потребительский товар',
 'купить ступица лада калина2 трансмиссия переходный ступица цена замена тюнинг',
 'классика 21010 21074',
 'ступица нива замена подшипник свой рука',
 'ваза 2110',
 'обзор подшипник полуось ваза 2101 07 2121 2123',
 'купить подшипник ступица fag страница 23',
 'horsepowers автомобильный интернет портал отзыв владелец ваза 2121 нива 2007 год',
 'новость сообщение официальный группа вконтакте торговый компания 33 sport магазин тольятти',
 'инструкция замена подшипник передний ступица ивеко дейли',
 'ступица olx страница 80',
 'маааленький проблёмкий бортжурнал автокам 2160 1994 год drive2',
 'разгрузить полуось нива 24 шлиц 765',
 'прошивка нива 9 7 скачать файлообменник',
 'страница 6 раздел каталог подвеска',
 'продать нива 2121',
 'рекомендация проведение независимый экспертиза осаго',
 'втулка подшипник омск сравнить цена поставщик промышленный товар',
 'шеврол блейзер ел

In [244]:
len(group_corpus)

102

In [245]:
y_train[:102]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [247]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=3)

pca_group_vectors = pd.DataFrame(pca_model.fit_transform(np.array(group_vectors)))
pca_group_vectors['sentence'] = list(map(lambda x: codecs.decode(codecs.encode(x)),  group_corpus))
pca_group_vectors['target'] = y_train[:102]
pca_group_vectors.to_csv('pca_group_vectors.csv', index=0, encoding='utf-8')


In [133]:
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold

import random

from functools import reduce

In [134]:
from itertools import zip_longest

prev_group = 1
prev_index = 0
groups_indices = []
for k, i in zip_longest(range(len(groups_train) + 1),
                        groups_train, fillvalue=-1):
    if prev_group != i:
        groups_indices.append([prev_index, k])
        prev_group = i
        prev_index = k
groups_indices = np.array(groups_indices)

In [135]:
def predict(clf, trsh, X):
    proba = clf.predict_proba(X)
    return np.array(list(map(lambda x: 1 if x[1] > trsh else 0  , proba)))

In [136]:
from itertools import product

In [137]:
THRSHS = np.arange(0.05, 1.0, 0.05)
REGUL_C = np.logspace(0, 2.5, 4)

result = np.zeros(len(list(product(REGUL_C, THRSHS))))
for i, (C, trsh) in enumerate(product(REGUL_C, THRSHS)):
    f_scores = []
    kf = KFold(n_splits=4)
    for train_index, val_index in kf.split(groups_indices):
        Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
        Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

        Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
        Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

        clf = LogisticRegression(C=C, solver='lbfgs')
        clf.fit(Train_X, Train_y)
        preds = predict(clf, trsh, Val_X)
        
        f_scores.append(f1_score(Val_y, preds))
        
        print((i, C, trsh),' iteration')
        result[i] = np.mean(f_scores)
        
best_index = result.argmax()
BEST_T = list(product(REGUL_C, THRSHS))[best_index][1]
BEST_REGUL_C = list(product(REGUL_C, THRSHS))[best_index][0]
print(BEST_REGUL_C, BEST_T, result[best_index])

(0, 1.0, 0.05)  iteration
(0, 1.0, 0.05)  iteration
(0, 1.0, 0.05)  iteration
(0, 1.0, 0.05)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(1, 1.0, 0.1)  iteration
(2, 1.0, 0.15000000000000002)  iteration
(2, 1.0, 0.15000000000000002)  iteration
(2, 1.0, 0.15000000000000002)  iteration
(2, 1.0, 0.15000000000000002)  iteration
(3, 1.0, 0.2)  iteration
(3, 1.0, 0.2)  iteration
(3, 1.0, 0.2)  iteration
(3, 1.0, 0.2)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration
(4, 1.0, 0.25)  iteration
(5, 1.0, 0.3)  iteration
(5, 1.0, 0.3)  iteration
(5, 1.0, 0.3)  iteration
(5, 1.0, 0.3)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(6, 1.0, 0.35000000000000003)  iteration
(7, 1.0, 0.4)  iteration
(7, 1.0, 0.4)  iteration
(7, 1.0, 0.4)  iteration
(7, 1.0, 0.4)  iteration
(8, 1.0, 0.45)  iteration
(8, 1.0, 0.45)  iteration
(8, 1.0, 0.4

  'precision', 'predicted', average, warn_for)


(14, 1.0, 0.7500000000000001)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration
(15, 1.0, 0.8)  iteration


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(16, 1.0, 0.8500000000000001)  iteration
(16, 1.0, 0.8500000000000001)  iteration
(16, 1.0, 0.8500000000000001)  iteration


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(16, 1.0, 0.8500000000000001)  iteration
(17, 1.0, 0.9000000000000001)  iteration
(17, 1.0, 0.9000000000000001)  iteration


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(17, 1.0, 0.9000000000000001)  iteration
(17, 1.0, 0.9000000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(18, 1.0, 0.9500000000000001)  iteration
(18, 1.0, 0.9500000000000001)  iteration
(19, 6.812920690579613, 0.05)  iteration




(19, 6.812920690579613, 0.05)  iteration
(19, 6.812920690579613, 0.05)  iteration




(19, 6.812920690579613, 0.05)  iteration
(20, 6.812920690579613, 0.1)  iteration
(20, 6.812920690579613, 0.1)  iteration




(20, 6.812920690579613, 0.1)  iteration
(20, 6.812920690579613, 0.1)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration




(21, 6.812920690579613, 0.15000000000000002)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration
(21, 6.812920690579613, 0.15000000000000002)  iteration




(22, 6.812920690579613, 0.2)  iteration
(22, 6.812920690579613, 0.2)  iteration
(22, 6.812920690579613, 0.2)  iteration




(22, 6.812920690579613, 0.2)  iteration
(23, 6.812920690579613, 0.25)  iteration
(23, 6.812920690579613, 0.25)  iteration




(23, 6.812920690579613, 0.25)  iteration
(23, 6.812920690579613, 0.25)  iteration
(24, 6.812920690579613, 0.3)  iteration




(24, 6.812920690579613, 0.3)  iteration
(24, 6.812920690579613, 0.3)  iteration
(24, 6.812920690579613, 0.3)  iteration




(25, 6.812920690579613, 0.35000000000000003)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration
(25, 6.812920690579613, 0.35000000000000003)  iteration




(25, 6.812920690579613, 0.35000000000000003)  iteration
(26, 6.812920690579613, 0.4)  iteration
(26, 6.812920690579613, 0.4)  iteration




(26, 6.812920690579613, 0.4)  iteration
(26, 6.812920690579613, 0.4)  iteration
(27, 6.812920690579613, 0.45)  iteration




(27, 6.812920690579613, 0.45)  iteration
(27, 6.812920690579613, 0.45)  iteration
(27, 6.812920690579613, 0.45)  iteration




(28, 6.812920690579613, 0.5)  iteration
(28, 6.812920690579613, 0.5)  iteration
(28, 6.812920690579613, 0.5)  iteration




(28, 6.812920690579613, 0.5)  iteration
(29, 6.812920690579613, 0.55)  iteration




(29, 6.812920690579613, 0.55)  iteration
(29, 6.812920690579613, 0.55)  iteration




(29, 6.812920690579613, 0.55)  iteration
(30, 6.812920690579613, 0.6000000000000001)  iteration




(30, 6.812920690579613, 0.6000000000000001)  iteration
(30, 6.812920690579613, 0.6000000000000001)  iteration




(30, 6.812920690579613, 0.6000000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration




(31, 6.812920690579613, 0.6500000000000001)  iteration
(31, 6.812920690579613, 0.6500000000000001)  iteration




(31, 6.812920690579613, 0.6500000000000001)  iteration
(32, 6.812920690579613, 0.7000000000000001)  iteration




(32, 6.812920690579613, 0.7000000000000001)  iteration
(32, 6.812920690579613, 0.7000000000000001)  iteration




(32, 6.812920690579613, 0.7000000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration




(33, 6.812920690579613, 0.7500000000000001)  iteration
(33, 6.812920690579613, 0.7500000000000001)  iteration
(34, 6.812920690579613, 0.8)  iteration




(34, 6.812920690579613, 0.8)  iteration
(34, 6.812920690579613, 0.8)  iteration
(34, 6.812920690579613, 0.8)  iteration




(35, 6.812920690579613, 0.8500000000000001)  iteration
(35, 6.812920690579613, 0.8500000000000001)  iteration
(35, 6.812920690579613, 0.8500000000000001)  iteration




(35, 6.812920690579613, 0.8500000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration




(36, 6.812920690579613, 0.9000000000000001)  iteration
(36, 6.812920690579613, 0.9000000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(37, 6.812920690579613, 0.9500000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration
(37, 6.812920690579613, 0.9500000000000001)  iteration




(38, 46.4158883361278, 0.05)  iteration
(38, 46.4158883361278, 0.05)  iteration
(38, 46.4158883361278, 0.05)  iteration




(38, 46.4158883361278, 0.05)  iteration
(39, 46.4158883361278, 0.1)  iteration
(39, 46.4158883361278, 0.1)  iteration




(39, 46.4158883361278, 0.1)  iteration
(39, 46.4158883361278, 0.1)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration




(40, 46.4158883361278, 0.15000000000000002)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration
(40, 46.4158883361278, 0.15000000000000002)  iteration




(41, 46.4158883361278, 0.2)  iteration
(41, 46.4158883361278, 0.2)  iteration
(41, 46.4158883361278, 0.2)  iteration




(41, 46.4158883361278, 0.2)  iteration
(42, 46.4158883361278, 0.25)  iteration
(42, 46.4158883361278, 0.25)  iteration




(42, 46.4158883361278, 0.25)  iteration
(42, 46.4158883361278, 0.25)  iteration
(43, 46.4158883361278, 0.3)  iteration




(43, 46.4158883361278, 0.3)  iteration
(43, 46.4158883361278, 0.3)  iteration
(43, 46.4158883361278, 0.3)  iteration




(44, 46.4158883361278, 0.35000000000000003)  iteration
(44, 46.4158883361278, 0.35000000000000003)  iteration
(44, 46.4158883361278, 0.35000000000000003)  iteration




(44, 46.4158883361278, 0.35000000000000003)  iteration
(45, 46.4158883361278, 0.4)  iteration
(45, 46.4158883361278, 0.4)  iteration




(45, 46.4158883361278, 0.4)  iteration
(45, 46.4158883361278, 0.4)  iteration
(46, 46.4158883361278, 0.45)  iteration




(46, 46.4158883361278, 0.45)  iteration
(46, 46.4158883361278, 0.45)  iteration
(46, 46.4158883361278, 0.45)  iteration




(47, 46.4158883361278, 0.5)  iteration
(47, 46.4158883361278, 0.5)  iteration
(47, 46.4158883361278, 0.5)  iteration




(47, 46.4158883361278, 0.5)  iteration
(48, 46.4158883361278, 0.55)  iteration
(48, 46.4158883361278, 0.55)  iteration




(48, 46.4158883361278, 0.55)  iteration
(48, 46.4158883361278, 0.55)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration




(49, 46.4158883361278, 0.6000000000000001)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration
(49, 46.4158883361278, 0.6000000000000001)  iteration




(50, 46.4158883361278, 0.6500000000000001)  iteration
(50, 46.4158883361278, 0.6500000000000001)  iteration
(50, 46.4158883361278, 0.6500000000000001)  iteration




(50, 46.4158883361278, 0.6500000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration




(51, 46.4158883361278, 0.7000000000000001)  iteration
(51, 46.4158883361278, 0.7000000000000001)  iteration




(52, 46.4158883361278, 0.7500000000000001)  iteration
(52, 46.4158883361278, 0.7500000000000001)  iteration
(52, 46.4158883361278, 0.7500000000000001)  iteration




(52, 46.4158883361278, 0.7500000000000001)  iteration
(53, 46.4158883361278, 0.8)  iteration
(53, 46.4158883361278, 0.8)  iteration




(53, 46.4158883361278, 0.8)  iteration
(53, 46.4158883361278, 0.8)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration




(54, 46.4158883361278, 0.8500000000000001)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration
(54, 46.4158883361278, 0.8500000000000001)  iteration




(55, 46.4158883361278, 0.9000000000000001)  iteration
(55, 46.4158883361278, 0.9000000000000001)  iteration




(55, 46.4158883361278, 0.9000000000000001)  iteration
(55, 46.4158883361278, 0.9000000000000001)  iteration


  'precision', 'predicted', average, warn_for)


(56, 46.4158883361278, 0.9500000000000001)  iteration
(56, 46.4158883361278, 0.9500000000000001)  iteration




(56, 46.4158883361278, 0.9500000000000001)  iteration
(56, 46.4158883361278, 0.9500000000000001)  iteration




(57, 316.22776601683796, 0.05)  iteration
(57, 316.22776601683796, 0.05)  iteration




(57, 316.22776601683796, 0.05)  iteration
(57, 316.22776601683796, 0.05)  iteration




(58, 316.22776601683796, 0.1)  iteration
(58, 316.22776601683796, 0.1)  iteration




(58, 316.22776601683796, 0.1)  iteration
(58, 316.22776601683796, 0.1)  iteration




(59, 316.22776601683796, 0.15000000000000002)  iteration
(59, 316.22776601683796, 0.15000000000000002)  iteration




(59, 316.22776601683796, 0.15000000000000002)  iteration
(59, 316.22776601683796, 0.15000000000000002)  iteration




(60, 316.22776601683796, 0.2)  iteration
(60, 316.22776601683796, 0.2)  iteration




(60, 316.22776601683796, 0.2)  iteration
(60, 316.22776601683796, 0.2)  iteration




(61, 316.22776601683796, 0.25)  iteration
(61, 316.22776601683796, 0.25)  iteration




(61, 316.22776601683796, 0.25)  iteration
(61, 316.22776601683796, 0.25)  iteration




(62, 316.22776601683796, 0.3)  iteration
(62, 316.22776601683796, 0.3)  iteration




(62, 316.22776601683796, 0.3)  iteration
(62, 316.22776601683796, 0.3)  iteration




(63, 316.22776601683796, 0.35000000000000003)  iteration
(63, 316.22776601683796, 0.35000000000000003)  iteration




(63, 316.22776601683796, 0.35000000000000003)  iteration
(63, 316.22776601683796, 0.35000000000000003)  iteration




(64, 316.22776601683796, 0.4)  iteration
(64, 316.22776601683796, 0.4)  iteration




(64, 316.22776601683796, 0.4)  iteration
(64, 316.22776601683796, 0.4)  iteration




(65, 316.22776601683796, 0.45)  iteration
(65, 316.22776601683796, 0.45)  iteration




(65, 316.22776601683796, 0.45)  iteration




(65, 316.22776601683796, 0.45)  iteration




(66, 316.22776601683796, 0.5)  iteration
(66, 316.22776601683796, 0.5)  iteration




(66, 316.22776601683796, 0.5)  iteration
(66, 316.22776601683796, 0.5)  iteration




(67, 316.22776601683796, 0.55)  iteration
(67, 316.22776601683796, 0.55)  iteration




(67, 316.22776601683796, 0.55)  iteration
(67, 316.22776601683796, 0.55)  iteration




(68, 316.22776601683796, 0.6000000000000001)  iteration
(68, 316.22776601683796, 0.6000000000000001)  iteration




(68, 316.22776601683796, 0.6000000000000001)  iteration
(68, 316.22776601683796, 0.6000000000000001)  iteration




(69, 316.22776601683796, 0.6500000000000001)  iteration
(69, 316.22776601683796, 0.6500000000000001)  iteration
(69, 316.22776601683796, 0.6500000000000001)  iteration




(69, 316.22776601683796, 0.6500000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration




(70, 316.22776601683796, 0.7000000000000001)  iteration
(70, 316.22776601683796, 0.7000000000000001)  iteration




(70, 316.22776601683796, 0.7000000000000001)  iteration
(71, 316.22776601683796, 0.7500000000000001)  iteration




(71, 316.22776601683796, 0.7500000000000001)  iteration
(71, 316.22776601683796, 0.7500000000000001)  iteration
(71, 316.22776601683796, 0.7500000000000001)  iteration




(72, 316.22776601683796, 0.8)  iteration
(72, 316.22776601683796, 0.8)  iteration




(72, 316.22776601683796, 0.8)  iteration
(72, 316.22776601683796, 0.8)  iteration




(73, 316.22776601683796, 0.8500000000000001)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration




(73, 316.22776601683796, 0.8500000000000001)  iteration
(73, 316.22776601683796, 0.8500000000000001)  iteration
(74, 316.22776601683796, 0.9000000000000001)  iteration




(74, 316.22776601683796, 0.9000000000000001)  iteration
(74, 316.22776601683796, 0.9000000000000001)  iteration
(74, 316.22776601683796, 0.9000000000000001)  iteration


  'precision', 'predicted', average, warn_for)


(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
(75, 316.22776601683796, 0.9500000000000001)  iteration
316.22776601683796 0.45 0.6218199482324935




In [184]:
Test_Preds = []
f_scores = []
kf = KFold(n_splits=5, shuffle=True)

for train_index, val_index in kf.split(groups_indices):
    Train_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[train_index]))
    Val_Indicies = reduce(lambda x,y: x+y, map(lambda x: list(range(x[0],x[1])), groups_indices[val_index]))

    Train_X, Train_y = X_train[Train_Indicies], y_train[Train_Indicies]
    Val_X, Val_y = X_train[Val_Indicies], y_train[Val_Indicies]

    clf = LogisticRegression(C=2.5*10**2)
    clf.fit(Train_X, Train_y)
    preds = predict(clf, 0.45, Val_X)
    Test_Preds.append(predict(clf, 0.45, X_test))
    f_scores.append(f1_score(Val_y, preds))
Test_Preds = np.array(Test_Preds)



In [185]:
print(np.round(f_scores,3), np.mean(f_scores))

[0.571 0.576 0.789 0.618 0.547] 0.6201545090828605


In [186]:
weights = f_scores/sum(f_scores)
Predictions = []
for i in Test_Preds.T:
    Predictions.append(int(np.dot(i, weights).round()))

In [187]:
SUBMIT = pd.DataFrame(columns=['pair_id', 'target'])
SUBMIT['pair_id'] = pairs_id
SUBMIT['target'] = Predictions

SUBMIT.to_csv('submit_FastText_LogReg.csv', index=0)