In [56]:
# !pip install gensim
# !pip install razdel
# !pip install pymorphy2

In [57]:
import pandas as pd

In [58]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [59]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [60]:
from gensim.corpora.dictionary import Dictionary

In [61]:
#предобработка текстов
import re
import numpy as np

from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel # разделение на токены
#!pip install razdel

import pymorphy2  # pip install pymorphy2 # приводим к нормальной форме с помощью этой библиотеки

In [62]:
import nltk
nltk.download('stopwords')

stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gribanov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


151


In [63]:
# Extend the NLTK Russian stop-word list with the words read line by line from stopwords.txt.
# NOTE(review): no encoding is passed, so the platform default is used — confirm it matches the file.
with open('stopwords.txt') as f:
    # Filter on the stripped line: a raw line still carries its '\n', so testing the
    # unstripped value would never drop blank lines and '' would end up in the list.
    additional_stopwords = [w.strip() for w in f if w.strip()]
# Rebuild from the NLTK base instead of extending in place, so re-running this cell
# does not append the extra words a second time.
stopword_ru = stopwords.words('russian') + additional_stopwords
len(stopword_ru)

776

In [64]:
# Patterns are compiled once: clean_text is applied to every article.
_HYPHEN_WRAP_RE = re.compile(r"-\s\r\n")
_DIGITS_PUNCT_RE = re.compile(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]")
_ESCAPED_BREAK_RE = re.compile(r"\\s|\\n")
_SPACE_RUN_RE = re.compile(r"[\xad\s]+")


def clean_text(text):
    '''
    Normalise raw text before tokenisation.

    Steps:
      1. coerce the input to ``str`` and lower-case it;
      2. re-join a word that was split by "hyphen + whitespace + CRLF";
      3. delete digits and the listed punctuation characters;
      4. replace the literal two-character sequences backslash+n / backslash+s
         with a space;
      5. collapse every run of whitespace and soft hyphens (U+00AD) into a
         single space and trim the ends.

    Args:
        text: value to clean; anything that is not a ``str`` is converted with ``str()``.

    Returns:
        The cleaned, lower-cased string with single spaces between words.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower().strip('\n\r\t')
    text = _HYPHEN_WRAP_RE.sub('', text)
    text = _DIGITS_PUNCT_RE.sub('', text)
    text = _ESCAPED_BREAK_RE.sub(' ', text)
    # Real line breaks (CR, LF, tab) are whitespace and are handled here, so words on
    # neighbouring lines stay separated instead of being fused together.
    return _SPACE_RUN_RE.sub(' ', text).strip()

# Memoises token -> normal form so that each distinct token is parsed only once.
cache = {}

def lemmatization(text):
    '''
    Lemmatize a text: reduce every word to its normal form.

        [0] coerce a non-`str` input to `str`
        [1] tokenize the text with razdel
        [2] drop one leading '-' from a token
        [3] skip tokens of a single character (or empty after step [2])
        [4] reuse the normal form from `cache` when the token was seen before
        [5] otherwise lemmatize the token with pymorphy2 and remember the result
        [6] drop lemmas that are stop-words

    Args:
        text: text to lemmatize; non-`str` values are converted with `str()`.

    Returns:
        List of lemmatized tokens with stop-words removed, in original order.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    # A set gives constant-time membership tests; it is rebuilt on each call so that
    # any later change to the module-level `stopword_ru` list is still honoured.
    stopword_set = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        if w.startswith('-'):  # [2]
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue
        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form
        words_lem.append(lemma)

    # [6]
    return [lemma for lemma in words_lem if lemma not in stopword_set]

In [70]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

Wall time: 19.5 s


In [11]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 2min 32s


In [12]:
# Collect the per-article token lists stored in news['title'] (no further splitting is done here).
texts = news['title'].tolist()

# Build the gensim vocabulary, then encode every article as a bag-of-words.
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [13]:
type(news['title'])

pandas.core.series.Series

In [14]:
from gensim.models import LdaModel

In [15]:
%%time
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10) # уже обученная модель и можем пользоваться

Wall time: 27.1 s


In [16]:
from gensim.test.utils import datapath
# Save model to disk.
# NOTE(review): datapath() comes from gensim's test utilities — presumably it resolves to a
# location inside the installed package rather than the project folder; confirm that this
# is where the model file should live.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load the model back from the file written just above.
lda = LdaModel.load(temp_file)

In [17]:
news['title'].iloc[:3]

0    [заместитель, председатель, правительство, рф,...
1    [матч, финал, кубок, россия, футбол, приостано...
2    [форвард, авангард, томаш, заборский, прокомме...
Name: title, dtype: object

In [18]:
# Build a small demo corpus from the first three articles.
# NOTE: these rows come from news['title'], the same column the dictionary and training
# corpus were built from, so they are not unseen by the model.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

# Show the tokens of the third article and the topic distribution the model assigns to it.
unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(0, 0.6504813),
 (3, 0.14451604),
 (5, 0.0359825),
 (8, 0.11822274),
 (15, 0.03288183)]

In [19]:
# formatted=False yields (topic_id, [(word, score), ...]) entries; keep only the words.
x = lda.show_topics(num_topics=25, num_words=7, formatted=False)
topics_words = [(topic_id, [word for word, _ in terms]) for topic_id, terms in x]

# Print one line per topic containing only its words.
for topic, words in topics_words:
    print(f"topic_{topic}: " + " ".join(words))

topic_0: это год мочь всё россия который свой
topic_1: военный год человек операция тыс тело банк
topic_2: фонд ракета доклад оборудование рейтинг активность ниже
topic_3: источник вода авария банк пострадать строительство район
topic_4: исследование препарат остров лаборатория обнаружить след применение
topic_5: газ россия российский санкция территория торговый иран
topic_6: который это страна сша американский год новый
topic_7: президент nn россия путин наука владимир российский
topic_8: производитель одежда экипаж звёздный надпись экспериментальный ресторан
topic_9: украина украинский пациент взрыв погибнуть произойти товар
topic_10: космос физика захватить атмосферный вино сон пляж
topic_11: год это компания власть который заявить министр
topic_12: сенатор художественный жюри винтовка вена моисеев кудрявцев
topic_13: город день nn это москва человек который
topic_14: квартира врач рейс лечение медведев реформа перевод
topic_15: год рубль млрд статья россия журнал млн
topic_16: пове

Для тематического моделирования крайне важно исключать мусорные токены: с каждой итерацией очистки тематические векторы становятся всё качественнее.

Затем полученные кластеры (центроиды) охарактеризовали содержательно — дали им названия: «спорт», «политика» и т. д.

In [20]:
#text = news['title'].iloc[0]

def get_lda_vector(text, model=None, dictionary=None):
    """Return the dense topic vector of one tokenised document.

    Args:
        text: list of tokens of the document.
        model: topic model to query. Indexing it with a bag-of-words must yield
            ``(topic_id, probability)`` pairs and it must expose ``num_topics``.
            Defaults to the notebook-level ``lda``.
        dictionary: object providing ``doc2bow``. Defaults to the notebook-level
            ``common_dictionary``.

    Returns:
        1-D float ``np.ndarray`` of length ``model.num_topics``. Topics that the
        model does not report for this document are left at 0.
    """
    if model is None:
        model = lda
    if dictionary is None:
        dictionary = common_dictionary

    # Size the vector from the model instead of a hard-coded topic count, and start
    # from float zeros so the dtype is the same even when no topic is reported.
    output_vector = np.zeros(model.num_topics)
    for topic_id, probability in model[dictionary.doc2bow(text)]:
        output_vector[topic_id] = probability
    return output_vector

In [21]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])

In [22]:
topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.000000,0.000000,0.000000,0.000000,0.010316,0.000000,0.014613,0.890100,0.000000,0.000000,...,0.000000,0.016328,0.0,0.000000,0.033901,0.000000,0.000000,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.252944,0.546651,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.650468,0.000000,0.000000,0.144548,0.000000,0.035975,0.000000,0.000000,0.118223,0.000000,...,0.032871,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.522469,0.000000,0.000000,0.000000,0.000000,0.014169,0.000000,0.032863,0.000000,0.231978,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.094092
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458470,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.212738,0.304288,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26995,0.000000,0.000000,0.159215,0.000000,0.075286,0.000000,0.000000,0.116537,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.213223,0.0,0.202213,0.000000
26996,0.229649,0.052594,0.000000,0.051569,0.000000,0.000000,0.000000,0.483073,0.000000,0.000000,...,0.000000,0.000000,0.0,0.035571,0.000000,0.000000,0.038776,0.0,0.000000,0.000000
26997,0.000000,0.224507,0.000000,0.028148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.160667,0.000000,0.0,0.000000,0.192426,0.000000,0.328344,0.0,0.000000,0.052572
26998,0.000000,0.000000,0.000000,0.000000,0.000000,0.038068,0.000000,0.000000,0.000000,0.000000,...,0.233520,0.000000,0.0,0.000000,0.000000,0.000000,0.511671,0.0,0.000000,0.054461


In [23]:
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,0.000000,0.000000,0.000000,0.000000,0.010316,0.000000,0.014613,0.890100,0.000000,0.000000,...,0.000000,0.016328,0.0,0.000000,0.033901,0.000000,0.000000,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.252944,0.546651,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.650468,0.000000,0.000000,0.144548,0.000000,0.035975,0.000000,0.000000,0.118223,0.000000,...,0.032871,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.522469,0.000000,0.000000,0.000000,0.000000,0.014169,0.000000,0.032863,0.000000,0.231978,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.094092
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458470,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.212738,0.304288,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26995,0.000000,0.000000,0.159215,0.000000,0.075286,0.000000,0.000000,0.116537,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.213223,0.0,0.202213,0.000000
26996,0.229649,0.052594,0.000000,0.051569,0.000000,0.000000,0.000000,0.483073,0.000000,0.000000,...,0.000000,0.000000,0.0,0.035571,0.000000,0.000000,0.038776,0.0,0.000000,0.000000
26997,0.000000,0.224507,0.000000,0.028148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.160667,0.000000,0.0,0.000000,0.192426,0.000000,0.328344,0.0,0.000000,0.052572
26998,0.000000,0.000000,0.000000,0.000000,0.000000,0.038068,0.000000,0.000000,0.000000,0.000000,...,0.233520,0.000000,0.0,0.000000,0.000000,0.000000,0.511671,0.0,0.000000,0.054461


In [24]:
topic_matrix['doc_id'] = news['doc_id'].values

In [25]:
topic_matrix

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,doc_id
0,0.000000,0.000000,0.000000,0.000000,0.010316,0.000000,0.014613,0.890100,0.000000,0.000000,...,0.016328,0.0,0.000000,0.033901,0.000000,0.000000,0.0,0.000000,0.000000,6
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.252944,0.546651,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,4896
2,0.650468,0.000000,0.000000,0.144548,0.000000,0.035975,0.000000,0.000000,0.118223,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,4897
3,0.522469,0.000000,0.000000,0.000000,0.000000,0.014169,0.000000,0.032863,0.000000,0.231978,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.094092,4898
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458470,0.000000,0.000000,...,0.000000,0.0,0.000000,0.212738,0.304288,0.000000,0.0,0.000000,0.000000,4899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26995,0.000000,0.000000,0.159215,0.000000,0.075286,0.000000,0.000000,0.116537,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.213223,0.0,0.202213,0.000000,513441
26996,0.229649,0.052594,0.000000,0.051569,0.000000,0.000000,0.000000,0.483073,0.000000,0.000000,...,0.000000,0.0,0.035571,0.000000,0.000000,0.038776,0.0,0.000000,0.000000,513442
26997,0.000000,0.224507,0.000000,0.028148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.192426,0.000000,0.328344,0.0,0.000000,0.052572,513443
26998,0.000000,0.000000,0.000000,0.000000,0.000000,0.038068,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.511671,0.0,0.000000,0.054461,513444


In [26]:
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.010316,0.0,0.014613,0.8901,0.0,...,0.0,0.016328,0.0,0.0,0.033901,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.650468,0.0,0.0,0.144548,0.0,0.035975,0.0,0.0,0.118223,...,0.032871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.522469,0.0,0.0,0.0,0.0,0.014169,0.0,0.032863,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094092
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45847,0.0,...,0.0,0.0,0.0,0.0,0.212738,0.304288,0.0,0.0,0.0,0.0


In [27]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [28]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))
doc_dict[293622]

array([0.        , 0.        , 0.05207721, 0.        , 0.        ,
       0.        , 0.        , 0.13936855, 0.        , 0.        ,
       0.04426962, 0.        , 0.        , 0.12684089, 0.        ,
       0.        , 0.        , 0.        , 0.09047393, 0.0812877 ,
       0.        , 0.27454448, 0.        , 0.        , 0.18081215])

In [29]:
user_articles_list = users['articles'].iloc[33]

In [30]:
user_articles_list

'[323329, 321961, 324743, 323186, 324632, 474690]'

**2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)**

In [31]:
def _user_doc_vectors(user_articles_list, doc_vectors=None):
    """Parse a user's article ids and stack the topic vectors of those articles."""
    import ast  # stdlib; imported locally so this cell does not rely on another import cell

    if isinstance(user_articles_list, str):
        # literal_eval accepts only Python literals, whereas eval would execute
        # whatever code happens to be stored in the CSV field.
        user_articles_list = ast.literal_eval(user_articles_list)
    if doc_vectors is None:
        doc_vectors = doc_dict
    return np.array([doc_vectors[doc_id] for doc_id in user_articles_list])


def get_user_embedding_mean(user_articles_list, doc_vectors=None):
    """Return the per-topic mean of the topic vectors of a user's articles.

    Args:
        user_articles_list: article ids, either as a list or as its string
            form (e.g. ``'[1, 2, 3]'``).
        doc_vectors: mapping ``doc_id -> topic vector``; defaults to the
            notebook-level ``doc_dict``.

    Returns:
        1-D ``np.ndarray`` with one aggregated value per topic.

    Raises:
        KeyError: if an article id is missing from ``doc_vectors``.
        ValueError / SyntaxError: if the string is not a valid Python literal.
    """
    return np.mean(_user_doc_vectors(user_articles_list, doc_vectors), 0)

def get_user_embedding_median(user_articles_list, doc_vectors=None):
    """Same as ``get_user_embedding_mean`` but aggregates with the per-topic median."""
    return np.median(_user_doc_vectors(user_articles_list, doc_vectors), 0)

def get_user_embedding_max(user_articles_list, doc_vectors=None):
    """Same as ``get_user_embedding_mean`` but aggregates with the per-topic maximum."""
    return np.max(_user_doc_vectors(user_articles_list, doc_vectors), 0)

In [32]:
get_user_embedding_mean(user_articles_list)

array([0.07258587, 0.03593703, 0.02266465, 0.0185915 , 0.00210072,
       0.0049691 , 0.15149364, 0.19302236, 0.01034601, 0.01045642,
       0.        , 0.02159142, 0.        , 0.01246749, 0.00941199,
       0.10625544, 0.00257136, 0.01228193, 0.01798005, 0.08904533,
       0.        , 0.11617622, 0.        , 0.00319274, 0.07653292])

In [33]:
topic_columns = ['topic_{}'.format(i) for i in range(25)]
# One row per user holding the mean topic vector of the articles that user read.
# The function is passed directly: Series.apply has no axis argument, and a second
# positional value would be read as `convert_dtype`.
user_embeddings_mean = pd.DataFrame(users['articles'].apply(get_user_embedding_mean).tolist(),
                                    columns=topic_columns)
user_embeddings_mean.insert(0, 'uid', users['uid'].values)  # uid goes first, topics follow
user_embeddings_mean.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.027407,0.016636,0.0134,0.008022,0.0,0.009119,0.036027,0.04441,0.0,...,0.054994,0.005102,0.0,0.015079,0.173962,0.0,0.128587,0.0,0.04595,0.035663
1,u108690,0.081047,0.055877,0.002496,0.011726,0.0,0.013827,0.142345,0.062579,0.0,...,0.078389,0.01104,0.0,0.003258,0.027214,0.002687,0.171121,0.0,0.028553,0.053404
2,u108339,0.046832,0.09455,0.0,0.008061,0.0,0.011133,0.116844,0.093825,0.010453,...,0.098774,0.008288,0.0,0.006116,0.066485,0.002066,0.032234,0.0,0.009072,0.106505


In [34]:
topic_columns = ['topic_{}'.format(i) for i in range(25)]
# One row per user holding the median topic vector of the articles that user read.
# The function is passed directly: Series.apply has no axis argument, and a second
# positional value would be read as `convert_dtype`.
user_embeddings_median = pd.DataFrame(users['articles'].apply(get_user_embedding_median).tolist(),
                                      columns=topic_columns)
user_embeddings_median.insert(0, 'uid', users['uid'].values)  # uid goes first, topics follow
user_embeddings_median.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.0,0.0,0.0,0.0,0.005851,0.0,0.0,...,0.007253,0.0,0.0,0.0,0.132923,0.0,0.121907,0.0,0.0,0.0
1,u108690,0.041338,0.016448,0.0,0.006754,0.0,0.0,0.102376,0.036787,0.0,...,0.069726,0.0,0.0,0.0,0.014093,0.0,0.155793,0.0,0.023851,0.018459
2,u108339,0.051339,0.062025,0.0,0.0,0.0,0.0,0.077834,0.057067,0.0,...,0.070104,0.0,0.0,0.0,0.068134,0.0,0.010414,0.0,0.0,0.11608


In [35]:
topic_columns = ['topic_{}'.format(i) for i in range(25)]
# One row per user holding the per-topic maximum over the articles that user read.
# The function is passed directly: Series.apply has no axis argument, and a second
# positional value would be read as `convert_dtype`.
user_embeddings_max = pd.DataFrame(users['articles'].apply(get_user_embedding_max).tolist(),
                                   columns=topic_columns)
user_embeddings_max.insert(0, 'uid', users['uid'].values)  # uid goes first, topics follow
user_embeddings_max.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.124368,0.099818,0.052077,0.048135,0.0,0.054716,0.116956,0.139369,0.0,...,0.197264,0.03061,0.0,0.090474,0.466684,0.0,0.274544,0.0,0.275698,0.180812
1,u108690,0.214957,0.211881,0.014975,0.02927,0.0,0.082961,0.433617,0.144914,0.0,...,0.181798,0.047201,0.0,0.019545,0.069308,0.016122,0.366073,0.0,0.071377,0.161932
2,u108339,0.089204,0.247437,0.0,0.048369,0.0,0.066799,0.334098,0.230576,0.062716,...,0.244311,0.049729,0.0,0.024157,0.167193,0.012394,0.125156,0.0,0.042307,0.177644


In [36]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [37]:
X_mean = pd.merge(user_embeddings_mean, target, 'left')
X_mean.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.027407,0.016636,0.0134,0.008022,0.0,0.009119,0.036027,0.04441,0.0,...,0.005102,0.0,0.015079,0.173962,0.0,0.128587,0.0,0.04595,0.035663,0
1,u108690,0.081047,0.055877,0.002496,0.011726,0.0,0.013827,0.142345,0.062579,0.0,...,0.01104,0.0,0.003258,0.027214,0.002687,0.171121,0.0,0.028553,0.053404,1
2,u108339,0.046832,0.09455,0.0,0.008061,0.0,0.011133,0.116844,0.093825,0.010453,...,0.008288,0.0,0.006116,0.066485,0.002066,0.032234,0.0,0.009072,0.106505,1


In [38]:
X_median = pd.merge(user_embeddings_median, target, 'left')
X_median.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.0,0.0,0.0,0.0,0.0,0.0,0.005851,0.0,0.0,...,0.0,0.0,0.0,0.132923,0.0,0.121907,0.0,0.0,0.0,0
1,u108690,0.041338,0.016448,0.0,0.006754,0.0,0.0,0.102376,0.036787,0.0,...,0.0,0.0,0.0,0.014093,0.0,0.155793,0.0,0.023851,0.018459,1
2,u108339,0.051339,0.062025,0.0,0.0,0.0,0.0,0.077834,0.057067,0.0,...,0.0,0.0,0.0,0.068134,0.0,0.010414,0.0,0.0,0.11608,1


In [39]:
X_max = pd.merge(user_embeddings_max, target, 'left')
X_max.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.124368,0.099818,0.052077,0.048135,0.0,0.054716,0.116956,0.139369,0.0,...,0.03061,0.0,0.090474,0.466684,0.0,0.274544,0.0,0.275698,0.180812,0
1,u108690,0.214957,0.211881,0.014975,0.02927,0.0,0.082961,0.433617,0.144914,0.0,...,0.047201,0.0,0.019545,0.069308,0.016122,0.366073,0.0,0.071377,0.161932,1
2,u108339,0.089204,0.247437,0.0,0.048369,0.0,0.066799,0.334098,0.230576,0.062716,...,0.049729,0.0,0.024157,0.167193,0.012394,0.125156,0.0,0.042307,0.177644,1


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [41]:
#разделим данные на train/test
X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(X_mean[['topic_{}'.format(i) for i in range(25)]], 
                                                                        X_mean['churn'], random_state=0)

In [42]:
#разделим данные на train/test
X_train_median, X_test_median, y_train_median, y_test_median = train_test_split(X_median[['topic_{}'.format(i) for i in range(25)]], 
                                                                                X_median['churn'], random_state=0)

In [43]:
#разделим данные на train/test
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(X_max[['topic_{}'.format(i) for i in range(25)]], 
                                                                    X_max['churn'], random_state=0)

In [44]:
logreg_mean = LogisticRegression()
#обучим 
logreg_mean.fit(X_train_mean, y_train_mean)

LogisticRegression()

In [45]:
logreg_median = LogisticRegression()
#обучим 
logreg_median.fit(X_train_median, y_train_median)

LogisticRegression()

In [46]:
logreg_max = LogisticRegression()
#обучим 
logreg_max.fit(X_train_max, y_train_max)

LogisticRegression()

In [47]:
#наши прогнозы для тестовой выборки
preds_mean = logreg_mean.predict_proba(X_test_mean)[:, 1]
preds_mean[:10]

array([0.0588728 , 0.01768402, 0.46493294, 0.20791344, 0.00953624,
       0.0521639 , 0.11357267, 0.020732  , 0.20276159, 0.05236518])

In [48]:
#наши прогнозы для тестовой выборки
preds_median = logreg_median.predict_proba(X_test_median)[:, 1]
preds_median[:10]

array([0.09295487, 0.01376115, 0.76106165, 0.33204886, 0.00800041,
       0.04547271, 0.09009957, 0.01183907, 0.15473174, 0.04226974])

In [49]:
#наши прогнозы для тестовой выборки
preds_max = logreg_max.predict_proba(X_test_max)[:, 1]
preds_max[:10]

array([0.01440591, 0.00098787, 0.65636665, 0.30792343, 0.00272625,
       0.00650311, 0.23292724, 0.00544471, 0.03241175, 0.02180731])

In [50]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [51]:
import itertools

In [52]:
precision_mean, recall_mean, thresholds_mean = precision_recall_curve(y_test_mean, preds_mean)
# Where precision and recall are both 0 the F-score is 0/0; define it as 0 there so
# that np.argmax cannot select a NaN entry as the "best" threshold.
pr_sum_mean = precision_mean + recall_mean
fscore_mean = np.divide(2 * precision_mean * recall_mean, pr_sum_mean,
                        out=np.zeros_like(pr_sum_mean), where=pr_sum_mean > 0)

# Keep only the precision / recall / F-score at the F-score-optimal point of the curve.
ix = np.argmax(fscore_mean)
precision_mean, recall_mean, fscore_mean = precision_mean[ix], recall_mean[ix], fscore_mean[ix]

roc_auc_score_mean = roc_auc_score(y_test_mean, preds_mean)

In [53]:
precision_median, recall_median, thresholds_median = precision_recall_curve(y_test_median, preds_median)
# Where precision and recall are both 0 the F-score is 0/0; define it as 0 there so
# that np.argmax cannot select a NaN entry as the "best" threshold.
pr_sum_median = precision_median + recall_median
fscore_median = np.divide(2 * precision_median * recall_median, pr_sum_median,
                          out=np.zeros_like(pr_sum_median), where=pr_sum_median > 0)

# Keep only the precision / recall / F-score at the F-score-optimal point of the curve.
ix = np.argmax(fscore_median)
precision_median, recall_median, fscore_median = precision_median[ix], recall_median[ix], fscore_median[ix]

roc_auc_score_median = roc_auc_score(y_test_median, preds_median)

In [54]:
precision_max, recall_max, thresholds_max = precision_recall_curve(y_test_max, preds_max)
# Where precision and recall are both 0 the F-score is 0/0; define it as 0 there so
# that np.argmax cannot select a NaN entry as the "best" threshold.
pr_sum_max = precision_max + recall_max
fscore_max = np.divide(2 * precision_max * recall_max, pr_sum_max,
                       out=np.zeros_like(pr_sum_max), where=pr_sum_max > 0)

# Keep only the precision / recall / F-score at the F-score-optimal point of the curve.
ix = np.argmax(fscore_max)
precision_max, recall_max, fscore_max = precision_max[ix], recall_max[ix], fscore_max[ix]

roc_auc_score_max = roc_auc_score(y_test_max, preds_max)

In [55]:
# Side-by-side comparison of the three aggregation strategies.
# No dtype is forced: the 'Metrics' column holds strings, the metric values are already
# floats, and the `np.float` alias is not available in current NumPy releases.
table = pd.DataFrame(data={'Metrics':['Precision', 'Recall', 'F-score', 'Roc_auc_score'],
                          'Mean':[precision_mean, recall_mean, fscore_mean, roc_auc_score_mean],
                          'Median':[precision_median, recall_median, fscore_median, roc_auc_score_median],
                          'Max':[precision_max, recall_max, fscore_max, roc_auc_score_max]})
table.round(3)

Unnamed: 0,Metrics,Mean,Median,Max
0,Precision,0.702,0.8,0.829
1,Recall,0.722,0.833,0.771
2,F-score,0.712,0.816,0.799
3,Roc_auc_score,0.961,0.985,0.976


Эффективнее остальных оказался способ агрегирования max.

В этом случае в вектор пользователя по каждой теме закладывается максимальная близость к этой теме среди новостей, прочитанных пользователем. Возможно, это даёт модели более чёткий «слепок» пользователя.

Этот вывод был сделан до того, как я перезапустил блокнот; после перезапуска получились следующие результаты (лучшие метрики даёт медиана):

Metrics	Mean	Median	Max

0	Precision	0.626	0.759	0.731

1	Recall	0.833	0.808	0.788

2	F-score	0.715	0.783	0.758

3	Roc_auc	0.955	0.972	0.958

Metrics	Mean	Median	Max

0	Precision	0.545	0.719	0.735

1	Recall	0.710	0.784	0.780

2	F-score	0.617	0.750	0.756

3	Roc_auc	0.927	0.967	0.963

Думаю, что явно лучшего способа агрегирования (median, max или mean) нет: из-за случайной инициализации LDA (если не фиксировать random_state) темы при каждом запуске получаются разными, а вместе с ними меняются и итоговые метрики.