In [3]:
pip install razdel


Collecting razdel
  Using cached razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pymorphy2

Collecting pymorphy2
  Using cached pymorphy2-0.9.1-py3-none-any.whl (55 kB)
Collecting docopt>=0.6
  Using cached docopt-0.6.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Using cached pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
Collecting dawg-python>=0.7.1
  Using cached DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, docopt, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 docopt-0.6.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2

In [8]:
# Load the articles table. The preview shows a `doc_id` column and a `title`
# column; `title` appears to hold the article text itself (TODO confirm).
news = pd.read_csv('articles.csv')
news.head()

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...
3,4898,Главный тренер «Кубани» Юрий Красножан прокомм...
4,4899,Решением попечительского совета владивостокско...


In [9]:
# Load the user table: one row per `uid`, with `articles` stored as the string
# form of a list of document ids (it is parsed later in get_user_embedding).
users = pd.read_csv('users_articles.csv')
users.head()

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"
3,u101138,"[5933, 6186, 5055, 6977, 5206, 488389]"
4,u108248,"[707, 1144, 2532, 2928, 3133, 324592]"


In [10]:
# Base stop-word list: NLTK's Russian stop words.
stopword_ru = stopwords.words('russian')
# Morphological analyzer used by lemmatization() to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()

In [11]:
# Extend the NLTK list with extra stop words read from a file, one per line.
# NOTE(review): the file is opened with the platform default encoding - confirm
# that this matches how stopwords.txt was saved.
with open('stopwords.txt') as f:
    # Strip before filtering: a raw line still carries its newline and is
    # therefore always truthy, so blank lines used to slip through as ''.
    additional_stopwords = [w.strip() for w in f if w.strip()]

# Rebuild from the NLTK list instead of using `+=`, so that re-running this
# cell does not append the extra words a second time.
stopword_ru = stopwords.words('russian') + additional_stopwords
len(stopword_ru)

776

In [40]:
def clean_text(text):
    """Normalize raw text before tokenization.

    The input is coerced to `str`, lower-cased, and stripped of leading and
    trailing newline / carriage-return / tab characters. Then, in order:
    CRLF pairs are removed, digits and the listed punctuation are removed,
    remaining line breaks (real ones and the literal two-character sequences
    backslash-n / backslash-s) become spaces, and finally every soft hyphen
    and whitespace character becomes a single space.

    Whitespace runs are NOT collapsed, so removing a token such as "1/16"
    leaves two adjacent spaces behind.

    Parameters
    ----------
    text : Any
        Value to clean; non-strings are converted with `str()`.

    Returns
    -------
    str
        The cleaned text.
    """
    # Coerce non-string input (e.g. NaN coming out of pandas).
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')

    # All patterns are raw strings: the previous plain literals relied on the
    # invalid escape "\s" and on "[[]", which makes `re` emit a FutureWarning
    # ("possible nested set"). The matching behaviour itself is unchanged.
    # NOTE(review): "\|" is a literal pipe, so the first alternative only fires
    # on "- <CRLF>|- <CRLF>"; this looks like a typo for a plain alternation.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)
    # Digits, punctuation, brackets and slashes are deleted outright.
    text = re.sub(r"[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|\[|\]|/", '', text)
    # "\\s" and "\\n" match a literal backslash followed by the letter.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # "[\s+]" is a character class (one whitespace char or '+'), not "\s+".
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

# Module-level memo used by lemmatization(): token text -> normal form.
cache = {}

def lemmatization(text):
    '''
    Tokenize `text` and return its lemmas with stop words removed.

    Steps:
        [0] coerce non-`str` input to `str`
        [1] tokenize the text with razdel
        [2] drop a single leading '-' from a token
        [3] skip tokens that are one character long or shorter
        [4] reuse the lemma from the module-level `cache` when present
        [5] otherwise take the normal form of pymorphy2's first parse
            and store it in `cache`
        [6] drop lemmas that are listed in `stopword_ru`

    Returns a list of lemma strings, in token order.
    '''
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # Build the lookup set once per call: membership in the `stopword_ru` list
    # is a linear scan, which adds up over every token of every document.
    stopword_set = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        if w.startswith('-'):  # [2] (startswith is also safe on an empty string)
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue
        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form
        if lemma not in stopword_set:  # [6]
            words_lem.append(lemma)

    return words_lem

In [23]:
%%time
#очистка текста
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


Wall time: 21.2 s


In [21]:
news['title'] # before cleaning

0        Заместитель председателяnправительства РФnСерг...
1        Матч 1/16 финала Кубка России по футболу был п...
2        Форвард «Авангарда» Томаш Заборский прокоммент...
3        Главный тренер «Кубани» Юрий Красножан прокомм...
4        Решением попечительского совета владивостокско...
                               ...                        
26995    Ученые Токийского университета морских наук и ...
26996    Главой кафедры отечественной истории XX века и...
26997    Американские ученые уточнили возраст расположе...
26998    За последние 50 лет тропический углеродный цик...
26999    У живших примерно 7 тыс. лет назад на территор...
Name: title, Length: 27000, dtype: object

In [25]:
news['title'] # after cleaning

0        заместитель председателяnправительства рфnсерг...
1        матч  финала кубка россии по футболу был приос...
2        форвард авангарда томаш заборский прокомментир...
3        главный тренер кубани юрий красножан прокоммен...
4        решением попечительского совета владивостокско...
                               ...                        
26995    ученые токийского университета морских наук и ...
26996    главой кафедры отечественной истории xx века и...
26997    американские ученые уточнили возраст расположе...
26998    за последние  лет тропический углеродный цикл ...
26999    у живших примерно  тыс лет назад на территории...
Name: title, Length: 27000, dtype: object

In [41]:
%%time

cache = {}
#Запускаем лемматизацию текста. 
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 2min 41s


In [47]:
# Collect the per-article lemma lists into a plain Python list
# (the lists are taken as they are; nothing is split here).
texts = news['title'].tolist()

In [49]:
# Build the gensim dictionary (token <-> integer id) over all lemma lists.
common_dictionary = Dictionary(texts)

In [61]:
# Spot check: the token stored under id 16.
common_dictionary[16]

'генеральный'

In [55]:
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
# Bag-of-words corpus: for each text, how many times each word id occurs in it.

In [75]:
%%time
from gensim.models import LdaModel
# Обучение LDA
lda = LdaModel(common_corpus, num_topics=30, id2word=common_dictionary)

Wall time: 26.9 s


In [83]:
# Run the first three lemmatized articles through the trained model.
other_texts = news['title'].iloc[:3].tolist()
other_corpus = [common_dictionary.doc2bow(tokens) for tokens in other_texts]
unseen_doc = other_corpus[0]
print(other_texts[0])
for topic_distribution in lda[other_corpus]:
    print(topic_distribution)
# Each printed item lists the topics the model assigned to one document,
# together with their probabilities.

['заместитель', 'председатель', 'правительство', 'рф', 'сергей', 'иванов', 'избрать', 'председатель', 'совет', 'пбк', 'цска', 'сообщать', 'прессслужба', 'армеец', 'решение', 'единогласно', 'принять', 'первый', 'заседание', 'совет', 'клуб', 'основной', 'функция', 'это', 'орган', 'обсуждение', 'текущий', 'состояние', 'планирование', 'утверждение', 'стратегия', 'развитие', 'клуб', 'близкий', 'перспектива', 'nn', 'состав', 'совет', 'войти', 'георгий', 'полтавченко', 'полномочный', 'представитель', 'президент', 'центральный', 'федеральный', 'округ', 'алексей', 'седов', 'генералполковник', 'начальник', 'служба', 'защита', 'конституционный', 'строй', 'борьба', 'терроризм', 'фсб', 'рф', 'nn', 'александр', 'новак', 'заместитель', 'министр', 'финансы', 'рф', 'вячеслав', 'полтавец', 'заместитель', 'генеральный', 'директор', 'руководитель', 'блок', 'взаимодействие', 'орган', 'власть', 'социальный', 'корпоративный', 'политика', 'гмк', 'норильский', 'никель', 'nn', 'леонид', 'исакович', 'генеральный

In [78]:
shown_topics = lda.show_topics(num_topics=30, num_words=12, formatted=False)
topics_words = [
    (topic_entry[0], [word_entry[0] for word_entry in topic_entry[1]])
    for topic_entry in shown_topics
]

for topic, words in topics_words:
    print(f"topic_{topic}: {' '.join(words)}")
# The key words make it possible to interpret each topic_N and to group
# related ones under a common theme.

topic_0: год это который рынок рост свой мочь цена страна человек риск говорить
topic_1: остров миссия египет высота дания тверской рим челябинский размещать хх люксембург венгерский
topic_2: исследование пациент место космос планета рейтинг nn товар россиянин оказаться третий самый
topic_3: военный экипаж район оборудование боевой космонавт восточный км вирус флот чёрный система
topic_4: украина украинский произойти поток расследование следствие удар инцидент офицер киев нанести который
topic_5: ракета дело источник который сотрудник nn год военный информация данные задержать служба
topic_6: который человек год nn фонд это москва город погибнуть россия также свой
topic_7: ребёнок который это журнал свой дом кровь организм белый жизнь найти человек
topic_8: мужчина женщина конкурс фотография золото диск кг супруг золотой женский вес приток
topic_9: научный технология который запуск век поверхность турция северный компания опубликовать это турецкий
topic_10: земля солнце километр болезн

In [91]:

def get_lda_vector(text, num_topics=30):
    """Return the dense topic-probability vector of one tokenized document.

    Parameters
    ----------
    text : list of str
        Document tokens, in the form accepted by `common_dictionary.doc2bow`.
    num_topics : int, default 30
        Length of the output vector. It should equal the number of topics the
        module-level `lda` model was trained with; topic ids at or above this
        value are ignored.

    Returns
    -------
    numpy.ndarray
        Entry `i` is the probability the model reported for topic `i`, or 0
        for topics that are absent from the model's answer.
    """
    bow = common_dictionary.doc2bow(text)  # bag-of-words form of the document
    # The model returns only some topics as (topic id, probability) pairs;
    # index them by id so the gaps can be filled with zeros below.
    topic_probs = {pair[0]: pair[1] for pair in lda[bow]}
    return np.array([topic_probs.get(i, 0) for i in range(num_topics)])

In [92]:
# One row per article: doc_id followed by its 30 topic probabilities.
topic_columns = ['topic_{}'.format(i) for i in range(30)]
topic_matrix = pd.DataFrame(
    [get_lda_vector(text) for text in news['title'].values],
    columns=topic_columns,
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head()

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,6,0.0,0.0,0.028698,0.0,0.0,0.893006,0.0,0.0,0.0,...,0.0,0.022741,0.0,0.0,0.0,0.0,0.0,0.047829,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.342661,0.0,0.0,0.0,0.089885,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100928,0.0
3,4898,0.0,0.0,0.137365,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017318,0.0,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.875771,0.0,0.0,0.0,0.0,0.0


In [93]:
# Получили вектора наших новостей

In [94]:
# Reminder of the user table layout before building user vectors.
users.head()

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"
3,u101138,"[5933, 6186, 5055, 6977, 5206, 488389]"
4,u108248,"[707, 1144, 2532, 2928, 3133, 324592]"


In [95]:
# Lookup table: doc_id -> array of that article's 30 topic probabilities.
topic_value_columns = ['topic_{}'.format(i) for i in range(30)]
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[topic_value_columns].values))

In [96]:
# Spot check: the topic vector stored for doc_id 6.
doc_dict[6]

array([0.        , 0.        , 0.02869763, 0.        , 0.        ,
       0.89300573, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02274125, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04782933, 0.        , 0.        ])

In [118]:
# Sample input for the function below: the first user's `articles` string.
user_articles_list = users['articles'].iloc[0]

def get_user_embedding(user_articles_list):
    """Average the topic vectors of the articles a user has read.

    Parameters
    ----------
    user_articles_list : str
        String form of a Python list of article ids, e.g. "[293672, 1852]".

    Returns
    -------
    numpy.ndarray
        Element-wise mean of `doc_dict[doc_id]` over the listed ids.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a valid Python literal.
    KeyError
        If an id is missing from the module-level `doc_dict`.
    """
    # The string comes straight from a CSV file, so it is parsed as a literal
    # rather than passed to eval(), which would execute arbitrary expressions.
    import ast

    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.mean(article_vectors, 0)

In [119]:
get_user_embedding(user_articles_list) # mean topic weights over the articles this user read

array([0.06476   , 0.        , 0.01230242, 0.00425405, 0.05740724,
       0.08931768, 0.11448632, 0.05454984, 0.        , 0.02611264,
       0.00274648, 0.02357459, 0.01262899, 0.01685271, 0.02101832,
       0.        , 0.00602513, 0.03877534, 0.08788218, 0.2388506 ,
       0.        , 0.02526675, 0.        , 0.        , 0.04112218,
       0.        , 0.01599173, 0.00728137, 0.01380523, 0.0064724 ])

In [120]:
# Build one embedding row per user.
# Series.apply has no axis argument: the stray positional `1` was being bound
# to `convert_dtype`, so it is dropped and the function is passed directly.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(30)]
user_embeddings['uid'] = users['uid'].values
# Put the user id first, followed by the topic columns.
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(30)]]
user_embeddings.head()
# Result: an embedding for every user.

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,u105138,0.06476,0.0,0.012302,0.004254,0.057407,0.089318,0.114486,0.05455,0.0,...,0.0,0.025267,0.0,0.0,0.041122,0.0,0.015992,0.007281,0.013805,0.006472
1,u108690,0.103949,0.0,0.0,0.00168,0.060931,0.112527,0.063173,0.022513,0.001895,...,0.0,0.015261,0.0,0.0,0.0,0.006055,0.012053,0.0,0.0,0.021019
2,u108339,0.069364,0.0,0.004604,0.007542,0.037024,0.127166,0.130019,0.042616,0.0,...,0.0,0.004991,0.005839,0.003988,0.008487,0.0,0.019437,0.003111,0.008861,0.002114
3,u101138,0.044315,0.0,0.083096,0.007749,0.017384,0.004126,0.0,0.08033,0.167666,...,0.0,0.015539,0.0,0.0,0.0,0.0,0.012647,0.01047,0.0,0.0
4,u108248,0.061723,0.0,0.01305,0.0,0.030796,0.007236,0.189151,0.017988,0.004941,...,0.001668,0.022369,0.002326,0.0,0.007544,0.0,0.022622,0.0,0.010014,0.0


In [121]:
target = pd.read_csv("users_churn.csv")
# Labels: joined to the user embeddings below, where they supply the `churn` column.

In [122]:
# Left-join the labels onto the embeddings (joined on their shared columns).
X = user_embeddings.merge(target, how='left')
X.head()

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,churn
0,u105138,0.06476,0.0,0.012302,0.004254,0.057407,0.089318,0.114486,0.05455,0.0,...,0.025267,0.0,0.0,0.041122,0.0,0.015992,0.007281,0.013805,0.006472,0
1,u108690,0.103949,0.0,0.0,0.00168,0.060931,0.112527,0.063173,0.022513,0.001895,...,0.015261,0.0,0.0,0.0,0.006055,0.012053,0.0,0.0,0.021019,1
2,u108339,0.069364,0.0,0.004604,0.007542,0.037024,0.127166,0.130019,0.042616,0.0,...,0.004991,0.005839,0.003988,0.008487,0.0,0.019437,0.003111,0.008861,0.002114,1
3,u101138,0.044315,0.0,0.083096,0.007749,0.017384,0.004126,0.0,0.08033,0.167666,...,0.015539,0.0,0.0,0.0,0.0,0.012647,0.01047,0.0,0.0,0
4,u108248,0.061723,0.0,0.01305,0.0,0.030796,0.007236,0.189151,0.017988,0.004941,...,0.022369,0.002326,0.0,0.007544,0.0,0.022622,0.0,0.010014,0.0,1


In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

%matplotlib inline

In [127]:
feature_columns = ['topic_{}'.format(i) for i in range(30)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],  # features only, without the user id
    X['churn'],
    random_state=0,
)

In [128]:
# Logistic regression with default settings; this same estimator object is
# re-fitted later for the median and max embeddings.
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

LogisticRegression()

In [129]:
# Keep only the second column of predict_proba as the score for each test row.
preds = logreg.predict_proba(X_test)[:, 1]

In [130]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [131]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# Where precision + recall is 0 the plain formula is 0/0 = NaN, and np.argmax
# would return that NaN position instead of the real maximum; use 0 there.
pr_sum = precision + recall
fscore = np.zeros(len(pr_sum))
defined = pr_sum > 0
fscore[defined] = (2 * precision[defined] * recall[defined]) / pr_sum[defined]
# locate the index of the largest f score; the final precision/recall pair has
# no matching threshold, so it is left out of the search
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.229618, F-Score=0.644, Precision=0.567, Recall=0.747


In [132]:
# ROC AUC of the model trained on mean-aggregated user embeddings.
roc_auc_score(y_test, preds)

0.9310215710215711

### 2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)
### 3. Повторить п.2, но используя уже не медиану, а max

In [133]:
def get_user_embedding(user_articles_list):
    """Take the per-topic median over the articles a user has read.

    Parameters
    ----------
    user_articles_list : str
        String form of a Python list of article ids, e.g. "[293672, 1852]".

    Returns
    -------
    numpy.ndarray
        Element-wise median of `doc_dict[doc_id]` over the listed ids.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a valid Python literal.
    KeyError
        If an id is missing from the module-level `doc_dict`.
    """
    # The string comes straight from a CSV file, so it is parsed as a literal
    # rather than passed to eval(), which would execute arbitrary expressions.
    import ast

    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.median(article_vectors, 0)

In [134]:
# Rebuild the user embeddings with the median-based get_user_embedding.
# Series.apply has no axis argument: the stray positional `1` was being bound
# to `convert_dtype`, so it is dropped and the function is passed directly.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(30)]
user_embeddings['uid'] = users['uid'].values
# Put the user id first, followed by the topic columns.
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(30)]]
user_embeddings.head()

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,u105138,0.04561,0.0,0.0,0.0,0.0,0.018463,0.082252,0.01805,0.0,...,0.0,0.008717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u108690,0.092043,0.0,0.0,0.0,0.046976,0.062139,0.053164,0.024331,0.0,...,0.0,0.005149,0.0,0.0,0.0,0.0,0.007061,0.0,0.0,0.0
2,u108339,0.077484,0.0,0.0,0.0,0.03031,0.136211,0.10216,0.030994,0.0,...,0.0,0.0,0.0,0.0,0.006672,0.0,0.014947,0.0,0.0,0.0
3,u101138,0.0,0.0,0.038978,0.0,0.0,0.0,0.0,0.035702,0.087818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,u108248,0.022715,0.0,0.0,0.0,0.030491,0.0,0.182764,0.006497,0.0,...,0.0,0.007025,0.0,0.0,0.0,0.0,0.013601,0.0,0.0,0.0


In [135]:
# Left-join the labels onto the median embeddings (joined on shared columns).
X = user_embeddings.merge(target, how='left')
X.head()

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,churn
0,u105138,0.04561,0.0,0.0,0.0,0.0,0.018463,0.082252,0.01805,0.0,...,0.008717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,u108690,0.092043,0.0,0.0,0.0,0.046976,0.062139,0.053164,0.024331,0.0,...,0.005149,0.0,0.0,0.0,0.0,0.007061,0.0,0.0,0.0,1
2,u108339,0.077484,0.0,0.0,0.0,0.03031,0.136211,0.10216,0.030994,0.0,...,0.0,0.0,0.0,0.006672,0.0,0.014947,0.0,0.0,0.0,1
3,u101138,0.0,0.0,0.038978,0.0,0.0,0.0,0.0,0.035702,0.087818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,u108248,0.022715,0.0,0.0,0.0,0.030491,0.0,0.182764,0.006497,0.0,...,0.007025,0.0,0.0,0.0,0.0,0.013601,0.0,0.0,0.0,1


In [136]:
feature_columns = ['topic_{}'.format(i) for i in range(30)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],  # features only, without the user id
    X['churn'],
    random_state=0,
)

In [137]:
# Re-fit the same estimator on the median embeddings and score the test rows.
logreg.fit(X_train, y_train)
preds = logreg.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# Where precision + recall is 0 the plain formula is 0/0 = NaN, and np.argmax
# would return that NaN position instead of the real maximum; use 0 there.
pr_sum = precision + recall
fscore = np.zeros(len(pr_sum))
defined = pr_sum > 0
fscore[defined] = (2 * precision[defined] * recall[defined]) / pr_sum[defined]
# locate the index of the largest f score; the final precision/recall pair has
# no matching threshold, so it is left out of the search
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.259518, F-Score=0.677, Precision=0.633, Recall=0.727


In [138]:
roc_auc_score(y_test, preds) # the median embedding improved on the mean

0.9427757427757427

### теперь для эмбеддингов используем max

In [139]:
def get_user_embedding(user_articles_list):
    """Take the per-topic maximum over the articles a user has read.

    Parameters
    ----------
    user_articles_list : str
        String form of a Python list of article ids, e.g. "[293672, 1852]".

    Returns
    -------
    numpy.ndarray
        Element-wise maximum of `doc_dict[doc_id]` over the listed ids.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a valid Python literal.
    KeyError
        If an id is missing from the module-level `doc_dict`.
    """
    # The string comes straight from a CSV file, so it is parsed as a literal
    # rather than passed to eval(), which would execute arbitrary expressions.
    import ast

    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.max(article_vectors, 0)

In [140]:
# Rebuild the user embeddings with the max-based get_user_embedding.
# Series.apply has no axis argument: the stray positional `1` was being bound
# to `convert_dtype`, so it is dropped and the function is passed directly.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(30)]
user_embeddings['uid'] = users['uid'].values
# Put the user id first, followed by the topic columns.
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(30)]]
user_embeddings.head()

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,u105138,0.191066,0.0,0.057488,0.025524,0.289343,0.361285,0.321546,0.25449,0.0,...,0.0,0.078396,0.0,0.0,0.139349,0.0,0.063335,0.043688,0.082831,0.038834
1,u108690,0.216871,0.0,0.0,0.01008,0.154182,0.384065,0.151272,0.056547,0.011372,...,0.0,0.060075,0.0,0.0,0.0,0.021498,0.040672,0.0,0.0,0.096837
2,u108339,0.168491,0.0,0.027626,0.04525,0.072416,0.287864,0.259024,0.138468,0.0,...,0.0,0.015007,0.035036,0.023926,0.022998,0.0,0.045004,0.018665,0.036437,0.012681
3,u101138,0.178594,0.0,0.316122,0.02381,0.104302,0.024756,0.0,0.310573,0.472959,...,0.0,0.075094,0.0,0.0,0.0,0.0,0.075879,0.062822,0.0,0.0
4,u108248,0.195394,0.0,0.078299,0.0,0.071668,0.043414,0.314149,0.062445,0.018674,...,0.010008,0.100312,0.013957,0.0,0.045265,0.0,0.058152,0.0,0.060084,0.0


In [141]:
# Left-join the labels onto the max embeddings (joined on shared columns).
X = user_embeddings.merge(target, how='left')
X.head()

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,churn
0,u105138,0.191066,0.0,0.057488,0.025524,0.289343,0.361285,0.321546,0.25449,0.0,...,0.078396,0.0,0.0,0.139349,0.0,0.063335,0.043688,0.082831,0.038834,0
1,u108690,0.216871,0.0,0.0,0.01008,0.154182,0.384065,0.151272,0.056547,0.011372,...,0.060075,0.0,0.0,0.0,0.021498,0.040672,0.0,0.0,0.096837,1
2,u108339,0.168491,0.0,0.027626,0.04525,0.072416,0.287864,0.259024,0.138468,0.0,...,0.015007,0.035036,0.023926,0.022998,0.0,0.045004,0.018665,0.036437,0.012681,1
3,u101138,0.178594,0.0,0.316122,0.02381,0.104302,0.024756,0.0,0.310573,0.472959,...,0.075094,0.0,0.0,0.0,0.0,0.075879,0.062822,0.0,0.0,0
4,u108248,0.195394,0.0,0.078299,0.0,0.071668,0.043414,0.314149,0.062445,0.018674,...,0.100312,0.013957,0.0,0.045265,0.0,0.058152,0.0,0.060084,0.0,1


In [142]:
feature_columns = ['topic_{}'.format(i) for i in range(30)]
X_train, X_test, y_train, y_test = train_test_split(
    X[feature_columns],  # features only, without the user id
    X['churn'],
    random_state=0,
)

In [143]:
# Re-fit the same estimator on the max embeddings and score the test rows.
logreg.fit(X_train, y_train)
preds = logreg.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# Where precision + recall is 0 the plain formula is 0/0 = NaN, and np.argmax
# would return that NaN position instead of the real maximum; use 0 there.
pr_sum = precision + recall
fscore = np.zeros(len(pr_sum))
defined = pr_sum > 0
fscore[defined] = (2 * precision[defined] * recall[defined]) / pr_sum[defined]
# locate the index of the largest f score; the final precision/recall pair has
# no matching threshold, so it is left out of the search
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.354716, F-Score=0.799, Precision=0.802, Recall=0.796


In [144]:
roc_auc_score(y_test, preds) # max aggregation scored better still

0.970821559392988