1. *Самостоятельно повторить tfidf (документация https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [1]:
import pandas as pd

Наши новости

In [3]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


Загрузим пользователей и списки последних прочитанных новостей

In [4]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


Итак, нам нужно получить векторные представления пользователей на основе прочитанным ими новостей и самих новостей

### 1. Получаем векторные представления новостей

In [5]:
!pip install razdel pymorphy2



In [6]:
# предобработка текстов
import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize  # сегментация русскоязычного текста на токены и предложения https://github.com/natasha/razdel
import pymorphy2  # Морфологический анализатор

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

151


In [10]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
    
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [11]:
def clean_text(text):
    # проверяем тип входных данных
    if not isinstance(text, str): 
        text = str(text)
    #приводим к одному формату и очищаем текст 
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip()) 
    
    return text

cache = {}

def lemmatization(text):
#     global cache
   
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одним символом
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''
    # [0]
    if not isinstance(text, str):
        text = str(text)
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0]=='-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cache = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cache)

    words_lem_without_stopwords = [i for i in words_lem if not i in stopword_ru] # [6]

    return words_lem_without_stopwords

In [12]:
from tqdm import tqdm
tqdm.pandas()

# очистка текста
news['title'] = news['title'].progress_apply(lambda x: clean_text(x))

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
100%|██████████████████████████████████████████████████████████████████████████| 27000/27000 [00:21<00:00, 1256.98it/s]


In [13]:
news['title'].iloc[:10]

0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
2    форвард авангарда томаш заборский прокомментир...
3    главный тренер кубани юрий красножан прокоммен...
4    решением попечительского совета владивостокско...
5    ио главного тренера вячеслав буцаев прокоммент...
6    запорожский металлург дома потерпел разгромное...
7    сборная сша одержала победу над австрией со сч...
8    бывший защитник сборной россии дарюс каспарайт...
9    полузащитник цска зоран тошич после победы над...
Name: title, dtype: object

In [14]:
# лкмматизация
news['title'] = news['title'].progress_apply(lambda x: lemmatization(x))

100%|███████████████████████████████████████████████████████████████████████████| 27000/27000 [02:59<00:00, 150.22it/s]


А теперь в 3 строчки обучим нашу модель

In [15]:
# сформируем список наших текстов
texts = list(news['title'].values)

# Создадим корпус из списка с текстами
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

Запускаем обучение

In [16]:
N_topic = 20  

In [17]:
from gensim.models import LdaModel

# Обучаем модель на корпусе
lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary, passes=2)  # можно было менять

In [18]:
from gensim.test.utils import datapath

# Сохраняем модель на диск
temp_file = datapath("model.lda")
lda.save(temp_file)

In [19]:
# Загружаем обученную модель с диска
lda = LdaModel.load(temp_file)

In [20]:
# Создаем новый корпус документов
other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(4, 0.4091069), (7, 0.21130395), (9, 0.24226236), (14, 0.119462356)]

In [21]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

# Печатаем только слова
for topic, words in topics_words:
    print(f"topic_{topic}: " + " ".join(words))

topic_0: год проект это исследование фонд который тыс
topic_1: остров океан японский япония флот бензин северный
topic_2: год это который также новый весь компания
topic_3: станция поверхность турция рейс турецкий агентство аэропорт
topic_4: рак команда игра грунт актёр стресс играть
topic_5: налог рейтинг мышь корея определение золото место
topic_6: гражданин россиянин товар египет народный фронт белоруссия
topic_7: год млн составить тыс январь сообщать первый
topic_8: суд дело год который компания это пенсия
topic_9: это человек который всё весь мочь свой
topic_10: это сша мочь ракета россия американский российский
topic_11: россия банк рубль закон который рф санкция
topic_12: обнаружить экипаж вода продукция авария пострадать катастрофа
topic_13: сша газ взрыв смерть данные который восток
topic_14: военный убийство следователь превысить польша германия подозревать
topic_15: год женщина мужчина исследование время выяснить день
topic_16: ребёнок область год погибнуть рубль проверка це

Очень неплохо - большинство тем вполне можно описать о чем они

In [22]:
def get_lda_vector(lda, text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]

    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [23]:
get_lda_vector(lda, news['title'].iloc[0])

array([0.        , 0.        , 0.        , 0.        , 0.10710611,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.09644619, 0.77820235, 0.01068109, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [24]:
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+[f'topic_{i}' for i in range(N_topic)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.0,0.0,0.0,0.0,0.107106,0.0,0.0,0.0,0.0,...,0.096442,0.778206,0.010681,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.355043,0.0,0.0,0.0,0.103274,...,0.0,0.0,0.0,0.520426,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.409152,0.0,0.0,0.211323,0.0,...,0.0,0.0,0.0,0.0,0.119465,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.281346,0.0,0.349635,0.0,0.0,0.127229,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020698,0.0,0.0
4,4899,0.0,0.0,0.371663,0.0,0.126959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.028544,0.0,0.03824,0.0,0.413664,0.0


### Следующий шаг - векторные представления пользователей

In [25]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [26]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values))

In [27]:
doc_dict[293672]

array([0.62118745, 0.        , 0.07030057, 0.        , 0.        ,
       0.        , 0.        , 0.14596426, 0.        , 0.09238151,
       0.        , 0.        , 0.        , 0.        , 0.05348561,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [28]:
def get_user_embedding(user_articles_list, doc_dict, func):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = func(user_vector, axis=0)
    return user_vector

In [29]:
user_articles_list = users['articles'].iloc[33]

get_user_embedding(user_articles_list, doc_dict, np.mean)

array([0.06095585, 0.        , 0.12308646, 0.        , 0.        ,
       0.        , 0.02860054, 0.03006075, 0.05955698, 0.07324149,
       0.158386  , 0.1957166 , 0.        , 0.04872397, 0.02790425,
       0.02732952, 0.00410461, 0.        , 0.15136748, 0.        ])

Теперь получим эмбединги для всех пользователей и проверим их качество на конкретной downstream-задаче

In [30]:
FUNC = np.mean

In [31]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.106669,0.0,0.193859,0.008767,0.01351,0.0,0.006568,0.024327,0.010985,...,0.043741,0.064588,0.004016,0.026438,0.011069,0.016099,0.181302,0.085009,0.075854,0.006232
1,u108690,0.05071,0.0,0.156814,0.007456,0.0,0.0,0.004931,0.0179,0.042019,...,0.126186,0.090279,0.0,0.010723,0.024466,0.056574,0.048255,0.060014,0.161542,0.002364
2,u108339,0.020052,0.0,0.164307,0.022378,0.0017,0.002112,0.004521,0.022708,0.050239,...,0.034123,0.116991,0.033898,0.094388,0.013939,0.078483,0.103016,0.0,0.150801,0.0


Датасет готов - можно попробовать обучить модель. Загрузим нашу разметку

In [33]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [66]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.621187,0.0,0.36813,0.0526,0.081058,0.0,0.039409,0.145964,0.037427,...,0.227016,0.024098,0.158629,0.053486,0.096595,0.396297,0.510052,0.269617,0.037393,0
1,u108690,0.176005,0.0,0.242499,0.030106,0.0,0.0,0.029584,0.066877,0.091413,...,0.340238,0.0,0.027507,0.083648,0.185968,0.101706,0.173158,0.281585,0.014183,1
2,u108339,0.040335,0.0,0.29373,0.07519,0.010201,0.012673,0.016057,0.088872,0.161498,...,0.222993,0.121905,0.27301,0.033023,0.167429,0.253,0.0,0.318427,0.0,1


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [36]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [37]:
model = LogisticRegression()
# обучим 
model.fit(X_train, y_train)

LogisticRegression()

In [38]:
# прогнозы для тестовой выборки
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.14877275, 0.08297714, 0.57635283, 0.12056284, 0.02169723,
       0.05391526, 0.11140027, 0.00699849, 0.12731584, 0.05141983])

In [39]:
metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [40]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             classification_report, precision_recall_curve, confusion_matrix)

In [41]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.2700500282246537, F-Score=0.751, Precision=0.689, Recall=0.824


In [42]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9646700389557532

In [43]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.27005,0.750929,0.68942,0.82449,0.96467


2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог)

In [44]:
FUNC = np.median

In [45]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.0,0.188682,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.127117,0.0,0.04493,0.0
1,u108690,0.009816,0.0,0.178144,0.0,0.0,0.0,0.0,0.0,0.039093,...,0.06862,0.043465,0.0,0.007724,0.013041,0.033813,0.052587,0.032022,0.142689,0.0
2,u108339,0.021281,0.0,0.163448,0.008453,0.0,0.0,0.0,0.011195,0.036822,...,0.020393,0.088619,0.014588,0.090939,0.010461,0.084786,0.071741,0.0,0.160695,0.0


In [46]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.0,0.188682,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.127117,0.0,0.04493,0.0,0
1,u108690,0.009816,0.0,0.178144,0.0,0.0,0.0,0.0,0.0,0.039093,...,0.043465,0.0,0.007724,0.013041,0.033813,0.052587,0.032022,0.142689,0.0,1
2,u108339,0.021281,0.0,0.163448,0.008453,0.0,0.0,0.0,0.011195,0.036822,...,0.088619,0.014588,0.090939,0.010461,0.084786,0.071741,0.0,0.160695,0.0,1


In [47]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [48]:
model = LogisticRegression()
# обучим 
model.fit(X_train, y_train)

LogisticRegression()

In [49]:
# прогнозы для тестовой выборки
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.16710525, 0.10770714, 0.62323988, 0.16650672, 0.01765877,
       0.04065469, 0.0462688 , 0.00382966, 0.19491827, 0.05562306])

In [50]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.27631848134275194, F-Score=0.793, Precision=0.735, Recall=0.861


In [51]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9741589627303914

In [52]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.27005,0.750929,0.68942,0.82449,0.96467
1,median,0.276318,0.793233,0.735192,0.861224,0.974159



3. Повторить п.2, но используя уже не медиану, а max


In [53]:
FUNC = np.max

In [54]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.621187,0.0,0.36813,0.0526,0.081058,0.0,0.039409,0.145964,0.037427,...,0.188018,0.227016,0.024098,0.158629,0.053486,0.096595,0.396297,0.510052,0.269617,0.037393
1,u108690,0.176005,0.0,0.242499,0.030106,0.0,0.0,0.029584,0.066877,0.091413,...,0.453461,0.340238,0.0,0.027507,0.083648,0.185968,0.101706,0.173158,0.281585,0.014183
2,u108339,0.040335,0.0,0.29373,0.07519,0.010201,0.012673,0.016057,0.088872,0.161498,...,0.12534,0.222993,0.121905,0.27301,0.033023,0.167429,0.253,0.0,0.318427,0.0


In [55]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.621187,0.0,0.36813,0.0526,0.081058,0.0,0.039409,0.145964,0.037427,...,0.227016,0.024098,0.158629,0.053486,0.096595,0.396297,0.510052,0.269617,0.037393,0
1,u108690,0.176005,0.0,0.242499,0.030106,0.0,0.0,0.029584,0.066877,0.091413,...,0.340238,0.0,0.027507,0.083648,0.185968,0.101706,0.173158,0.281585,0.014183,1
2,u108339,0.040335,0.0,0.29373,0.07519,0.010201,0.012673,0.016057,0.088872,0.161498,...,0.222993,0.121905,0.27301,0.033023,0.167429,0.253,0.0,0.318427,0.0,1


In [56]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [57]:
model = LogisticRegression()
# обучим 
model.fit(X_train, y_train)

LogisticRegression()

In [58]:
# прогнозы для тестовой выборки
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.03009726, 0.00245468, 0.70119733, 0.18711435, 0.00751183,
       0.00603329, 0.18432246, 0.02617413, 0.08720788, 0.00828718])

In [59]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.37019336526423324, F-Score=0.791, Precision=0.771, Recall=0.812


In [60]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9762381533810105

In [61]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.27005,0.750929,0.68942,0.82449,0.96467
1,median,0.276318,0.793233,0.735192,0.861224,0.974159
2,amax,0.370193,0.791252,0.771318,0.812245,0.976238


4. *Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (взяв список новостей пользователя)
	- подсказка 1: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал
	- подсказка 2: нужен именно idf, как вес.

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
users['articles_str'] = users['articles'].apply(lambda x: x.replace('[','').replace(']', '').replace(',', ''))

users['articles_str'].iloc[0]

'293672 293328 293001 293622 293126 1852'

In [64]:
tfidf = TfidfVectorizer()
tfidf.fit(users['articles_str'])

TfidfVectorizer()

In [67]:
idf = pd.DataFrame({'article_id': tfidf.get_feature_names(),
              'idf': tfidf.idf_})

idf

Unnamed: 0,article_id,idf
0,10,8.888710
1,100,7.907880
2,1000,8.041412
3,1001,8.888710
4,1002,8.888710
...,...,...
14776,995,8.377884
14777,996,8.195562
14778,997,8.601027
14779,998,9.294175


In [68]:
def get_user_embedding_idf(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    
    user_vector = np.zeros((len(user_articles_list), N_topic))
    for i, doc_id in enumerate(user_articles_list):
        try:
            weight = idf[idf['article_id'] == str(doc_id)]['idf'].values[0]
        except Exception as e:
            weight = 0
        user_vector[i] = doc_dict[doc_id] * weight

    user_vector = np.median(user_vector, axis=0)
    return user_vector

In [69]:
from tqdm import tqdm
tqdm.pandas()

user_embeddings = pd.DataFrame([i for i in users['articles'].progress_apply(lambda x: get_user_embedding_idf(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

100%|█████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:47<00:00, 166.92it/s]


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.0,1.667517,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.09647,0.0,0.38644,0.0
1,u108690,0.087252,0.0,1.425473,0.0,0.0,0.0,0.0,0.0,0.357492,...,0.571063,0.373216,0.0,0.068661,0.112169,0.273604,0.47864,0.284631,1.296572,0.0
2,u108339,0.180512,0.0,1.356248,0.078565,0.0,0.0,0.0,0.096292,0.324575,...,0.16825,0.768055,0.125476,0.806889,0.08764,0.698172,0.610928,0.0,1.283078,0.0


In [70]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.0,1.667517,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.09647,0.0,0.38644,0.0,0
1,u108690,0.087252,0.0,1.425473,0.0,0.0,0.0,0.0,0.0,0.357492,...,0.373216,0.0,0.068661,0.112169,0.273604,0.47864,0.284631,1.296572,0.0,1
2,u108339,0.180512,0.0,1.356248,0.078565,0.0,0.0,0.0,0.096292,0.324575,...,0.768055,0.125476,0.806889,0.08764,0.698172,0.610928,0.0,1.283078,0.0,1


In [71]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [72]:
model = LogisticRegression()
# обучим 
model.fit(X_train, y_train)

LogisticRegression()

In [73]:
# прогнозы для тестовой выборки
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([9.51364930e-02, 1.66847041e-03, 9.06458241e-01, 3.18899391e-01,
       1.94199586e-03, 2.62978610e-03, 1.10041542e-02, 6.00526779e-07,
       2.50371948e-02, 2.80446625e-03])

In [74]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.36699553705511967, F-Score=0.851, Precision=0.827, Recall=0.878


In [75]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9851200651200651

In [76]:
metrics_df = metrics_df.append({
    'model': 'idf_median',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)


5. Сформировать на выходе единую таблицу, сравнивающую качество 2/3 разных метода получения эмбедингов пользователей: median, max, idf_mean по метрикам roc_auc, precision, recall, f_score


In [77]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.27005,0.750929,0.68942,0.82449,0.96467
1,median,0.276318,0.793233,0.735192,0.861224,0.974159
2,amax,0.370193,0.791252,0.771318,0.812245,0.976238
3,idf_median,0.366996,0.851485,0.826923,0.877551,0.98512
