In [79]:
import re
import pymorphy2
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from razdel import tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [80]:
news = pd.read_csv("articles.csv")
users = pd.read_csv("users_articles.csv")

In [81]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [82]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords

In [83]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [84]:
n_topics = 25

In [85]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 14.5 s, sys: 28.9 ms, total: 14.6 s
Wall time: 14.6 s


In [86]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 1min 51s, sys: 0 ns, total: 1min 51s
Wall time: 1min 51s


In [87]:
texts = [t for t in news['title'].values]
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
lda = LdaModel(common_corpus, num_topics=n_topics, id2word=common_dictionary)#, passes=10)

In [88]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [89]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(n_topics)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(n_topics)]]

In [90]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(n_topics)]].values))

In [91]:
def get_user_embedding_1(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.mean(user_vector, 0)
    return user_vector

In [92]:
user_embeddings_1 = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_1(x), 1)])
user_embeddings_1.columns = ['topic_{}'.format(i) for i in range(n_topics)]
user_embeddings_1['uid'] = users['uid'].values
user_embeddings_1 = user_embeddings_1[['uid']+['topic_{}'.format(i) for i in range(n_topics)]]

In [93]:
target = pd.read_csv("users_churn.csv")

In [94]:
X_1 = pd.merge(user_embeddings_1, target, 'left')

In [95]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1[['topic_{}'.format(i) for i in range(n_topics)]], 
                                                    X_1['churn'], random_state=0)

In [96]:
logreg = LogisticRegression()
logreg.fit(X_1_train, y_1_train)

In [97]:
preds_1 = logreg.predict_proba(X_1_test)[:, 1]

In [98]:
precision_1, recall_1, thresholds_1 = precision_recall_curve(y_1_test, preds_1)
fscore_1 = (2 * precision_1 * recall_1) / (precision_1 + recall_1)
# locate the index of the largest f score
ix_1 = np.argmax(fscore_1)
roc_auc_1 = roc_auc_score(y_1_test, preds_1)
threshold_1, fscore_1_, precision_1_, recall_1_ = thresholds_1[ix_1], fscore_1[ix_1], precision_1[ix_1], recall_1[ix_1]

1. Самостоятельно разобраться с тем, что такое tfidf (документация https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html и еще - https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)

2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

In [99]:
def get_user_embedding_2(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.median(user_vector, 0)
    return user_vector

In [100]:
user_embeddings_2 = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_2(x), 1)])
user_embeddings_2.columns = ['topic_{}'.format(i) for i in range(n_topics)]
user_embeddings_2['uid'] = users['uid'].values
user_embeddings_2 = user_embeddings_2[['uid']+['topic_{}'.format(i) for i in range(n_topics)]]

In [101]:
target = pd.read_csv("users_churn.csv")

In [102]:
X_2 = pd.merge(user_embeddings_2, target, 'left')

In [103]:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2[['topic_{}'.format(i) for i in range(n_topics)]], 
                                                    X_2['churn'], random_state=0)

In [104]:
logreg = LogisticRegression()
logreg.fit(X_2_train, y_2_train)

In [105]:
preds_2 = logreg.predict_proba(X_2_test)[:, 1]

In [106]:
precision_2, recall_2, thresholds_2 = precision_recall_curve(y_2_test, preds_2)
fscore_2 = (2 * precision_2 * recall_2) / (precision_2 + recall_2)
# locate the index of the largest f score
ix_2 = np.argmax(fscore_2)
roc_auc_2 = roc_auc_score(y_2_test, preds_2)
threshold_2, fscore_2_, precision_2_, recall_2_ = thresholds_2[ix_2], fscore_2[ix_2], precision_2[ix_2], recall_2[ix_2]

3. Повторить п.2, но используя уже не медиану, а max

In [107]:
def get_user_embedding_3(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.max(user_vector, 0)
    return user_vector

In [108]:
user_embeddings_3 = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_3(x), 1)])
user_embeddings_3.columns = ['topic_{}'.format(i) for i in range(n_topics)]
user_embeddings_3['uid'] = users['uid'].values
user_embeddings_3 = user_embeddings_3[['uid']+['topic_{}'.format(i) for i in range(n_topics)]]

In [109]:
target = pd.read_csv("users_churn.csv")

In [110]:
X_3 = pd.merge(user_embeddings_3, target, 'left')

In [111]:
X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(X_3[['topic_{}'.format(i) for i in range(n_topics)]], 
                                                    X_3['churn'], random_state=0)

In [112]:
logreg = LogisticRegression()
logreg.fit(X_3_train, y_3_train)

In [113]:
preds_3 = logreg.predict_proba(X_3_test)[:, 1]

In [114]:
precision_3, recall_3, thresholds_3 = precision_recall_curve(y_3_test, preds_3)
fscore_3 = (2 * precision_3 * recall_3) / (precision_3 + recall_3)
# locate the index of the largest f score
ix_3 = np.argmax(fscore_3)
roc_auc_3 = roc_auc_score(y_3_test, preds_3)
threshold_3, fscore_3_, precision_3_, recall_3_ = thresholds_3[ix_3], fscore_2[ix_3], precision_3[ix_3], recall_3[ix_3]

4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.

In [115]:
users['articles_str'] = users['articles'].apply(lambda x: x.replace('[','').replace(']', '').replace(',', ''))

In [116]:
tfidf = TfidfVectorizer()
tfidf.fit(users['articles_str'])

In [117]:
idf = pd.DataFrame({'article_id': tfidf.get_feature_names_out(), 'idf': tfidf.idf_})

In [118]:
def get_user_embedding_idf(user_articles_list, doc_dict, n_topics, idf):
    user_articles_list = eval(user_articles_list)
    
    user_vector = np.zeros((len(user_articles_list), n_topics))
    for i, doc_id in enumerate(user_articles_list):
        try:
            weight = idf[idf['article_id'] == str(doc_id)]['idf'].values[0]
        except Exception as e:
            weight = 0
        user_vector[i] = doc_dict[doc_id] * weight

    user_vector = np.median(user_vector, axis=0)
    return user_vector

In [119]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_idf(x, doc_dict, n_topics, idf))])
user_embeddings.columns = [f'topic_{i}' for i in range(n_topics)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(n_topics)]]

In [120]:
X = pd.merge(user_embeddings, target, 'left')

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(n_topics)]], 
                                                    X['churn'], random_state=0)

In [122]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [123]:
preds = logreg.predict_proba(X_test)[:, 1]

In [124]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
roc_auc = roc_auc_score(y_test, preds)
threshold, fscore_, precision_, recall_ = thresholds[ix], fscore[ix], precision[ix], recall[ix]

5. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [125]:
dict_eval = {
    'method': ['mean', 'median', 'max', 'idf_mean'],
    'roc_auc': [roc_auc_1, roc_auc_2, roc_auc_3, roc_auc],
    'precision': [precision_1_, precision_2_, precision_3_, precision_],
    'recall': [recall_1_, recall_2_, recall_3_, recall_],
    'f_score': [fscore_1_, fscore_2_, fscore_3_, fscore_]
    }
final_eval = pd.DataFrame(dict_eval)
final_eval

Unnamed: 0,method,roc_auc,precision,recall,f_score
0,mean,0.931094,0.529248,0.77551,0.629139
1,median,0.97608,0.775281,0.844898,0.808594
2,max,0.982762,0.782456,0.910204,0.803774
3,idf_mean,0.991088,0.882114,0.885714,0.88391


6. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

mean и max чувствительны к выбросам, что затем сказывается на точности предсказаний. median помогает увидеть общую картину, а idf благодаря учету весов позволяет хорошо отследить статистику. Поэтому median и idf_mean позволяют добиться наибольшей точности предсказаний.