In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/users-churn/users_churn.csv
/kaggle/input/article-data/users_articles.csv
/kaggle/input/article-data/articles_idf.csv
/kaggle/input/stopwords/stopwords.txt
/kaggle/input/articles/materials.csv


In [2]:
!pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0
[0m

In [3]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m458.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844
[0m

In [4]:
!pip install pyLDAvis

[0m

In [5]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
from gensim.models import LdaModel
from gensim.test.utils import datapath
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, precision_recall_curve
import itertools

import warnings
warnings.filterwarnings('ignore')

In [6]:
news = pd.read_csv('../input/articles/materials.csv')
users = pd.read_csv('../input/article-data/users_articles.csv')

In [7]:
stopword_ru = stopwords.words('russian')
morph = pymorphy2.MorphAnalyzer()

with open('../input/stopwords/stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
    
stopword_ru += additional_stopwords

In [8]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [9]:
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)
texts = [t for t in news['title'].values]
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)
temp_file = datapath("model.lda")
lda.save(temp_file)
lda = LdaModel.load(temp_file)
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[2]
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

In [10]:
def do_aggf(aggf):
    def get_lda_vector(text):
        unseen_doc = common_dictionary.doc2bow(text)
        lda_tuple = lda[unseen_doc]
        not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))
        num_topics = 25

        output_vector = []

        for i in range(num_topics):
            if i not in not_null_topics:
                output_vector.append(0)
            else:
                output_vector.append(not_null_topics[i])
        return np.array(output_vector)
    
    num_topics = 25
    topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
    topic_matrix.columns = ['topic_{}'.format(i) for i in range(num_topics)]
    topic_matrix['doc_id'] = news['doc_id'].values
    topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(num_topics)]]
    doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(num_topics)]].values))
    
    def get_user_embedding(user_articles_list):
        user_articles_list = eval(user_articles_list)
        user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
        user_vector = aggf(user_vector, 0)
        return user_vector

    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(num_topics)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(num_topics)]]
    
    target = pd.read_csv('../input/users-churn/users_churn.csv')
    X = pd.merge(user_embeddings, target, 'left')

    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(num_topics)]], 
                                                        X['churn'], random_state=0)
    
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    preds = logreg.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    
    return thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_score(y_test, preds)

In [11]:
metrics_names = ['F-Score', 'Precision', 'Recall', 'Roc_Auc_Score']

In [12]:
metrics_results_mean = []

for i in do_aggf(np.mean)[1:]:
    metrics_results_mean.append(i)

print(f'Best Threshold = {round(do_aggf(np.mean)[0], 5)}')

for result, name in zip(metrics_results_mean, metrics_names):
    print(f'{name} = {round(result, 5)}')

Best Threshold = 0.22327
F-Score = 0.67606
Precision = 0.59443
Recall = 0.78367
Roc_Auc_Score = 0.94273


### Задание №2

In [13]:
# Используем медиану

metrics_results_median = []

for i in do_aggf(np.median)[1:]:
    metrics_results_median.append(i)

print(f'Best Threshold = {round(do_aggf(np.median)[0], 5)}')

for result, name in zip(metrics_results_median, metrics_names):
    print(f'{name} = {round(result, 5)}')

Best Threshold = 0.23466
F-Score = 0.73381
Precision = 0.65595
Recall = 0.83265
Roc_Auc_Score = 0.96653


###  Задание №3

In [14]:
# Используем максимум

metrics_results_max = []

for i in do_aggf(np.max)[1:]:
    metrics_results_max.append(i)

print(f'Best Threshold = {round(do_aggf(np.max)[0], 5)}')

for result, name in zip(metrics_results_max, metrics_names):
    print(f'{name} = {round(result, 5)}')

Best Threshold = 0.37444
F-Score = 0.8008
Precision = 0.78968
Recall = 0.81224
Roc_Auc_Score = 0.97615


### Задание №5

In [15]:
summary = pd.DataFrame(data = {'Metrics': metrics_names,
                               'Mean': metrics_results_mean,
                               'Median': metrics_results_median,
                               'Max': metrics_results_max})
summary

Unnamed: 0,Metrics,Mean,Median,Max
0,F-Score,0.676056,0.733813,0.800805
1,Precision,0.594427,0.655949,0.789683
2,Recall,0.783673,0.832653,0.812245
3,Roc_Auc_Score,0.942732,0.966531,0.97615


### Задание №6


Используя максимум, мы получаем более высокие результаты. Метрики про расчёте среднего выглядят хуже всего. Используя медиану, мы получаем довольно сбалансированные и близкие друг к другу значения, у максимума разброс чуть больше. У максимума сильно выделяется Recall на фоне остальных. Если в задаче бизнеса стоит максимизировать Recall, то неплохо использовать максимум для расчёта.