In [1]:
import ast

import pandas as pd

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the code below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the articles table; per the displayed output it has `doc_id` and `title` columns.
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [4]:
# Load the per-user reading history (`uid`, `articles`).
# `articles` arrives as text that looks like a Python list literal and is parsed later.
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


### 1. Векторные представления новостей

In [5]:
from gensim.corpora.dictionary import Dictionary

In [6]:
#предобработка текстов
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel
#!pip install razdel

import pymorphy2  # pip install pymorphy2

In [7]:
# Base Russian stop-word list from NLTK (extended further in the following cells).
stopword_ru = stopwords.words('russian')

# Morphological analyzer used to reduce tokens to their normal form.
morph = pymorphy2.MorphAnalyzer()

In [8]:
# Extend the stop-word list with project-specific entries, one per line of stopwords.txt.
# Lines are stripped *before* the emptiness check: a raw line always contains its
# trailing newline, so testing the unstripped line would let blank lines through as ''.
# NOTE(review): the file is opened with the platform default encoding — confirm it matches the file.
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f if w.strip()]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [9]:
# Extra corpus-specific noise tokens to drop in addition to the stop-word lists above.
add_words = ['nn', 'это', 'всё', 'который', 'год']
stopword_ru.extend(add_words)

len(stopword_ru)

781

In [10]:
def clean_text(text):
    """Normalize raw article text before tokenization.

    Steps: cast non-strings to ``str``, lower-case, drop CRLF line breaks,
    delete digits and punctuation, turn remaining line breaks (real or spelled
    out as a literal backslash-n / backslash-s) into spaces, and finally
    collapse runs of whitespace / soft hyphens into a single space.

    Parameters
    ----------
    text : object
        Raw text; anything that is not a ``str`` is converted with ``str()``.

    Returns
    -------
    str
        Lower-cased text without digits or punctuation, single-spaced and
        without leading or trailing whitespace.
    """
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')

    # Raw string avoids the invalid "\s" / "\|" escapes of a plain literal.
    # NOTE(review): "\|" matches a literal pipe, so the first alternative only fires on
    # "- <CRLF>|- <CRLF>"; presumably an unescaped "|" was intended — confirm before changing.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Digits and punctuation in one character class (no empty alternative, no nested set).
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)

    # Line breaks, plus the two-character sequences backslash-s / backslash-n, become spaces.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)

    # Collapse whitespace / soft-hyphen runs; the old "[\s+]" class replaced them one by one.
    text = re.sub(r"[\xad\s]+", ' ', text).strip()

    return text

cache = {}  # token -> lemma memo shared by all calls, so each distinct token is parsed once

def lemmatization(text):
    '''Tokenize ``text`` and return its lemmas with stop words removed.

        [0] cast the input to `str` if it is not one already
        [1] tokenize with razdel
        [2] drop a leading '-' from the token
        [3] skip tokens that are one character long (or empty)
        [4] reuse the lemma from the cache when the token was seen before
        [5] otherwise lemmatize the token with pymorphy2 and cache the result
        [6] filter out lemmas that are in the stop-word list

    Returns a list of lemma strings in their original order.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    # Snapshot the stop words into a set at call time: O(1) membership tests instead of
    # scanning the whole list for every lemma, while still honouring later list edits.
    stopword_set = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        if w.startswith('-'):  # [2]
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue
        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form
        words_lem.append(lemma)

    # [6]
    return [lemma for lemma in words_lem if lemma not in stopword_set]

In [11]:
%%time
# Clean the raw article text.
# Series.apply has no axis argument; the former stray positional `1` was bound to an
# unrelated parameter, so the function is now passed on its own.
news['title'] = news['title'].apply(clean_text)

Wall time: 44.3 s


In [12]:
%%time
# Lemmatize the cleaned text; each title becomes a list of lemmas.
# Series.apply has no axis argument; the former stray positional `1` was bound to an
# unrelated parameter, so the function is now passed on its own.
news['title'] = news['title'].apply(lemmatization)

Wall time: 5min 45s


In [13]:
# Collect the token lists, one per article (already tokenized by the lemmatization step)
texts = news['title'].tolist()

# Build the token -> id dictionary and the bag-of-words corpus from those lists
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [14]:
from gensim.models import LdaModel
# Train the model on the corpus. The seed is fixed so that the learned topics, and every
# metric derived from them further down, can be reproduced on a re-run.
# (passes=10 can be added for more training passes over the corpus.)
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary, random_state=0)

Wall time: 1min


In [15]:
from gensim.test.utils import datapath
# Save model to disk.
# NOTE(review): datapath() presumably resolves inside gensim's bundled test-data
# directory rather than the project folder — confirm this is the intended location.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Reload the model that was just saved (round trip through disk).
lda = LdaModel.load(temp_file)

In [16]:
# Sanity check: re-score the first three articles (they were part of the training corpus).
other_texts = news['title'].iloc[:3].tolist()
other_corpus = list(map(common_dictionary.doc2bow, other_texts))

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(6, 0.25256193),
 (12, 0.13096908),
 (18, 0.43434012),
 (20, 0.020914292),
 (21, 0.11208652),
 (22, 0.0317936)]

In [17]:
x = lda.show_topics(num_topics=25, num_words=7, formatted=False)
topics_words = [
    (topic_id, [word for word, _ in word_weights])
    for topic_id, word_weights in x
]

# Print only the top words of every topic, one topic per line
for topic, words in topics_words:
    print("topic_{}: ".format(topic) + " ".join(words))

topic_0: исследование сша страна эксперт мочь россия российский
topic_1: тело температура фестиваль человек жертва штат помощь
topic_2: парламент кожа космос озеро вход святой пляж
topic_3: фонд доклад ким шутка чен сустав петров
topic_4: погибнуть произойти человек взрыв восток операция данные
topic_5: ракета пациент запуск исследование министерство технология область
topic_6: препарат организм рассказывать фильм стать специальный получить
topic_7: компания доллар продажа доход производитель египет продукция
topic_8: ребёнок гражданин человек автор также тысяча россия
topic_9: экономика земля млн также тыс цена район
topic_10: проверка закон налоговый законопроект вирус поправка депутат
topic_11: президент россия путин сша российский политический глава
topic_12: газ украина украинский поток турция рейс турецкий
topic_13: наука японский офицер япония подросток академия клинический
topic_14: суд решение банк рубль дело глава министр
topic_15: рак смерть спрос дональд университет предпри

In [18]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    """Return the 25-element topic vector of one tokenized document.

    ``text`` is a list of tokens. It is converted to bag-of-words with
    ``common_dictionary`` and scored with ``lda``; topics that the model
    does not report for the document get weight 0.
    """
    bow = common_dictionary.doc2bow(text)
    # lda[bow] yields (topic_id, weight) pairs only for the topics it reports
    topic_weights = dict(lda[bow])
    return np.array([topic_weights.get(topic_id, 0) for topic_id in range(25)])

In [19]:
# One LDA topic vector per article, with the article id as the first column
topic_matrix = pd.DataFrame(
    [get_lda_vector(text) for text in news['title'].values],
    columns=[f'topic_{i}' for i in range(25)],
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.010953,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076575,0.0,0.017442,0.351322
1,4896,0.0,0.0,0.0,0.0,0.478572,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.432885,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.252562,0.0,0.0,...,0.0,0.0,0.0,0.434363,0.0,0.02091,0.112069,0.031794,0.0,0.0
3,4898,0.0,0.0,0.0,0.0,0.0,0.0,0.046793,0.0,0.0,...,0.0,0.0,0.385373,0.319156,0.108267,0.0,0.130113,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.404616,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.146152,0.0,0.0,0.0


### 2. Векторные представления пользователей

In [20]:
# Show the user table again as a reminder of its layout before building user vectors.
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [21]:
# Lookup table: article id -> its row of topic weights
doc_dict = dict(zip(
    topic_matrix['doc_id'].values,
    topic_matrix[[f'topic_{i}' for i in range(25)]].values,
))

In [46]:
def get_user_embedding(user_articles_list, agg_func=np.max, doc_vectors=None):
    """Aggregate the topic vectors of a user's articles into one user vector.

    Parameters
    ----------
    user_articles_list : str or sequence
        Article ids read by the user, either as a sequence or as the text of a
        Python list literal (the form stored in the CSV), e.g. ``"[1, 2, 3]"``.
    agg_func : callable, optional
        Reduction called as ``agg_func(matrix, 0)``, i.e. along the article axis.
        Defaults to ``np.max``; ``np.mean`` and ``np.median`` fit the same call.
    doc_vectors : mapping, optional
        Article id -> topic vector. Defaults to the notebook-level ``doc_dict``.

    Returns
    -------
    numpy.ndarray
        One value per topic.

    Raises
    ------
    ValueError, SyntaxError
        If a string argument is not a valid Python literal.
    KeyError
        If an article id is missing from the lookup table.
    """
    if isinstance(user_articles_list, str):
        # literal_eval only accepts literals, unlike eval, which would execute
        # arbitrary expressions found in the data file.
        user_articles_list = ast.literal_eval(user_articles_list)
    if doc_vectors is None:
        doc_vectors = doc_dict

    user_vector = np.array([doc_vectors[doc_id] for doc_id in user_articles_list])
    return agg_func(user_vector, 0)

In [25]:
# One aggregated topic vector per user, with the user id as the first column.
# Series.apply has no axis argument; the former stray positional `1` was bound to an
# unrelated parameter, so the function is now passed on its own.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.17303,0.084241,0.0,0.0,0.193371,0.080112,0.145909,0.043919,0.20056,...,0.0,0.039243,0.441957,0.056205,0.123881,0.645629,0.179398,0.028564,0.087938,0.111608
1,u108690,0.14841,0.024556,0.012508,0.0,0.080243,0.045512,0.07603,0.031612,0.119225,...,0.104717,0.012879,0.456719,0.051033,0.106633,0.146451,0.0,0.109588,0.035219,0.345399
2,u108339,0.096342,0.131598,0.026179,0.0,0.177942,0.027698,0.022997,0.011191,0.099326,...,0.024304,0.0,0.11679,0.036324,0.162607,0.066967,0.0,0.136941,0.153679,0.262927


In [26]:
# Load the churn labels; per the displayed output there is one `churn` flag per `uid`.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [27]:
# Attach the churn label to every user vector (left join on the shared columns)
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.17303,0.084241,0.0,0.0,0.193371,0.080112,0.145909,0.043919,0.20056,...,0.039243,0.441957,0.056205,0.123881,0.645629,0.179398,0.028564,0.087938,0.111608,0
1,u108690,0.14841,0.024556,0.012508,0.0,0.080243,0.045512,0.07603,0.031612,0.119225,...,0.012879,0.456719,0.051033,0.106633,0.146451,0.0,0.109588,0.035219,0.345399,1
2,u108339,0.096342,0.131598,0.026179,0.0,0.177942,0.027698,0.022997,0.011191,0.099326,...,0.0,0.11679,0.036324,0.162607,0.066967,0.0,0.136941,0.153679,0.262927,1


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [29]:
# Split into train/test: the 25 topic columns are the features, `churn` is the label
X_train, X_test, y_train, y_test = train_test_split(
    X[[f'topic_{i}' for i in range(25)]],
    X['churn'],
    random_state=0,
)

In [30]:
# Logistic regression with the library's default hyperparameters.
logreg = LogisticRegression()
# Fit on the training split.
logreg.fit(X_train, y_train)

LogisticRegression()

In [31]:
# Predictions for the test split: second column of predict_proba
# (class 1, i.e. churn, given the 0/1 labels shown above).
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.21386174, 0.00279442, 0.59181874, 0.56001186, 0.02553244,
       0.04127121, 0.14720039, 0.02824227, 0.00924216, 0.35823844])

In [32]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

### Precision, Recall, F_score

In [33]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score per operating point. Where precision + recall == 0 the score is defined as 0
# instead of 0/0 = NaN, because np.argmax would otherwise select the NaN entry.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall hold one more point than thresholds; skip that last point so the
# chosen index is always valid for thresholds[ix].
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.346043, F-Score=0.796, Precision=0.804, Recall=0.788


In [34]:
# Threshold-independent quality of the predicted probabilities on the test split.
roc_auc_score(y_test, preds)

0.9732240246525962

In [52]:
# Metrics for the three user-vector aggregation variants (mean / median / max), stored as literals.
# NOTE(review): presumably copied by hand from separate runs of the cells above; nothing
# here recomputes them, so they may not match the outputs currently shown — confirm.
mean_dict = {'Best Threshold': 0.298, 'F-Score': 0.691, 'Precision': 0.652, 'Recall': 0.735, 'roc_auc_score': 0.95}
median_dict = {'Best Threshold': 0.279, 'F-Score': 0.634, 'Precision': 0.568, 'Recall': 0.718, 'roc_auc_score': 0.931}
max_dict = {'Best Threshold': 0.327, 'F-Score': 0.814, 'Precision': 0.782, 'Recall': 0.849, 'roc_auc_score': 0.975}


In [53]:
from tabulate import tabulate

In [59]:
# One row per metric: display label followed by the mean / median / max variant values
table = [
    [label, mean_dict.get(key), median_dict.get(key), max_dict.get(key)]
    for label, key in [
        ('Best Threshold', 'Best Threshold'),
        ('F-Score', 'F-Score'),
        ('Precision', 'Precision'),
        ('Recall', 'Recall'),
        ('ROC AUC score', 'roc_auc_score'),
    ]
]
headers = ["mean", "median", "max"]
print(tabulate(table, headers, tablefmt="grid"))

+----------------+--------+----------+-------+
|                |   mean |   median |   max |
| Best Threshold |  0.298 |    0.279 | 0.327 |
+----------------+--------+----------+-------+
| F-Score        |  0.691 |    0.634 | 0.814 |
+----------------+--------+----------+-------+
| Precision      |  0.652 |    0.568 | 0.782 |
+----------------+--------+----------+-------+
| Recall         |  0.735 |    0.718 | 0.849 |
+----------------+--------+----------+-------+
| ROC AUC score  |  0.95  |    0.931 | 0.975 |
+----------------+--------+----------+-------+


Наилучший результат получен при использовании функции max для формирования векторов пользователей. 
Я думаю, это связано с тем, что max для каждой темы берёт наибольшую вероятность среди всех статей пользователя. То есть ведущие темы пользователя определяются более чётко. 

Положительную динамику при обучении дало внесение дополнительных "мусорных" слов в список stopword_ru.