## News portal

You need to build a churn forecasting model.

We need:

     user vector representation
     segments describing the interests of the user

p.s. in the context of our task is the same

#### Let's do a clustering of documents with a subsequent attempt to interpret them (cluster)

In [1]:
import pandas as pd

#### Our news

In [2]:
news = pd.read_csv("./data/articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


#### Download users and lists of the latest read news

In [4]:
users = pd.read_csv("./data/users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


#### So, we need to get vector representations of users based on the news they read and the news itself.

In [5]:
# !pip install razdel pymorphy2 pyLDAvis

#### text preprocessing

In [6]:
import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize  # Russian text segmentation into tokens and sentences
import pymorphy2  # Morphological analyzer

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/recpi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

151


In [10]:
with open('./data/stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
    
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [12]:
def clean_text(text):
    '''
    text cleaning
    
    cleared text output
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub('n', ' ', text)
    
    return text

cache = {}
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):    
    '''
    lemmatization
         [0] if the entered type is not `str`, make it `str`
         [1] sentence tokenization via section
         [2] checking if there is a '-' at the beginning of a word
         [3] token validation from one character
         [4] checking if the given word is in the cache
         [5] word lemmatization
         [6] check for stop words

     the output is a list of lemmatized tokens
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w) > 1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [13]:
from tqdm import tqdm
tqdm.pandas()

# We start cleaning the text.
news['title'] = news['title'].progress_apply(lambda x: clean_text(x))

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
100%|███████████████████████████████████| 27000/27000 [00:26<00:00, 1024.06it/s]


In [14]:
news['title'].iloc[:10]

0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
2    форвард авангарда томаш заборский прокомментир...
3    главный тренер кубани юрий красножан прокоммен...
4    решением попечительского совета владивостокско...
5    ио главного тренера вячеслав буцаев прокоммент...
6    запорожский металлург дома потерпел разгромное...
7    сборная сша одержала победу над австрией со сч...
8    бывший защитник сборной россии дарюс каспарайт...
9    полузащитник цска зоран тошич после победы над...
Name: title, dtype: object

In [15]:
# We start lemmatization of the text.
news['title'] = news['title'].progress_apply(lambda x: lemmatization(x))

100%|████████████████████████████████████| 27000/27000 [03:20<00:00, 134.51it/s]


In [16]:
# create a list of our texts
texts = list(news['title'].values)

# Create a corpus from a list with texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

#### Launching training

In [17]:
N_topic = 20  # set 20 topics

In [18]:
from gensim.models import LdaModel

# We train the model on the body
lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary, passes=2)  # could change

In [19]:
from gensim.test.utils import datapath

# Save the model to disk
temp_file = datapath("model.lda")
lda.save(temp_file)

In [20]:
# Loading the trained model from disk
lda = LdaModel.load(temp_file)

In [23]:
# Create a new corpus of documents that have not been seen before
other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'играть', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(4, 0.3255905),
 (5, 0.11816321),
 (7, 0.3375133),
 (14, 0.091537915),
 (15, 0.107379794)]

#### what our themes look like

In [25]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

# Printing only words
for topic, words in topics_words:
    print(f"topic_{topic}: " + " ".join(words))

topic_0: газ ракета турция поток запустить турецкий российский
topic_1: всё очень большой уровень рынок например жизнь
topic_2: ребёнок писать семья журнал возраст болезнь мужчина
topic_3: взрыв товар квартира следствие конкурс министерство рф
topic_4: млн тыс цена составить поверхность снижение место
topic_5: фонд решение сша россия закон против миссия
topic_6: район дом сотрудник день житель область пострадать
topic_7: научный первый университет планета новый звезда вода
topic_8: продукция высота вуз одежда производитель спасение грунт
topic_9: военный станция сша экипаж сила боевой северный
topic_10: россия пациент мероприятие белоруссия путин новость владимир
topic_11: земля рак космос сон билет доклад су
topic_12: млрд рубль банк рост объём млн экономика
topic_13: снижение граница диапазон ресторан вдвое почва таможенный
topic_14: наука автор запуск первый сша китай опубликовать
topic_15: кость лётчик автобус франция австралийский ядро медик
topic_16: украина гражданин киев народн

#### Let's write a function that will return a vector representation of the news.

In [27]:
def get_lda_vector(lda, text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]

    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [28]:
get_lda_vector(lda, news['title'].iloc[0])

array([0.        , 0.        , 0.        , 0.5775367 , 0.        ,
       0.        , 0.        , 0.04580013, 0.        , 0.        ,
       0.14273918, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01526056, 0.        , 0.2113193 ])

In [32]:
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+[f'topic_{i}' for i in range(N_topic)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.0,0.0,0.0,0.577542,0.0,0.0,0.0,0.045791,0.0,...,0.142748,0.0,0.0,0.0,0.0,0.0,0.0,0.015261,0.0,0.211314
1,4896,0.0,0.0,0.0,0.695472,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.280832,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.325586,0.118162,0.0,0.337524,0.0,...,0.0,0.0,0.0,0.0,0.091538,0.107374,0.0,0.0,0.0,0.0
3,4898,0.0,0.300112,0.0,0.054843,0.107605,0.0,0.0,0.232914,0.0,...,0.0,0.012029,0.0,0.0,0.0,0.038936,0.0,0.0,0.245219,0.0
4,4899,0.0,0.0,0.0,0.0,0.184195,0.0,0.0,0.0,0.0,...,0.789316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### we got our news vectors! And we know how to interpret the resulting themes.

### user vector representations

In [30]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [33]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values))

In [34]:
doc_dict[293672]

array([0.        , 0.05187816, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.16319355, 0.        , 0.33586147, 0.        , 0.03581395,
       0.        , 0.        , 0.05621139, 0.        , 0.33758086])

In [35]:
def get_user_embedding(user_articles_list, doc_dict, func):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = func(user_vector, axis=0)
    return user_vector

In [36]:
user_articles_list = users['articles'].iloc[33]

get_user_embedding(user_articles_list, doc_dict, np.mean)

array([0.02451139, 0.10684752, 0.        , 0.05170694, 0.02833607,
       0.16498322, 0.06280895, 0.00789706, 0.        , 0.05159412,
       0.09180262, 0.        , 0.00408273, 0.        , 0.04104322,
       0.        , 0.00484573, 0.12723623, 0.0154221 , 0.20525973])

#### The dataset is ready - you can try to train the model. Let's load our markup

In [66]:
target = pd.read_csv("./data/users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [67]:
target.loc[target['uid']=='u108339']

Unnamed: 0,uid,churn
1927,u108339,1


#### Now we will get embeddings for all users and check their quality on a specific downstream task

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [69]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             classification_report, precision_recall_curve, confusion_matrix)

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [71]:
FUNC = [np.mean,np.median,np.max]

In [74]:
def change_func(FUNC):
    metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
    for func in FUNC:
        user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, func))])
        user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
        user_embeddings['uid'] = users['uid'].values
        user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
        
        X = pd.merge(user_embeddings, target, 'left')
        X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)
        model = LogisticRegression()
        # will train 
        model.fit(X_train, y_train)
        # our predictions for the test sample
        preds = model.predict_proba(X_test)[:, 1]
        
        precision, recall, thresholds = precision_recall_curve(y_test, preds)
        fscore = (2 * precision * recall) / (precision + recall)
        # locate the index of the largest f score
        ix = np.argmax(fscore)
        print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')   
        roc_auc = roc_auc_score(y_test, preds)
        print(f'roc_auc= {roc_auc}')
        
        metrics_df = metrics_df.append({
        'model': func.__name__,
        'thresh': thresholds[ix],
        'F-Score': fscore[ix],
        'Precision': precision[ix],
        'Recall': recall[ix],
        'ROC AUC': roc_auc
        }, ignore_index=True)
    return metrics_df

In [91]:
change_tab = change_func(FUNC)
change_tab

Best Threshold=0.33316306455230127, F-Score=0.742, Precision=0.771, Recall=0.714
roc_auc= 0.964777021919879


  metrics_df = metrics_df.append({


Best Threshold=0.3151218946422944, F-Score=0.789, Precision=0.763, Recall=0.816
roc_auc= 0.9760451189022618


  metrics_df = metrics_df.append({


Best Threshold=0.40079181649557616, F-Score=0.789, Precision=0.826, Recall=0.755
roc_auc= 0.9786196871911157


  metrics_df = metrics_df.append({


Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.333163,0.741525,0.770925,0.714286,0.964777
1,median,0.315122,0.788955,0.763359,0.816327,0.976045
2,amax,0.400792,0.788913,0.825893,0.755102,0.97862


### Let's try to weigh the news by tfidf (taking the user's list of news)

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [78]:
users['articles_str'] = users['articles'].apply(lambda x: x.replace('[','').replace(']', '').replace(',', ''))

users['articles_str'].iloc[0]

'293672 293328 293001 293622 293126 1852'

In [79]:
tfidf = TfidfVectorizer()
tfidf.fit(users['articles_str'])

TfidfVectorizer()

In [80]:
idf = pd.DataFrame({'article_id': tfidf.get_feature_names_out(),
                    'idf': tfidf.idf_})

idf

Unnamed: 0,article_id,idf
0,10,8.888710
1,100,7.907880
2,1000,8.041412
3,1001,8.888710
4,1002,8.888710
...,...,...
14776,995,8.377884
14777,996,8.195562
14778,997,8.601027
14779,998,9.294175


In [81]:
def get_user_embedding_idf(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    
    user_vector = np.zeros((len(user_articles_list), N_topic))
    for i, doc_id in enumerate(user_articles_list):
        try:
            weight = idf[idf['article_id'] == str(doc_id)]['idf'].values[0]
        except Exception as e:
            weight = 0
        user_vector[i] = doc_dict[doc_id] * weight

    user_vector = np.median(user_vector, axis=0)
    return user_vector

In [82]:
from tqdm import tqdm
tqdm.pandas()

user_embeddings = pd.DataFrame([i for i in users['articles'].progress_apply(lambda x: get_user_embedding_idf(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

100%|███████████████████████████████████████| 8000/8000 [01:48<00:00, 73.85it/s]


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.105247,0.061665,0.0,0.0,0.0,0.0,0.0,0.0,...,1.221923,0.0,0.503128,0.0,0.0,0.0,0.090731,0.892628,0.626059,0.794026
1,u108690,0.0,0.84502,0.181361,0.0,0.143945,0.624564,0.2717,0.054869,0.0,...,0.342481,0.0,0.0,0.0,0.271889,0.0,0.211135,1.513369,0.3867,1.666428
2,u108339,0.050921,0.0,0.28562,0.594515,0.120825,0.641346,1.700731,0.046063,0.0,...,0.335657,0.0,0.21511,0.0,0.125005,0.0,0.0,1.124439,0.419803,1.79864


In [83]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.105247,0.061665,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.503128,0.0,0.0,0.0,0.090731,0.892628,0.626059,0.794026,0
1,u108690,0.0,0.84502,0.181361,0.0,0.143945,0.624564,0.2717,0.054869,0.0,...,0.0,0.0,0.0,0.271889,0.0,0.211135,1.513369,0.3867,1.666428,1
2,u108339,0.050921,0.0,0.28562,0.594515,0.120825,0.641346,1.700731,0.046063,0.0,...,0.0,0.21511,0.0,0.125005,0.0,0.0,1.124439,0.419803,1.79864,1


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [85]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [86]:
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([1.51440662e-02, 6.26332878e-05, 9.47527536e-01, 3.01401444e-01,
       4.63881861e-03, 4.62870904e-04, 1.71638267e-03, 1.45788608e-02,
       7.24456253e-03, 4.74336421e-03])

In [87]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.48644142239736204, F-Score=0.882, Precision=0.882, Recall=0.882


In [88]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9924739810454095

In [89]:
metrics_df = metrics_df.append({
    'model': 'idf_median',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

  metrics_df = metrics_df.append({


In [90]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,idf_median,0.486441,0.881633,0.881633,0.881633,0.992474


In [93]:
common_table = pd.concat([change_tab,metrics_df],axis=0)
common_table

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.333163,0.741525,0.770925,0.714286,0.964777
1,median,0.315122,0.788955,0.763359,0.816327,0.976045
2,amax,0.400792,0.788913,0.825893,0.755102,0.97862
0,idf_median,0.486441,0.881633,0.881633,0.881633,0.992474
