# Задача «Радар тенденций новостных статей»

## Введение

Фраза “в нужный момент в нужном месте” хорошо описывает положение авторских текстов. Иногда качественно написанная статья проходит мимо своей потенциальной аудитории из-за более актуальных тем дня или неудачного заголовка.

Хорошо, что алгоритмы ИИ активно продвинулись в анализе текста и способны в автоматическом режиме анализировать и вычленять тенденции, а имея большой набор данных, можно научиться предсказывать их наперед.

Разумеется, есть общемировые темы, которые невозможно предсказать, — например, пандемия коронавируса или застрявший контейнеровоз. Тем не менее исследования специалистов показывают, что в обществе есть тенденции, которые приходят и уходят в фиксированные временные периоды.

## Условие задачи

У компании РБК довольно взрослая аудитория, которую она хочет расширить за счет добавления статей на актуальные темы. Для этого вам нужно проанализировать лучшие новости российских СМИ и научиться предсказывать их популярность. Ожидается, что для этого будут использованы NLP модели.

## Описание входных значений

* train.csv — файл для обучения, содержит 7000 строк, каждая из которых представляет собой одну новостную статью
* test.csv — файл, содержащий 3000 строк, для предсказания
* sample_solution.csv — пример файла для отправки

## В наборе данных присутствует 11 столбцов:

* document_id - идентификатор
* title - заголовок статьи
* publish_date - время публикации
* session - номер сессии
* authors - код автора
* views - количество просмотров
* depth - объем прочитанного материала
* full_reads_percent - процент читателей, полностью прочитавших статью
* ctr - показатель кликабельности
* category - категория статьи
* tags - ключевые слова в статье

## На что стоит обратить внимание

Разрешено использование предобученных моделей. Платные модели или "приватные" модели использовать не разрешается.

## Метрика

Цель модели участников — предсказать 3 численные характеристики, которые в полной мере показывают популярность статьи: views, full_reads_percent, depth.

Для оценки качества решения используется метрика R2:

$result = 0.4 \cdot R2_{views} + 0.3 \cdot R2_{full\_reads\_percent} + 0.3 \cdot R2_{depth}$

## Импорт необходимых библиотек и настройка среды

In [1]:
import nltk
import ruts
import warnings
import pymorphy2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import ru_core_news_md

from datetime import datetime
from string import punctuation
from tqdm.notebook import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Silence every Python warning for the rest of the notebook
warnings.filterwarnings("ignore")
# Register tqdm's pandas integration (needed for the progress_apply calls used below)
tqdm.pandas()
# Show at most 20 rows when a DataFrame/Series is displayed
pd.set_option('display.max_rows', 20)
# Remove the limit on the number of displayed columns
pd.set_option('display.max_columns', None)
# Remove the limit on the number of characters displayed per cell
pd.set_option('display.max_colwidth', None)

In [3]:
RANDOM_STATE = 42

## Загрузка датасета

In [4]:
train_data = pd.read_csv('train_pr_v2.csv', index_col='document_id')

In [5]:
#train_data.head(3)

In [6]:
test_data = pd.read_csv('test_pr_v2.csv', index_col='document_id')

In [7]:
#test_data.head(3)

### Описание датасетов

In [8]:
train_data.describe()

Unnamed: 0,ctr,views,depth,full_reads_percent,div_count,main_image,inline_items,url_count,rbk_pro,recommend_video,video_count,gallery,image_count
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,2.217779,30351.62,1.104794,34.619633,61.124571,0.864571,1.225571,5.063714,0.824857,0.441143,0.110714,0.006714,0.215714
std,2.778085,95477.86,0.065018,10.775901,24.501148,0.342205,0.730886,4.070805,0.380116,0.496559,0.3417,0.081671,1.156201
min,0.0,65.0,1.016,4.978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5438.25,1.055,27.516,67.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0
50%,1.3485,11962.0,1.082,34.3345,68.5,1.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0
75%,3.5985,27053.0,1.147,41.36625,73.0,1.0,2.0,7.0,1.0,1.0,0.0,0.0,0.0
max,39.877,2554204.0,1.799,267.623,180.0,1.0,8.0,89.0,1.0,1.0,8.0,1.0,26.0


In [9]:
test_data.describe()

Unnamed: 0,ctr,div_count,main_image,inline_items,url_count,rbk_pro,recommend_video,video_count,gallery,image_count
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,2.286467,61.721333,0.854,1.259,5.127333,0.831667,0.464,0.121667,0.006667,0.244667
std,2.833735,24.016868,0.353165,0.772087,4.426396,0.374224,0.498785,0.342884,0.081391,1.355514
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,67.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0
50%,1.4485,69.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0
75%,3.654,73.0,1.0,2.0,7.0,1.0,1.0,0.0,0.0,0.0
max,30.531,181.0,1.0,9.0,89.0,1.0,1.0,3.0,1.0,33.0


### Информация о датасете

In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ to 627f1c089a794743b070ff73hVvdVmFxS2SlZ2_lECDEow
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               7000 non-null   object 
 1   publish_date        7000 non-null   object 
 2   session             7000 non-null   object 
 3   authors             7000 non-null   object 
 4   ctr                 7000 non-null   float64
 5   category            7000 non-null   object 
 6   tags                7000 non-null   object 
 7   views               7000 non-null   int64  
 8   depth               7000 non-null   float64
 9   full_reads_percent  7000 non-null   float64
 10  url_id              7000 non-null   object 
 11  url                 7000 non-null   object 
 12  div_count           7000 non-null   int64  
 13  overview            7000 non-null   object 
 14  main_image          70

In [11]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 61f9569a9a794794245a82abJ0AvX96vTAaQCiWVbzoMdw to 6236f9129a79477b0ef18ae0ewByZQ1FQBK2dpXyY77rYw
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3000 non-null   object 
 1   publish_date     3000 non-null   object 
 2   session          3000 non-null   object 
 3   authors          3000 non-null   object 
 4   ctr              3000 non-null   float64
 5   category         3000 non-null   object 
 6   tags             3000 non-null   object 
 7   url_id           3000 non-null   object 
 8   url              3000 non-null   object 
 9   div_count        3000 non-null   int64  
 10  overview         3000 non-null   object 
 11  main_image       3000 non-null   int64  
 12  inline_items     3000 non-null   int64  
 13  url_count        3000 non-null   int64  
 14  rbk_pro          3000 non-null   int64  
 15  recommend_video  3000 no

### Проверка на отсутствующие данные

In [12]:
train_data.isna().sum()

title           0
publish_date    0
session         0
authors         0
ctr             0
               ..
video_count     0
gallery         0
image_count     0
tags_word       0
text            0
Length: 24, dtype: int64

In [13]:
test_data.isna().sum()

title           0
publish_date    0
session         0
authors         0
ctr             0
               ..
video_count     0
gallery         0
image_count     0
tags_word       0
text            0
Length: 21, dtype: int64

### Уникальные значения

In [14]:
train_data.nunique()

title           6935
publish_date    6985
session         5901
authors          561
ctr             3178
                ... 
video_count        5
gallery            2
image_count       19
tags_word       6798
text            6809
Length: 24, dtype: int64

In [15]:
test_data.nunique()

title           2983
publish_date    3000
session         2718
authors          326
ctr             1617
                ... 
video_count        4
gallery            2
image_count       16
tags_word       2945
text            2932
Length: 21, dtype: int64

### Наличие дубликатов

In [16]:
set(train_data.duplicated())

{False}

In [17]:
set(test_data.duplicated())

{False}

## Feature engineering

In [18]:
df_train = train_data.copy()

In [19]:
# full_reads_percent is a percentage, but the raw training data contains values
# above 100 (describe() above reports a maximum of 267.623); cap them at 100.
df_train['full_reads_percent'] = df_train['full_reads_percent'].clip(upper=100)

In [20]:
#df_train.head(3)

### document_id

In [21]:
# def document_id_transform(dataframe):
#     df = dataframe.copy()
#     df['doc_id'] = df.index
#     for index, row in tqdm(df.iterrows()):
#         df.at[index, 'doc_id'] = row['doc_id'].replace(row['session'], '')
#     return df

In [22]:
#df_train = document_id_transform(df_train)

In [23]:
#df_train.head(3)

### ctr

In [24]:
# ctr = df_train['ctr'].to_list()
# plt.figure(figsize=(24, 12))
# plt.scatter(range(len(ctr)), ctr)
# plt.show()

In [25]:
# mm_scaler = MinMaxScaler()
# ctr_mm_scale = mm_scaler.fit_transform(df_train['ctr'].values.reshape(-1, 1))

In [26]:
# plt.figure(figsize=(24, 12))
# plt.scatter(range(len(ctr_mm_scale)), ctr_mm_scale)
# plt.show()

In [27]:
# std_scaler = StandardScaler()
# ctr_std_scale = std_scaler.fit_transform(df_train['ctr'].values.reshape(-1, 1))

In [28]:
# plt.figure(figsize=(24, 12))
# plt.scatter(range(len(ctr_std_scale)), ctr_std_scale)
# plt.show()

In [29]:
#df_train['ctr'] = ctr_std_scale

In [30]:
#df_train.head(3)

### publish_date

In [31]:
# Distinct calendar days (the 'YYYY-MM-DD' prefix of publish_date) seen in the
# training set, in ascending order, with the ten earliest days left out.
# NOTE(review): presumably the earliest days are too sparse to deserve their own
# one-hot column -- confirm the reason for the [10:] cut-off.
unique_days = {stamp[:10] for stamp in df_train['publish_date'].to_list()}
dates = sorted(unique_days)[10:]
#dates

In [32]:
def publish_date_transform(dataframe, date_labels=None):
    """Derive calendar features from the ``publish_date`` column.

    Works on a copy of ``dataframe``; the input is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``publish_date`` column of strings formatted as
        ``'%Y-%m-%d %H:%M:%S'``.
    date_labels : iterable of str, optional
        Calendar days (``'YYYY-MM-DD'``) that each get a one-hot ``date=<day>``
        column. Defaults to the module-level ``dates`` list, which is what the
        function always used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The copy with ``publish_date`` parsed to datetimes plus, in this order:
        ``month=2022-02`` .. ``month=2022-05`` and ``month=other`` (0/1 flags),
        ``hour_sin`` / ``hour_cos`` (the hour of day mapped onto a circle),
        one ``date=<day>`` flag per entry of ``date_labels`` and
        ``weekday=1`` .. ``weekday=7`` flags (ISO weekday, Monday = 1).

    Raises
    ------
    ValueError
        If a ``publish_date`` value does not match the expected format.
    """
    if date_labels is None:
        date_labels = dates

    df = dataframe.copy()
    df['publish_date'] = pd.to_datetime(df['publish_date'], format='%Y-%m-%d %H:%M:%S')
    stamps = df['publish_date']

    # Month flags: comparing the 'YYYY-MM' label is equivalent to the explicit
    # first-day/last-day range checks, and everything else falls into "other".
    month_labels = ['2022-02', '2022-03', '2022-04', '2022-05']
    year_month = stamps.dt.strftime('%Y-%m')
    for label in month_labels:
        df[f'month={label}'] = (year_month == label).astype('int64')
    df['month=other'] = (~year_month.isin(month_labels)).astype('int64')

    # Cyclic encoding keeps 23:00 and 00:00 close to each other.
    hour_angle = 2 * np.pi * stamps.dt.hour / 24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    # Format the day once instead of calling str(x.date()) for every row and
    # every date column.
    day = stamps.dt.strftime('%Y-%m-%d')
    for d in date_labels:
        df[f'date={d}'] = (day == d).astype('int64')

    # dayofweek is 0-based from Monday; shift it to the ISO 1..7 numbering.
    iso_weekday = stamps.dt.dayofweek + 1
    for w in range(1, 8):
        df[f'weekday={w}'] = (iso_weekday == w).astype('int64')
    return df

In [33]:
df_train = publish_date_transform(df_train)

In [34]:
#df_train.head(3)

### category

In [35]:
print(Counter(df_train['category'].to_list()).most_common())

[('politics', 3988), ('society', 1456), ('business', 667), ('economics', 338), ('technology_and_media', 283), ('finances', 265), ('auto', 1), ('money', 1), ('realty', 1)]


In [36]:
print(Counter(test_data['category'].to_list()).most_common())

[('politics', 1718), ('society', 637), ('business', 273), ('economics', 153), ('technology_and_media', 127), ('finances', 92)]


In [37]:
list_of_categories = ['politics', 'society', 'business', 'economics', 'technology_and_media', 'finances']

In [38]:
def category_transform(dataframe, categories=None):
    """One-hot encode the ``category`` column.

    Works on a copy of ``dataframe``; the input is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``category`` column.
    categories : list of str, optional
        Categories that get their own ``category=<name>`` column. Defaults to
        the module-level ``list_of_categories``, which is what the function
        always used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The copy with one 0/1 ``category=<name>`` column per listed category
        and a ``category=other`` column flagging every value outside the list.
    """
    if categories is None:
        categories = list_of_categories

    df = dataframe.copy()
    labels = df['category']
    for c in categories:
        df[f'category={c}'] = (labels == c).astype('int64')
    # Anything not explicitly listed is collected in a single "other" bucket.
    df['category=other'] = (~labels.isin(categories)).astype('int64')
    return df

In [39]:
df_train = category_transform(df_train)

In [40]:
#df_train.head(3)

### authors

In [41]:
def bag_of_words(dataframe, count_vec, col_name):
    """Append the vectorizer output for one text column as new columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column ``col_name``; it is not modified.
    count_vec : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (or the
        legacy ``get_feature_names``), e.g. a fitted ``CountVectorizer``.
    col_name : str
        Name of the text column to vectorize.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with one extra column per vocabulary entry,
        named ``<col_name>=<token>``. The sparse result is densified with
        ``toarray()``, so memory grows with rows x vocabulary size.
    """
    df = dataframe.copy()
    x = count_vec.transform(df[col_name].to_list())
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for old installations.
    if hasattr(count_vec, 'get_feature_names_out'):
        vocabulary = count_vec.get_feature_names_out()
    else:
        vocabulary = count_vec.get_feature_names()
    cols_n = [f'{col_name}={c}' for c in vocabulary]
    df_ = pd.DataFrame(x.toarray(), columns=cols_n, index=df.index)
    return pd.concat([df, df_], axis=1)

In [42]:
# authors_number_of_papers = {}
# authors_counter = []
# for index, row in tqdm(df_train.iterrows()):
#     list_of_authors = row['authors']
#     if list_of_authors != 'no_authors':
#         list_of_authors = list_of_authors.split(',')
#         authors_counter.extend(list_of_authors)
#     else:
#         authors_counter.append('no_authors')
# authors_number_of_papers = Counter(authors_counter)
# print(authors_number_of_papers.most_common())

In [43]:
n_authors = 10

In [44]:
# Binary (presence/absence) unigram vocabulary over the authors column; a token
# must appear in at least n_authors training rows (min_df) to become a feature.
CountVecAuthors = CountVectorizer(ngram_range=(1, 1), min_df=n_authors, binary=True)
# Fitted on the training frame only; the same fitted object is reused for test.
CountVecAuthors.fit(df_train['authors'].to_list())

CountVectorizer(binary=True, min_df=10)

In [45]:
# print(len(CountVecAuthors.get_feature_names()))
# print(CountVecAuthors.get_feature_names())

In [46]:
def authors_transform(dataframe):
    """Add an author count and per-author indicator columns.

    Works on a copy of ``dataframe``. ``authors_number`` is the number of
    comma-separated entries in ``authors`` (0 for the ``'no_authors'``
    placeholder). The indicator columns come from ``bag_of_words`` using the
    already fitted ``CountVecAuthors``.
    """
    def count_authors(raw):
        if raw == 'no_authors':
            return 0
        # k commas separate k + 1 entries.
        return raw.count(',') + 1

    enriched = dataframe.copy()
    enriched['authors_number'] = enriched['authors'].apply(count_authors)
    return bag_of_words(enriched, CountVecAuthors, 'authors')

In [47]:
#df_train = bag_of_words(df_train, CountVecAuthors, 'authors')
df_train = authors_transform(df_train)

In [48]:
#df_train.head(3)

### tags

In [49]:
#df_train['tags'] = df_train['tags_name'].apply(lambda x: x.replace(' ', '_'))

In [50]:
# tags_number_of_papers = {}
# tags_counter = []
# for index, row in tqdm(df_train.iterrows()):
#     list_of_tags = row['tags_word']
#     if list_of_tags != 'no_tags':
#         list_of_tags = list_of_tags.split(',')
#         tags_counter.extend(list_of_tags)
#     else:
#         tags_counter.append('no_tags')
# tags_number_of_papers = Counter(tags_counter)
# print(tags_number_of_papers.most_common())

In [51]:
n_tags = 10

In [52]:
# Binary (presence/absence) unigram vocabulary over the tags column; a token
# must appear in at least n_tags training rows (min_df) to become a feature.
CountVecTags = CountVectorizer(ngram_range=(1, 1), min_df=n_tags, binary=True)
# Fitted on the training frame only; the same fitted object is reused for test.
CountVecTags.fit(df_train['tags'].to_list())

CountVectorizer(binary=True, min_df=10)

In [53]:
# print(len(CountVecTags.get_feature_names()))
# print(CountVecTags.get_feature_names())

In [54]:
def tags_transform(dataframe):
    """Add a tag count and per-tag indicator columns.

    Works on a copy of ``dataframe``. ``tags_number`` is the number of
    comma-separated entries in ``tags`` (0 for the ``'no_tags'`` placeholder).
    The indicator columns come from ``bag_of_words`` using the already fitted
    ``CountVecTags``.
    """
    def count_tags(raw):
        if raw == 'no_tags':
            return 0
        # k commas separate k + 1 entries.
        return raw.count(',') + 1

    enriched = dataframe.copy()
    enriched['tags_number'] = enriched['tags'].apply(count_tags)
    return bag_of_words(enriched, CountVecTags, 'tags')

In [55]:
#df_train = bag_of_words(df_train, CountVecTags, 'tags')
df_train = tags_transform(df_train)

In [56]:
#df_train.head(3)

### title

In [57]:
def tf_idf(dataframe, tfidf, col_name):
    """Append the TF-IDF weights of one text column as new columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column ``col_name``; it is not modified.
    tfidf : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (or the
        legacy ``get_feature_names``), e.g. a fitted ``TfidfVectorizer``.
    col_name : str
        Name of the text column to vectorize.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with one extra column per vocabulary entry,
        named ``tfidf_<col_name>=<token>``. The sparse result is densified
        with ``toarray()``, so memory grows with rows x vocabulary size.
    """
    df = dataframe.copy()
    x = tfidf.transform(df[col_name].to_list())
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for old installations.
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    cols_n = [f'tfidf_{col_name}={c}' for c in vocabulary]
    df_ = pd.DataFrame(x.toarray(), columns=cols_n, index=df.index)
    return pd.concat([df, df_], axis=1)

In [58]:
#df_train['title_old'] = df_train['title'].copy()

In [59]:
russian_stopwords = stopwords.words("russian")
russian_stopwords.append('из-за')
print(russian_stopwords)

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впр

In [60]:
# Extend the ASCII punctuation characters from the string module with the
# guillemets and the em dash, which the ASCII set does not contain.
punctuation = punctuation + '«»—'
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~«»—


In [61]:
morph = pymorphy2.MorphAnalyzer()

In [62]:
def lemmatize(text):
    """Return the lemmas of ``text`` without stop words and punctuation tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens found
    in ``russian_stopwords`` and tokens made up only of characters from
    ``punctuation`` are dropped. Every remaining token is replaced by the
    ``normal_form`` of the first parse returned by ``morph.parse``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Lemmas in their original order.
    """
    stop_words = set(russian_stopwords)  # O(1) lookups instead of a list scan per token
    punct_chars = set(punctuation)
    lemmas = []
    for token in nltk.word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # `token not in punctuation` was a substring test against the whole
        # punctuation string, so multi-character tokens such as '...', '``'
        # or "''" slipped through. Check the token character by character.
        if all(ch in punct_chars for ch in token):
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas

In [63]:
def title_transform(dataframe):
    """Add title length features and replace the title by its lemmatized form.

    Works on a copy of ``dataframe``. Adds ``title_number_of_words`` (number of
    NLTK tokens in the raw title) and ``title_number_of_characters`` (length of
    the raw title string), then overwrites ``title`` with the space-joined
    output of ``lemmatize``. ``progress_apply`` requires ``tqdm.pandas()`` to
    have been called beforehand.
    """
    def token_count(raw_title):
        return len(nltk.word_tokenize(raw_title))

    def to_lemma_string(raw_title):
        return ' '.join(lemmatize(raw_title))

    result = dataframe.copy()
    raw_titles = result['title']
    # Both length features are taken from the raw title, before it is replaced.
    result['title_number_of_words'] = raw_titles.apply(token_count)
    result['title_number_of_characters'] = raw_titles.apply(len)
    result['title'] = raw_titles.progress_apply(to_lemma_string)
    return result

In [64]:
df_train = title_transform(df_train)

  0%|          | 0/7000 [00:00<?, ?it/s]

In [65]:
#df_train.head(3)

In [66]:
# titles = ' '.join(df_train['title'].to_list())
# titles = titles.split(sep=' ')
# print(len(titles))
# count_title = Counter(titles)
# print(count_title.most_common())

#### Tf-Idf

In [67]:
# TfIdfTitle = TfidfVectorizer(ngram_range=(1, 1), min_df=2)
# TfIdfTitle.fit(df_train['title'].to_list())

In [68]:
# print(len(TfIdfTitle.get_feature_names()))
# print(TfIdfTitle.get_feature_names())

In [69]:
#df_train = tf_idf(df_train, TfIdfTitle, 'title')

#### Bag of Words

In [70]:
# Binary (presence/absence) unigram vocabulary over the lemmatized titles;
# min_df=2 drops tokens that occur in only one training title.
CountVecTitle = CountVectorizer(ngram_range=(1, 1), min_df=2, binary=True)
# Fitted on the training frame only; the same fitted object is reused for test.
CountVecTitle.fit(df_train['title'].to_list())

CountVectorizer(binary=True, min_df=2)

In [71]:
# print(len(CountVecTitle.get_feature_names()))
# print(CountVecTitle.get_feature_names())

In [72]:
df_train = bag_of_words(df_train, CountVecTitle, 'title')

In [73]:
#df_train.head(3)

### overview

In [74]:
#df_train['overview_old'] = df_train['overview'].copy()

In [75]:
def overview_transform(dataframe):
    """Add overview length features and replace the overview by its lemmas.

    Works on a copy of ``dataframe``. Adds ``overview_number_of_words`` (number
    of NLTK tokens) and ``overview_number_of_characters`` (string length), both
    0 for the ``'no_overview'`` placeholder, then overwrites ``overview`` with
    the space-joined output of ``lemmatize``. The placeholder itself is passed
    through ``lemmatize`` like any other value. ``progress_apply`` requires
    ``tqdm.pandas()`` to have been called beforehand.
    """
    placeholder = 'no_overview'

    def token_count(raw):
        if raw == placeholder:
            return 0
        return len(nltk.word_tokenize(raw))

    def char_count(raw):
        if raw == placeholder:
            return 0
        return len(raw)

    def to_lemma_string(raw):
        return ' '.join(lemmatize(raw))

    result = dataframe.copy()
    raw_overviews = result['overview']
    # Both length features are taken from the raw text, before it is replaced.
    result['overview_number_of_words'] = raw_overviews.apply(token_count)
    result['overview_number_of_characters'] = raw_overviews.apply(char_count)
    result['overview'] = raw_overviews.progress_apply(to_lemma_string)
    return result

In [76]:
df_train = overview_transform(df_train)

  0%|          | 0/7000 [00:00<?, ?it/s]

In [77]:
#df_train.head(3)

In [78]:
# overviews = ' '.join(df_train['overview'].to_list())
# overviews = overviews.split(sep=' ')
# print(len(overviews))
# count_overview = Counter(overviews)
# print(count_overview.most_common())

#### Tf-Idf

In [79]:
#TfIdfOverview= TfidfVectorizer(ngram_range=(1, 1), min_df=2)
#TfIdfOverview.fit(df_train['overview'].to_list())

In [80]:
#print(len(TfIdfOverview.get_feature_names()))
#print(TfIdfOverview.get_feature_names())

In [81]:
#df_train = tf_idf(df_train, TfIdfOverview, 'overview')

#### Bag of Words

In [82]:
# Binary (presence/absence) unigram vocabulary over the lemmatized overviews;
# min_df=2 drops tokens that occur in only one training overview.
CountVecOverview = CountVectorizer(ngram_range=(1, 1), min_df=2, binary=True)
# Fitted on the training frame only; the same fitted object is reused for test.
CountVecOverview.fit(df_train['overview'].to_list())

CountVectorizer(binary=True, min_df=2)

In [83]:
# print(len(CountVecOverview.get_feature_names()))
# print(CountVecOverview.get_feature_names())

In [84]:
df_train = bag_of_words(df_train, CountVecOverview, 'overview')

In [85]:
#df_train.head(3)

### text

In [86]:
#df_train['text_old'] = df_train['text'].copy()

In [87]:
nlp_spacy = spacy.load('ru_core_news_md')

In [88]:
pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']

In [89]:
def part_of_speech(text):
    """Count the tokens of ``text`` per part-of-speech tag.

    The text is processed with the module-level spaCy pipeline ``nlp_spacy``
    and the ``pos_`` attribute of every token is tallied.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    dict
        Maps every tag in ``pos_list`` to its number of occurrences; tags that
        do not occur map to 0. Tags outside ``pos_list`` are ignored.
    """
    # Counter returns 0 for missing keys, so no separate membership scan over
    # the token list is needed for every tag.
    tag_counts = Counter(token.pos_ for token in nlp_spacy(text))
    return {tag: tag_counts[tag] for tag in pos_list}

In [90]:
base_stats_list = ['n_sents', 'n_words', 'n_unique_words', 'n_long_words', 'n_complex_words', 'n_simple_words', 'n_monosyllable_words', 'n_polysyllable_words', 'n_chars',
                   'n_letters', 'n_syllables']

In [91]:
#readability_stats_list = ['flesch_kincaid_grade', 'flesch_reading_easy', 'coleman_liau_index', 'smog_index', 'automated_readability_index', 'lix']

In [92]:
#diversity_stats_list = ['ttr', 'rttr', 'cttr', 'httr', 'sttr', 'mttr', 'dttr', 'mattr', 'msttr', 'mtld', 'mamtld', 'hdd', 'simpson_index', 'hapax_index']

In [93]:
def text_transform(dataframe):
    """Add part-of-speech counts and basic text statistics, then lemmatize text.

    Works on a copy of ``dataframe``. For every row whose ``text`` is not the
    ``'no_text'`` placeholder, the counts from ``part_of_speech`` fill the
    columns named in ``pos_list`` and the values of
    ``ruts.BasicStats(text).get_stats()`` fill the columns named in
    ``base_stats_list``. Placeholder rows get 0 in all of these columns.
    Finally ``text`` is overwritten with the space-joined output of
    ``lemmatize`` (``progress_apply`` requires ``tqdm.pandas()`` to have been
    called beforehand).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The enriched copy.
    """
    df = dataframe.copy()

    # Collect the values column-wise and assign each column once. Iterating
    # only over the text column avoids building a full-row Series per record
    # (the frame is already thousands of columns wide here), and positional
    # assignment stays correct even if index labels repeat, unlike the
    # label-based per-cell df.at writes used before.
    pos_values = {el: [] for el in pos_list}
    base_values = {el: [] for el in base_stats_list}
    for row_text in tqdm(df['text'], total=len(df)):
        if row_text != 'no_text':
            row_pos_dict = part_of_speech(row_text)
            row_bs_dict = ruts.BasicStats(row_text).get_stats()
            for el in pos_list:
                pos_values[el].append(row_pos_dict[el])
            for el in base_stats_list:
                base_values[el].append(row_bs_dict[el])
        else:
            for el in pos_list:
                pos_values[el].append(0)
            for el in base_stats_list:
                base_values[el].append(0)

    # Same column order as before: part-of-speech counts first, then basic stats.
    for el in pos_list:
        df[el] = pos_values[el]
    for el in base_stats_list:
        df[el] = base_values[el]

    # NOTE: readability and diversity statistics from ruts were disabled
    # (commented out) in the original code and are still not computed.
    df['text'] = df['text'].progress_apply(lambda x: ' '.join(lemmatize(x)))
    return df

In [94]:
df_train = text_transform(df_train)

0it [00:00, ?it/s]

  0%|          | 0/7000 [00:00<?, ?it/s]

In [95]:
#df_train.head(3)

In [96]:
#df_train[pos_list].describe()

In [97]:
# texts = ' '.join(df_train['text'].to_list())
# texts = texts.split(sep=' ')
# print(len(texts))
# count_text = Counter(texts)
# print(count_text.most_common())

#### Tf-Idf

In [98]:
#TfIdfText= TfidfVectorizer(ngram_range=(1, 1), min_df=2)
#TfIdfText.fit(df_train['text'].to_list())

In [99]:
#print(len(TfIdfText.get_feature_names()))
#print(TfIdfText.get_feature_names())

In [100]:
#df_train = tf_idf(df_train, TfIdfText, 'text')

#### Bag of Words

In [101]:
# Binary (presence/absence) unigram vocabulary over the lemmatized article
# texts; min_df=2 drops tokens that occur in only one training text.
CountVecText = CountVectorizer(ngram_range=(1, 1), min_df=2, binary=True)
# Fitted on the training frame only; the same fitted object is reused for test.
CountVecText.fit(df_train['text'].to_list())

CountVectorizer(binary=True, min_df=2)

In [102]:
# print(len(CountVecText.get_feature_names()))
# print(CountVecText.get_feature_names())

In [103]:
df_train = bag_of_words(df_train, CountVecText, 'text')

In [104]:
#df_train.head(3)

## Сохранение предобработанных датасетов

### Train

In [105]:
cols_to_drop = ['publish_date', 'session', 'authors', 'category', 'tags', 'title', 'url_id', 'url', 'tags_word', 'overview', 'text']

In [106]:
#df_train_finale = df_train.drop(cols_to_drop, axis=1)
df_train.drop(cols_to_drop, axis=1, inplace=True)

In [107]:
print(df_train.shape)

(7000, 35692)


In [108]:
#df_train.head(3)

In [109]:
#df_train.to_csv('train_features_v6.1.csv')

### Test

In [110]:
df_test = test_data.copy()

In [111]:
df_test = publish_date_transform(df_test)

In [112]:
df_test = category_transform(df_test)

In [113]:
#df_test = bag_of_words(df_test, CountVecAuthors, 'authors')
df_test = authors_transform(df_test)

In [114]:
#df_test = bag_of_words(df_test, CountVecTags, 'tags')
#df_test['tags'] = df_test['tags_name'].apply(lambda x: x.replace(' ', '_'))
df_test = tags_transform(df_test)

In [115]:
df_test = title_transform(df_test)

  0%|          | 0/3000 [00:00<?, ?it/s]

In [116]:
#df_test = tf_idf(df_test, TfIdfTitle, 'title')
df_test = bag_of_words(df_test, CountVecTitle, 'title')

In [117]:
df_test = overview_transform(df_test)

  0%|          | 0/3000 [00:00<?, ?it/s]

In [118]:
#df_test = tf_idf(df_test, TfIdfOverview, 'overview')
df_test = bag_of_words(df_test, CountVecOverview, 'overview')

In [119]:
df_test = text_transform(df_test)

0it [00:00, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

In [120]:
#df_test = tf_idf(df_test, TfIdfText, 'text')
df_test = bag_of_words(df_test, CountVecText, 'text')

In [121]:
#df_test_finale = df_test.drop(cols_to_drop, axis=1)
df_test.drop(cols_to_drop, axis=1, inplace=True)

In [122]:
print(df_test.shape)

(3000, 35689)


In [123]:
#df_test.head(3)

In [124]:
#df_test.to_csv('test_features_v6.1.csv')