### 3. TF-IDF преобразование нормализованных текстов веб-страниц

In [1]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

In [None]:
def get_top_words(corpus, top_word_count = 5):
    """Return the most frequent tokens of ``corpus``, most frequent first.

    Args:
        corpus: iterable of text documents passed to ``CountVectorizer``.
        top_word_count: maximum number of tokens to return.

    Returns:
        A list of at most ``top_word_count`` tokens ordered by descending
        total count over the whole corpus.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    # One row per document, one column per vocabulary token.
    doc_term_counts = vectorizer.transform(corpus)
    # Column sums give the total count of every token over the corpus.
    totals = doc_term_counts.sum(axis=0)
    ranked = sorted(
        ((token, totals[0, column])
         for token, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [token for token, _ in ranked[:top_word_count]]

def _read_group_texts(doc_ids):
    """Return the full text of every document in ``doc_ids``, in order.

    Each text is read from ``fulltext/<doc_id>.txt``; a document whose file
    does not exist contributes an empty string.
    """
    texts = []
    for doc_id in doc_ids:
        file_name = os.path.join('fulltext', f'{doc_id}.txt')
        if os.path.isfile(file_name):
            # NOTE(review): the file is opened with the platform default
            # encoding -- confirm how the text files were written.
            with open(file_name) as f:
                texts.append(f.read())
        else:
            texts.append('')
    return texts


def _neighbor_distances(texts, neighbor_count):
    """Return cosine distances from every text to its nearest neighbours.

    The texts are converted to tf-idf vectors, reduced with ``TruncatedSVD``
    and queried with ``NearestNeighbors(metric='cosine')``.

    Returns:
        A float array of shape ``(len(texts), neighbor_count)``. When there
        are fewer than ``neighbor_count`` other texts, only the available
        leading columns are filled and the remaining ones are NaN.
    """
    n_docs = len(texts)
    distances = np.full((n_docs, neighbor_count), np.nan)
    # A text is never its own neighbour here, so at most n_docs - 1 exist.
    usable = min(neighbor_count, n_docs - 1)
    if usable < 1:
        return distances
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    # Never ask for more components than there are tf-idf features.
    n_components = min(n_docs, tfidf_matrix.shape[1])
    reduced = TruncatedSVD(n_components=n_components).fit_transform(tfidf_matrix)
    knn = NearestNeighbors(metric='cosine')
    knn.fit(reduced)
    # Called without a query matrix: neighbours of the fitted points.
    distances[:, :usable] = knn.kneighbors(n_neighbors=usable)[0]
    return distances


def _top_word_idfs(text, top_words):
    """Return the idf weight of each of ``top_words`` within ``text``.

    Every whitespace-separated token of ``text`` is fed to a fresh
    ``TfidfVectorizer`` as a separate document. A word missing from the
    fitted vocabulary gets ``0.``; if the vectorizer raises ``ValueError``
    (e.g. the text yields no usable tokens) all weights are ``0.``.
    """
    vectorizer = TfidfVectorizer(min_df=1)
    try:
        vectorizer.fit(text.split())
    except ValueError:
        # Best effort: a text that cannot be vectorized keeps zero weights.
        return [0.] * len(top_words)
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    idf_by_word = dict(zip(names, vectorizer.idf_))
    return [idf_by_word.get(word, 0.) for word in top_words]


def add_group_feature(group_id, tf_idf_feature_count=15):
    """Compute full-text features for one group and store them in ``df``.

    Reads the module-level DataFrame ``df`` (columns ``group_id``, ``doc_id``
    and ``title`` are used) and writes, for the rows of ``group_id``:

    * ``full_tfidf_0 .. full_tfidf_{k-1}`` -- cosine distances to the
      nearest neighbours inside the group (``k = tf_idf_feature_count``);
      columns that cannot be filled because the group is too small are NaN;
    * ``text_len_word``, ``text_len_word_mean``, ``text_len_word_var`` --
      page length in words and its mean / variance over the group;
    * ``jaf0 ..`` -- standardized idf weight, inside each page, of the most
      frequent words of the group's titles.

    Args:
        group_id: value of ``df['group_id']`` selecting the group.
        tf_idf_feature_count: number of neighbour-distance columns to write.

    Returns:
        None. ``df`` is modified in place.
    """
    in_group = df['group_id'] == group_id
    # Work on an explicit copy so writes never depend on view/copy semantics
    # of the slice (and no warning suppression is needed).
    group = df.loc[in_group].copy()
    top_words = get_top_words(group['title'])
    group['text'] = _read_group_texts(group['doc_id'])

    # Distances to the nearest neighbours.
    distances = _neighbor_distances(group['text'].tolist(), tf_idf_feature_count)
    for j in range(tf_idf_feature_count):
        df.loc[in_group, f'full_tfidf_{j}'] = distances[:, j]

    # Page length in words.
    group['len_word'] = [len(text.split()) for text in group['text']]
    df.loc[in_group, 'text_len_word'] = group['len_word']
    df.loc[in_group, 'text_len_word_mean'] = group['len_word'].mean()
    df.loc[in_group, 'text_len_word_var'] = group['len_word'].var()

    # Per-document idf of the top title words, standardized within the group.
    idfs = np.array(
        [_top_word_idfs(text, top_words) for text in group['text']],
        dtype=float,
    ).reshape(len(group.index), len(top_words))
    for i in range(len(top_words)):
        scaled = StandardScaler().fit_transform(idfs[:, [i]])
        df.loc[in_group, f'jaf{i}'] = scaled.ravel()

In [None]:
# Build the per-group full-text features for the training set and save them.
# `df` must stay a module-level name: add_group_feature reads and updates it.
df = pd.read_csv('train_preprocessed.csv')
df = df.fillna('')  # missing values become empty strings
group_ids = df['group_id'].unique()
for i in tqdm(group_ids):
    add_group_feature(i)
df.to_csv('train_fulltext2.csv', index=False)

In [None]:
# Build the per-group full-text features for the test set and save them.
# `df` must stay a module-level name: add_group_feature reads and updates it.
df = pd.read_csv('test_preprocessed.csv')
df = df.fillna('')  # missing values become empty strings
group_ids = df['group_id'].unique()
for i in tqdm(group_ids):
    add_group_feature(i)
df.to_csv('test_fulltext2.csv', index=False)