### 4. 
### TF-IDF преобразование нормализованных заголовков
### Построение модели с TF-IDF фичами заголовков и полных текстов
### Обучение и предсказание

In [2]:
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm.notebook import tqdm
from xgboost import XGBClassifier

def get_top_words(corpus, top_word_count = 5):
    """Return the most frequent words of ``corpus``, most frequent first.

    The documents are tokenised and counted by a ``CountVectorizer`` with
    default settings; at most ``top_word_count`` words are returned.
    """
    vectorizer = CountVectorizer().fit(corpus)
    # 1 x vocabulary matrix holding the total count of every term.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        vectorizer.vocabulary_.items(),
        key=lambda item: totals[0, item[1]],
        reverse=True,
    )
    return [word for word, _ in ranked[:top_word_count]]
    

def add_features(df, tf_idf_feature_count=15):
    """Add title-based features to ``df`` in place.

    ``df`` must contain the columns ``title`` (str), ``group_id``,
    ``text_len_word``, ``text_len_word_mean`` and ``text_len_word_var``.
    The following columns are created or overwritten:

    * ``len_word``, ``mean_len``, ``var_len`` - title length in words and its
      per-group mean / variance (standardised);
    * ``tif_0`` .. ``tif_{k-1}`` - cosine distance from each title to its
      ``k = tf_idf_feature_count`` nearest neighbours inside the same group,
      using a TF-IDF representation fitted per group;
    * ``dist_mean``, ``dist_var`` - mean / variance of all pairwise cosine
      distances inside the group (standardised);
    * ``top_in_tit``, ``tit_in_top`` - overlap between the title tokens and
      the group's most frequent words;
    * ``dig_prop``, ``cyr_prop``, ``lat_prop`` and their ``*_gr`` variants -
      share of digits, lowercase Cyrillic and lowercase Latin letters per
      title and per group.

    The three ``text_len_word*`` columns are standardised as well.

    NOTE(review): ``kneighbors`` is queried on the fitted data itself, so
    every group presumably needs more than ``tf_idf_feature_count`` titles -
    confirm against the scikit-learn documentation.

    Returns ``None``; the frame is modified in place.
    """
    # Only lowercase letters are listed, so uppercase letters are not counted.
    cyrillic = frozenset('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    latin = frozenset('abcdefghijklmnopqrstuvwxyz')

    def scale(col_name):
        # Standardise a single column in place (zero mean, unit variance).
        values = df[col_name].values.reshape(-1, 1)
        df[col_name] = StandardScaler().fit_transform(values).ravel()

    row_count = len(df.index)
    group_ids = df['group_id'].unique()

    df['len_word'] = [len(title.split()) for title in df['title']]
    # Mean and variance of the title length inside each group.
    df['mean_len'] = np.zeros(row_count)
    df['var_len'] = np.zeros(row_count)
    for group_id in group_ids:
        in_group = df['group_id'] == group_id
        lengths = df.loc[in_group, 'len_word']
        df.loc[in_group, 'mean_len'] = lengths.mean()
        # Fixed: the variance used to be written into 'mean_len', which left
        # 'var_len' at zero and destroyed the mean.
        df.loc[in_group, 'var_len'] = lengths.var()
    scale('len_word')
    scale('mean_len')
    scale('var_len')

    # Distances to the nearest neighbours in per-group TF-IDF space.
    for j in range(tf_idf_feature_count):
        df[f'tif_{j}'] = np.zeros(row_count)
    knn = NearestNeighbors(metric='cosine')
    df['dist_mean'] = np.zeros(row_count)
    df['dist_var'] = np.zeros(row_count)
    for group_id in group_ids:
        in_group = df['group_id'] == group_id
        X = TfidfVectorizer().fit_transform(df.loc[in_group, 'title'])
        knn.fit(X)
        # No query points are passed, so the fitted titles themselves are
        # queried; [0] selects the distance matrix.
        distances = knn.kneighbors(n_neighbors=tf_idf_feature_count)[0]
        for j in range(tf_idf_feature_count):
            df.loc[in_group, f'tif_{j}'] = distances[:, j]
        # Summary statistics of all pairwise distances in the group.
        pairwise = cosine_distances(X)
        df.loc[in_group, 'dist_mean'] = pairwise.mean()
        df.loc[in_group, 'dist_var'] = pairwise.var()
    scale('dist_mean')
    scale('dist_var')
    scale('text_len_word')
    scale('text_len_word_mean')
    scale('text_len_word_var')

    # Overlap between each title and the most frequent words of its group.
    for group_id in group_ids:
        in_group = df['group_id'] == group_id
        titles = df.loc[in_group, 'title']
        top_words = get_top_words(titles)
        # Tokens are kept in a local list rather than written into a slice of
        # df, so no pandas copy warning has to be silenced.
        tokenized = [title.split() for title in titles]
        df.loc[in_group, 'top_in_tit'] = [
            sum(word in tokens for word in top_words) / len(top_words)
            for tokens in tokenized
        ]
        df.loc[in_group, 'tit_in_top'] = [
            sum(word in top_words for word in tokens) / max(1, len(tokens))
            for tokens in tokenized
        ]

    # Share of digits / Cyrillic / Latin characters in every title.
    title_len = np.array([len(title) for title in df['title']])
    digit_cnt = np.array(
        [sum(c.isdigit() for c in title) for title in df['title']])
    cyr_cnt = np.array(
        [sum(c in cyrillic for c in title) for title in df['title']])
    lat_cnt = np.array(
        [sum(c in latin for c in title) for title in df['title']])
    # An empty title gives 0 / 0 -> NaN; silence the NumPy warning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        df['dig_prop'] = digit_cnt / title_len
        df['cyr_prop'] = cyr_cnt / title_len
        df['lat_prop'] = lat_cnt / title_len

    # The same shares computed over all titles of a group joined together.
    for group_id in group_ids:
        in_group = df['group_id'] == group_id
        joined = ''.join(df.loc[in_group, 'title'])
        total_len = len(joined)
        df.loc[in_group, 'dig_prop_gr'] = (
            sum(c.isdigit() for c in joined) / total_len)
        df.loc[in_group, 'cyr_prop_gr'] = (
            sum(c in cyrillic for c in joined) / total_len)
        df.loc[in_group, 'lat_prop_gr'] = (
            sum(c in latin for c in joined) / total_len)

  import pandas.util.testing as tm


In [4]:
# Load the training table and replace missing cells with empty strings.
df = pd.read_csv('train_fulltext2.csv')
df = df.fillna('')
# Feature engineering is switched off for this cell:
#add_features(df)
#df = df.fillna(0)
df

Unnamed: 0,pair_id,group_id,doc_id,target,title,full_tfidf_0,full_tfidf_1,full_tfidf_2,full_tfidf_3,full_tfidf_4,...,full_tfidf_13,full_tfidf_14,text_len_word,text_len_word_mean,text_len_word_var,jaf0,jaf1,jaf2,jaf3,jaf4
0,1,1,15731,0,ваз 21213 замена подшипник ступица нива,0.142770,0.148897,0.445392,0.614014,0.624358,...,0.864465,0.866078,602.0,2023.598039,1.189762e+07,0.298801,-0.713882,0.642332,-0.450882,-1.179637
1,2,1,14829,0,ваз 2107 опт сочи сравнивать цена купить потре...,0.274238,0.329231,0.362604,0.470088,0.519535,...,0.821580,0.824289,1829.0,2023.598039,1.189762e+07,-0.424289,0.596078,0.775730,0.451130,0.265778
2,3,1,15764,0,купить ступица лад калина2 трансмиссия переход...,0.654946,0.725212,0.725352,0.762938,0.825418,...,0.897647,0.900443,1577.0,2023.598039,1.189762e+07,0.288688,1.159767,0.728335,-0.867552,1.030773
3,4,1,17669,0,классика 21010 21074,0.303608,0.360179,0.774168,0.784173,0.787643,...,0.855384,0.878428,4455.0,2023.598039,1.189762e+07,-0.705952,0.440252,1.060414,0.536020,-1.179637
4,5,1,14852,0,ступица нива замена подшипник свой рука,0.655133,0.657009,0.671876,0.721143,0.722285,...,0.798292,0.815899,643.0,2023.598039,1.189762e+07,0.924060,-0.977825,0.127882,-0.648225,-1.179637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,11686,129,26672,0,апреленок 2014 6,0.749994,0.779016,0.799159,0.802812,0.826964,...,0.855401,0.857803,13978.0,9816.076923,2.812439e+08,1.430302,0.883967,0.257093,-1.451526,1.012992
11686,11687,129,25838,0,gästebuch,0.751653,0.778216,0.821056,0.839754,0.865445,...,0.951366,0.955215,37656.0,9816.076923,2.812439e+08,-1.030510,-1.453057,-0.202368,1.399799,0.715529
11687,11688,129,25703,0,jizolofej archive,0.633327,0.642604,0.659348,0.662014,0.665541,...,0.736894,0.740978,14004.0,9816.076923,2.812439e+08,1.329492,-1.453057,0.778846,-1.451526,0.251685
11688,11689,129,27885,0,звать парень диана шурыгина пусть говорить диа...,0.288560,0.899693,0.901883,0.902059,0.902656,...,0.922609,0.923807,6532.0,9816.076923,2.812439e+08,0.737032,-1.453057,0.958915,0.427482,0.495714


In [2]:
def get_classifier(X_train, y_train, model_):
    """Build the classifier named by ``model_`` and fit it on the given data.

    Parameters:
        X_train: training feature matrix, passed to ``fit``.
        y_train: training labels, passed to ``fit``.
        model_: one of ``'svm'`` (standardisation + balanced SVC),
            ``'xgb'`` (XGBClassifier), ``'gb'`` (GradientBoostingClassifier)
            or ``'rf'`` (RandomForestClassifier).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model_`` is not one of the supported names.
    """
    # Factories are lazy so that only the requested estimator is constructed.
    factories = {
        'svm': lambda: make_pipeline(
            StandardScaler(), SVC(gamma='auto', class_weight='balanced')),
        'xgb': lambda: XGBClassifier(),
        'gb': lambda: GradientBoostingClassifier(random_state=42),
        'rf': lambda: RandomForestClassifier(random_state=42),
    }
    if model_ not in factories:
        # Previously an unknown name fell through the if/elif chain and
        # failed later with an UnboundLocalError on ``clf``.
        raise ValueError(
            f'Unknown model {model_!r}; expected one of {sorted(factories)}')
    clf = factories[model_]()
    clf.fit(X_train, y_train)
    return clf

def get_model_score(X, y, model, n_splits=3, groups=None):
    """Return the mean F1 score of ``model`` over group-wise CV folds.

    Parameters:
        X: feature DataFrame (rows are selected positionally with ``iloc``).
        y: labels aligned row-by-row with ``X``.
        model: model name understood by ``get_classifier``.
        n_splits: number of ``GroupKFold`` folds.
        groups: group label for every row of ``X``; rows sharing a label are
            never split between train and test. Defaults to the ``group_id``
            column of the module-level ``df``, as before.

    Returns:
        The mean of the per-fold F1 scores.
    """
    if groups is None:
        groups = df['group_id']
    # GroupKFold yields positional indices, so index the labels positionally
    # as well; ``y[train_index]`` on a Series is label-based and only worked
    # while the index happened to be 0..n-1.
    y_values = np.asarray(y)
    group_kfold = GroupKFold(n_splits=n_splits)
    scores = []
    for train_index, test_index in group_kfold.split(X, y_values, groups):
        clf = get_classifier(X.iloc[train_index], y_values[train_index], model)
        y_pred = clf.predict(X.iloc[test_index])
        scores.append(f1_score(y_values[test_index], y_pred))
    return np.mean(scores)

# Feature matrix: every column except identifiers, the raw title and the label.
X = df.drop(columns=['pair_id', 'doc_id', 'title', 'target', 'group_id'])
y = df['target']
# Group-wise cross-validated F1 score of each candidate model.
model_scores = []
for model in tqdm(['svm', 'gb', 'rf']):
    model_scores.append(get_model_score(X, y, model))
# Average score across the candidate models.
np.asarray(model_scores).mean()
# 0.7554901890005347

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




0.7554901890005347

In [3]:
# Show the per-model mean F1 scores collected in the cross-validation cell.
model_scores

[0.7568569529143884, 0.7489365746204176, 0.7606770394667981]

In [4]:
# Train the chosen model on the full training set and write test predictions.
model = 'svm'
# Tag the rows so train and test can be separated again after the features
# have been computed on both sets together.
df['train'] = 'train'
df_test = pd.read_csv('test_fulltext2.csv').fillna('')
df_test['train'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
full_df = pd.concat([df, df_test])
add_features(full_df)
full_df = full_df.fillna(0)
df_test = full_df.loc[full_df['train'] == 'test'].drop(['train'], axis=1)
# Take the training rows from the augmented frame too: fitting on the
# original df would train on a different feature set than the one used for
# prediction below.
df = full_df.loc[full_df['train'] == 'train'].drop(['train'], axis=1)
X = df.drop(['pair_id', 'doc_id', 'title', 'target', 'group_id'], axis=1)
# Concatenation with test rows may have turned the label column into floats;
# cast back so the predictions are written as integers.
y = df['target'].astype(int)
clf = get_classifier(X, y, model)
df_test['target'] = clf.predict(df_test.drop(['pair_id', 'group_id', 'doc_id', 'title', 'target'], axis=1))
df_test[['pair_id', 'target']].to_csv(f'predict_full_text_{model}.csv', index=False)