## Конвертация данных для обучения в более удобный для feature-extraction формат

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw training set (one JSON object per line). Each record is read
# below through its 'text' field and its 'comments' list.
train = pd.read_json('ranking_train.jsonl', lines=True, encoding='utf-8')
NUMBER_OF_COMMENTS = 5

# Repeat every post once per comment so posts and comments line up row by row.
text_df = pd.DataFrame(
    np.repeat(train['text'].values, NUMBER_OF_COMMENTS),
    columns=['post'],
)

# Flatten the first NUMBER_OF_COMMENTS comments of each post into
# [comment text, score] pairs, keeping the post order.
commentary = [
    [post_comments[position]['text'], post_comments[position]['score']]
    for post_comments in train['comments'].values
    for position in range(NUMBER_OF_COMMENTS)
]
commentary_df = pd.DataFrame(commentary, columns=['comment', 'score'])

# Final long-format frame: one (post, comment, score) row per comment.
train = pd.concat([text_df, commentary_df], axis=1)

In [None]:
# Load the raw test set (one JSON object per line). Only the comment text is
# read below; no score is extracted.
test = pd.read_json('ranking_test.jsonl', lines=True, encoding='utf-8')
NUMBER_OF_COMMENTS = 5

# Repeat every post once per comment so posts and comments line up row by row.
text_df = pd.DataFrame(
    np.repeat(test['text'].values, NUMBER_OF_COMMENTS),
    columns=['post'],
)

# Flatten the first NUMBER_OF_COMMENTS comment texts of each post, in post order.
commentary = [
    post_comments[position]['text']
    for post_comments in test['comments'].values
    for position in range(NUMBER_OF_COMMENTS)
]
commentary_df = pd.DataFrame(commentary, columns=['comment'])

# Final long-format frame: one (post, comment) row per comment.
test = pd.concat([text_df, commentary_df], axis=1)

## Избавление от проблем с кодировкой

In [None]:
from html import unescape

# HTML entities are decoded twice on purpose: each value goes through
# unescape two times in a row (presumably some texts are double-escaped,
# e.g. '&amp;quot;' — verify against the raw data).
for col in ['post', 'comment']:
    for frame in (train, test):
        frame[col] = frame[col].apply(lambda raw: unescape(unescape(raw)))

## Майнинг фичей

### Длина комментария в символах

In [None]:
# Comment length in characters, computed for both splits.
for frame in (train, test):
    frame['num_of_symbols'] = frame['comment'].apply(len)

### Количество слов в комментарии

In [None]:
def word_count(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return sum(1 for _ in tokens)

# Number of whitespace-separated words per comment, for both splits.
for frame in (train, test):
    frame['num_of_words'] = frame['comment'].apply(word_count)

### Количество предложений в комментарии

In [None]:
import re

# Runs of sentence-terminating punctuation ('.', '?', '!').
_SENTENCE_END_RE = re.compile(r'[.?!]+')


def sentence_count(text: str) -> int:
    """Return a heuristic count of sentences in *text*.

    The text is split on runs of '.', '?' and '!', and every piece that
    contains at least one non-whitespace character counts as a sentence.
    Abbreviations, decimals and URLs are therefore over-counted; this is a
    cheap feature, not a tokenizer.

    The result is never below 1 (the previous implementation also returned
    at least 1), so the value is always safe to use as a denominator.
    """
    pieces = _SENTENCE_END_RE.split(text)
    non_blank = sum(1 for piece in pieces if piece.strip())
    return max(non_blank, 1)

# Sentence-count feature per comment, for both splits.
for frame in (train, test):
    frame['num_of_sentences'] = frame['comment'].apply(sentence_count)

### Количество гласных в комментарии

In [None]:
# Latin letters treated as vowels. 'Y'/'y' stays included as before;
# 'U'/'u' was missing from the original set.
vowels = set('AaEeIiOoUuYy')


def vowels_count(text: str) -> int:
    """Return how many characters of *text* are in the module-level `vowels` set.

    The text is scanned once; upper- and lower-case letters both count.
    """
    return sum(ch in vowels for ch in text)

# Vowel-count feature per comment, for both splits.
for frame in (train, test):
    frame['num_of_vowels'] = frame['comment'].apply(vowels_count)

### Количество цитат в комментарии

In [None]:
import math

def quotes_count(text: str) -> int:
    """Return the number of complete pairs of double-quote characters in *text*.

    An unpaired trailing quote is ignored.
    """
    quote_marks = text.count('"')
    return quote_marks // 2

# Number of quoted fragments (pairs of double quotes) per comment.
for frame in (train, test):
    frame['quotes'] = frame['comment'].apply(quotes_count)

### Количество восклицательных знаков в комментарии

In [None]:
# Number of '!' characters per comment, for both splits.
for frame in (train, test):
    frame['exclamations'] = frame['comment'].apply(lambda body: body.count('!'))

### Количество вопросительных знаков в комментарии

In [None]:
# Number of '?' characters per comment, for both splits.
for frame in (train, test):
    frame['questions'] = frame['comment'].apply(lambda body: body.count('?'))

### Flesch Formula (Reading Ease score)

In [None]:
def calculateRSE(data: pd.DataFrame) -> pd.Series:
    """Return a Flesch-style reading-ease score for every row of *data*.

    Computes ``206.835 - 1.015 * (words / sentences) - 84.6 * (vowels / words)``
    from the columns 'num_of_words', 'num_of_sentences' and 'num_of_vowels'.
    The vowel count stands in for the syllable count of the classic formula.

    Rows whose word or sentence count is zero yield NaN instead of
    +/-infinity, so missing scores are represented uniformly.
    """
    words = data['num_of_words'].astype(float)
    sentences = data['num_of_sentences'].astype(float)
    # Mask zero denominators so division produces NaN rather than inf.
    words = words.mask(words == 0)
    sentences = sentences.mask(sentences == 0)

    ASL = 1.015 * words / sentences  # weighted average sentence length
    ASW = 84.6 * data['num_of_vowels'] / words  # weighted vowels per word
    return 206.835 - ASL - ASW

# Reading-ease score, derived from the word/sentence/vowel counts above.
for frame in (train, test):
    frame['RES'] = calculateRSE(frame)

### Количество ссылок

In [None]:
import re

# 'http://' or 'https://', then non-whitespace characters containing a dot
# that is followed by at least one more non-whitespace character.
_LINK_RE = re.compile(r'https?://\S+?\.\S')


def count_links(text: str) -> int:
    """Return the number of http(s) links found in *text*.

    A link must start with 'http://' or 'https://' and contain a dot before
    the next whitespace, so a bare scheme followed by unrelated words is not
    counted.
    """
    return len(_LINK_RE.findall(text))

# Number of links per comment, for both splits.
for frame in (train, test):
    frame['num_of_links'] = frame['comment'].apply(count_links)

### Близость поста и комментария по полярности

In [None]:
import textblob

def text_polarity(text_1: str, text_2: str) -> float:
    """Return a similarity score between the TextBlob polarities of two texts.

    The score is ``1 / (1 + |polarity(text_1) - polarity(text_2)|)``: exactly
    1.0 when both texts have the same polarity, and smaller as they diverge.
    NOTE(review): TextBlob documents polarity as lying in [-1.0, 1.0], which
    would bound the result to [1/3, 1] — confirm against the library docs.
    """
    polarity_gap = abs(
        textblob.TextBlob(text_1).polarity - textblob.TextBlob(text_2).polarity
    )
    return 1 / (1 + polarity_gap)

# Row-wise polarity similarity between each post and its comment.
for frame in (train, test):
    frame['text_comment_polarity'] = frame[['post', 'comment']].apply(
        lambda pair: text_polarity(*pair), axis=1
    )

### Близость поста и комментария по объективности

In [None]:
def text_subjectivity(text_1: str, text_2: str) -> float:
    """Return a similarity score between the TextBlob subjectivities of two texts.

    The score is ``1 / (1 + |subjectivity(text_1) - subjectivity(text_2)|)``:
    exactly 1.0 when both texts are equally subjective, and smaller as they
    diverge.
    NOTE(review): TextBlob documents subjectivity as lying in [0.0, 1.0],
    which would bound the result to [0.5, 1] — confirm against the library docs.
    """
    subjectivity_gap = abs(
        textblob.TextBlob(text_1).subjectivity
        - textblob.TextBlob(text_2).subjectivity
    )
    return 1 / (1 + subjectivity_gap)

# Row-wise subjectivity similarity between each post and its comment.
for frame in (train, test):
    frame['text_comment_subjectivity'] = frame[['post', 'comment']].apply(
        lambda pair: text_subjectivity(*pair), axis=1
    )

### Количество процентов в комментарии

In [None]:
import re

# An optional decimal number, optional whitespace, then a percent sign.
# Written as a raw string so that '\.' and '\s' are regex escapes rather
# than invalid Python string escapes.
_PERCENT_RE = re.compile(r'([0-9]*\.?[0-9]*)\s*%')


def statistics_counter(text: str) -> int:
    """Return the number of percent expressions in *text*.

    Every part of the pattern before '%' is optional, so a bare '%' with no
    number in front of it is counted as well.
    """
    return len(_PERCENT_RE.findall(text))

# Number of percent expressions per comment, for both splits.
for frame in (train, test):
    frame['amount_of_percent'] = frame['comment'].apply(statistics_counter)

### Количество не-ascii символов в комментарии

In [None]:
# Printable ASCII characters (space through '~'). Kept for any code that
# still refers to it; count_non_ascii no longer relies on it because the set
# lacks ASCII control characters such as newline and tab.
ascii_string = set(""" !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~""")


def count_non_ascii(text: str) -> int:
    """Return the number of characters in *text* outside the ASCII range.

    A character is non-ASCII when its code point is above 127, so newlines,
    tabs and other ASCII control characters are not counted.
    """
    return sum(ord(ch) > 127 for ch in text)

# Number of non-ASCII characters per comment, for both splits.
for frame in (train, test):
    frame['non_ascii_num'] = frame['comment'].apply(count_non_ascii)

### Количество заглавных букв

In [None]:
# The 26 upper-case Latin letters 'A'..'Z'.
uppercase = {chr(code) for code in range(ord('A'), ord('Z') + 1)}


def count_uppercase(text: str) -> int:
    """Return how many characters of *text* are upper-case Latin letters A-Z."""
    capitals = [ch for ch in text if ch in uppercase]
    return len(capitals)

# Number of upper-case Latin letters per comment, for both splits.
for frame in (train, test):
    frame['uppercase_num'] = frame['comment'].apply(count_uppercase)

### Отношение длины комментария к длине поста

In [None]:
# Comment length divided by post length (both measured in characters).
for frame in (train, test):
    comment_length = frame['comment'].map(len)
    post_length = frame['post'].map(len)
    frame['post_comment_length_ratio'] = comment_length / post_length

## Скачивание получившихся датасетов

In [None]:
# Persist both feature tables as CSV files without the index column.
for frame, csv_path in ((train, "first_train.csv"), (test, "test.csv")):
    frame.to_csv(csv_path, index=False)

## Визуализация результатов

In [None]:
# Engineered feature columns, in the order they are plotted.
feature_cols = [
    'num_of_symbols',
    'num_of_words',
    'num_of_sentences',
    'num_of_vowels',
    'quotes',
    'exclamations',
    'questions',
    'RES',
    'num_of_links',
    'text_comment_polarity',
    'text_comment_subjectivity',
    'amount_of_percent',
    'non_ascii_num',
    'uppercase_num',
    'post_comment_length_ratio',
]

# Column holding the comment score the features are compared against.
target_col = 'score'

In [None]:
import matplotlib.pyplot as plt
import math

# One bar chart per feature: mean feature value for every score value.
fig, ax = plt.subplots(4, 4, figsize=(16,16))

for index, col in enumerate(feature_cols):
    grid_row, grid_col = divmod(index, 4)
    # 'mean' is passed by name; handing np.mean to agg is deprecated in
    # recent pandas. Replace 'mean' with your own aggregation if needed.
    data = train.groupby(target_col, as_index=False).agg({col: 'mean'})
    ax[grid_row, grid_col].bar(data[target_col], data[col])
    ax[grid_row, grid_col].set_title(col)

# Hide the grid cells that have no feature assigned to them.
for unused_axis in ax.flat[len(feature_cols):]:
    unused_axis.set_visible(False)

fig.tight_layout()
plt.show()

> Из приведённых графиков и здравого смысла можно понять, что все фичи завязаны на длину комментария.

## Обучение модели

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostRanker, CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold,  GridSearchCV, KFold
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.metrics import ndcg_score

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, classification_report

from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings('ignore')

### Подготовка данных к обучению

In [None]:
# Work on a copy so the score flip below does not mutate `train` in place
# (re-running this cell on an alias would flip the scores back).
# Alternative source: pd.read_csv("first_train.csv")
train_features = train.copy()

# Invert the score (0 <-> 4, 1 <-> 3, ...).
# NOTE(review): presumably this turns "0 = best" into a relevance label
# where larger is better — confirm against the dataset description.
train_features['score'] = 4 - train_features['score']

# The raw texts are not used as model inputs.
train_features = train_features.drop(columns=["post", "comment"])

# Rows come in consecutive runs of NUMBER_OF_COMMENTS per post, so the
# integer-divided row position identifies the post (ranking group).
group_id = pd.Series(
    np.arange(len(train_features)) // NUMBER_OF_COMMENTS,
    index=train_features.index,
)

train_features["group_id"] = group_id

### Разделение на train и validation выборки

In [None]:
# Group-aware 70/30 split: all rows sharing a group_id land on the same side.
splitter = GroupShuffleSplit(test_size=.30, n_splits=2, random_state=42)
split = splitter.split(train_features, groups=train_features['group_id'])

# Only the first of the generated splits is used.
train_inds, valid_inds = next(split)

train, valid = (
    train_features.iloc[row_positions]
    for row_positions in (train_inds, valid_inds)
)

### Добавление параметра group_id

In [None]:
# Keep the group ids aside; they are passed to CatBoost separately.
group_train, group_val = train["group_id"], valid["group_id"]

train = train.drop(columns=["group_id"])
valid = valid.drop(columns=["group_id"])

# Target / feature matrices for the training and validation parts.
y_train, X_train = train["score"], train.drop(columns=["score"])
y_val, X_val = valid["score"], valid.drop(columns=["score"])

# Cross-validation scheme used by the grid searches below.
cv = StratifiedKFold(shuffle=True, n_splits=4)

In [None]:
def ndcg_score(y_true, y_pred):
    """Return the mean NDCG@5 over consecutive groups of five rows.

    Rows are assumed to arrive in groups of five (one group per post); any
    trailing rows that do not fill a whole group are ignored. Within a group
    the items are ordered by descending ``y_pred`` (ties keep their input
    order) and scored with linear gain and a log2 position discount.

    Args:
        y_true: relevance labels, larger meaning more relevant.
        y_pred: predicted scores, same length and order as ``y_true``.

    Returns:
        The average per-group NDCG as a float. A group whose labels are all
        zero contributes 0, and 0.0 is returned when there is no full group.
    """
    group_size = 5
    relevance = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    n_groups = len(relevance) // group_size
    if n_groups == 0:
        return 0.0

    usable = n_groups * group_size
    relevance = relevance[:usable].reshape(n_groups, group_size)
    predicted = predicted[:usable].reshape(n_groups, group_size)

    # Position discount 1 / log2(rank + 1) for ranks 1..group_size.
    discounts = 1.0 / np.log2(np.arange(group_size) + 2)

    # DCG of the predicted ordering and of the ideal (label-sorted) ordering.
    predicted_order = np.argsort(-predicted, axis=1, kind='stable')
    dcg = (np.take_along_axis(relevance, predicted_order, axis=1) * discounts).sum(axis=1)
    ideal_dcg = (-np.sort(-relevance, axis=1) * discounts).sum(axis=1)

    per_group = np.divide(dcg, ideal_dcg, out=np.zeros_like(dcg), where=ideal_dcg > 0)
    return float(per_group.mean())

# Wrap the metric as a scikit-learn scorer (higher is better). The scorer
# uses the estimator's predict() output by default, so the deprecated
# `needs_proba=False` argument is not needed.
ndcg_scorer = make_scorer(ndcg_score, greater_is_better=True)

### Обучение модели (catboost ranking)

In [None]:
# Validation pool (features, labels and group ids) handed to fit() as eval_set.
val = Pool(X_val, y_val, group_id = group_val)


# Hyper-parameter grid explored by the grid search.
params_cat = {'depth': [4, 5, 7],
              'l2_leaf_reg': [7, 10],
              'learning_rate': [0.05, 0.1, 0.15]}


# CatBoost ranking model: YetiRank loss, with NDCG requested as a custom metric.
model_cat = CatBoostRanker(loss_function='YetiRank', custom_metric='NDCG')

# Grid search over params_cat, scored with ndcg_scorer on the folds from `cv`.
# NOTE(review): `cv` is a StratifiedKFold, which does not take group_id into
# account when building folds — confirm this is intended for a ranking task.
GS_cat = GridSearchCV(cv =cv, estimator = model_cat, param_grid = params_cat, scoring = ndcg_scorer)

# Run the search; the extra keyword arguments (group_id, eval_set, verbose)
# are presumably forwarded to the ranker's fit() — verify against the
# GridSearchCV documentation.
GS_cat.fit(X_train, y_train, group_id=group_train, eval_set = val, verbose = 100)

### Сохранение и выгрузка модели (catboost ranking)

In [None]:
# Pick the estimator the grid search reports as best, then write it to the
# file 'catboost_ranking_model' via save_model.
model_cat_best = GS_cat.best_estimator_  # best estimator found by the search
model_cat_best.save_model('catboost_ranking_model')  # save the best ranking model

### Модель для относительной оценки комментариев 
###### (разрешение ситуаций, в которых catboost выдаёт двум комментариям одинаковый счёт)

### cross-product датасета фичей с самим собой

In [None]:
# Independent copy of the feature table for the pairwise (comment vs. comment)
# model, so later edits do not touch `train_features`.
train_features_c = train_features.copy()
score_c = train_features_c.score

# The 'score' column is kept on purpose: after the self cross-join it becomes
# 'score_x' / 'score_y', from which the pairwise label is derived.

In [None]:
# For every post (group_id) build all ordered pairs of its comments: a cross
# join of the group's rows with themselves. Every column appears twice, with
# the suffixes '_x' (left comment) and '_y' (right comment).
# groupby(sort=False) walks the groups in order of first appearance in a
# single pass instead of re-filtering the whole frame for each group.
dfs = [
    group_rows.merge(group_rows, how='cross')
    for _, group_rows in train_features_c.groupby('group_id', sort=False)
]

# ignore_index gives every pair a unique row label; without it each group
# would reuse the same labels, making label-based row operations ambiguous.
X_train_csl = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Pairwise label: True when the left comment's score is greater than the
# right comment's score.
X_train_csl['score_dif'] = (X_train_csl['score_x'] > X_train_csl['score_y'])

# Keep only pairs whose scores differ. A boolean mask is used instead of a
# label-based drop so the result is correct even if row labels repeat.
X = X_train_csl[X_train_csl['score_x'] != X_train_csl['score_y']]

# Target for the pairwise classifier (1 = left comment scores higher).
y = X['score_dif'].astype(int)

# Remove the label, the scores it was derived from, and the group ids.
X = X.drop(['group_id_x', 'group_id_y', 'score_dif', 'score_x', 'score_y'], axis = 1)

In [None]:
# Random 70/30 row split of the pairwise dataset with a fixed seed.
# NOTE(review): rows are split individually, so pairs built from the same
# post can end up on both sides — confirm this is acceptable.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y, test_size=0.3, random_state=42)

### Обучение модели для относительной оценки

In [None]:
# 4-fold stratified cross-validation with shuffling for the pairwise classifier.
cv = StratifiedKFold(shuffle=True, n_splits = 4)

# CatBoost binary classifier trained on GPU with the Logloss objective.
model_cat_classificator = CatBoostClassifier(verbose = 100, task_type = 'GPU', objective = 'Logloss')

# Hyper-parameter grid. The duplicated value 100 was removed from
# 'n_estimators' because it only repeated fits that had already been run.
# NOTE(review): the original list was [100, 500, 100]; the last entry may
# have been meant to be 1000 — confirm with the author.
params_cat_classificator = {'learning_rate': [0.01, 0.05, 0.1],
                            'depth': [4, 5, 7],
                            'l2_leaf_reg': [3, 7, 10],
                            'n_estimators': [100, 500]}

# Grid search over the grid above, scored by ROC AUC.
GS_cat_classificator = GridSearchCV(cv =cv, estimator = model_cat_classificator, param_grid = params_cat_classificator, scoring='roc_auc')

In [None]:
# Run the grid search on the pairwise training part; the held-out part is
# passed as eval_set. Afterwards keep the best estimator found.
GS_cat_classificator.fit(X = X_train_c, y = y_train_c, eval_set=(X_test_c, y_test_c))
model_cat_classificator_best = GS_cat_classificator.best_estimator_

### Получение результатов

In [None]:
# Drop the 'post' and 'comment' text columns from the test table before prediction.
test = test.drop(columns=['post', 'comment'])

In [None]:
# Per-post inference: score the five comments of every post, break exact
# ties with the pairwise classifier, and turn the scores into ranks.
# NOTE(review): `catboost_ranker_first_iteration` and
# `catboost_classificator_first_iteration` are not defined in the visible
# part of this notebook (the models fitted above are `model_cat_best` and
# `model_cat_classificator_best`) — confirm where they come from.
ndcg_n = 0  # not used below
k = 0  # counts processed posts; not read after the loop
target_train = []  # flat list of predicted ranks, five per post

for i in range(test.shape[0] // 5):
    k+=1

    # Ranker output for the five feature rows of post i.
    prediction = catboost_ranker_first_iteration.predict(test[i*5:5*(i + 1)])

    # m: smallest non-zero absolute gap between any two predictions
    # (stays at its initial value 1 if no smaller non-zero gap exists).
    m = 1
    for j in range(5):
        for r in range(j,5):
            M = abs(prediction[j] - prediction[r])
            if M < m and M != 0:
                m = M

    # Scale of the tie-break adjustment: a tenth of the smallest gap.
    m /= 10
    for j in range(5):
        for r in range(j,5):
            if r != j:
                # Only exactly equal predictions are treated as ties.
                if prediction[j] == prediction[r]:
                    # Query the pairwise classifier with the features in both
                    # orders (j, r) and (r, j); [0] takes the first element of
                    # the returned probabilities (presumably the probability
                    # of the first class — verify).
                    clas_key_1 = catboost_classificator_first_iteration.predict_proba(list(test.iloc[i*5+j])+list(test.iloc[i*5+r]))[0]
                    clas_key_2 = catboost_classificator_first_iteration.predict_proba(list(test.iloc[i*5+r])+list(test.iloc[i*5+j]))[0]
                    # Nudge only prediction[j] by the difference, scaled by m.
                    prediction[j] += (clas_key_1 - clas_key_2) * m


    # Pair each prediction with the key 4 - position.
    dict_pred = [[4-i,prediction[i]] for i in range(5)]

    # Order the pairs by descending prediction.
    sorted_pred = sorted(dict_pred,key = lambda i: i[1], reverse = True)

    # Attach 4 - rank to every key and restore the original comment order
    # (descending key 4 - position means ascending position).
    list_pred = sorted([[sorted_pred[i][0], 4-i] for i in range(5)], key = lambda i: i[0], reverse = True)

    # Resulting value per comment is its rank: 0 for the highest prediction.
    target_post = [4 - i[1] for i in list_pred]
    target_train = target_train + target_post


# Store the ranks as a single-column CSV without the index.
pd.DataFrame(target_train).to_csv('target_test.csv', index = False)

In [None]:
# Read the CSV written above back as a NumPy array; it is indexed below as
# target[row][0].
target = pd.read_csv('target_test.csv').values

In [None]:
# Reload the original test file so the scores can be written into its
# nested 'comments' structure.
test_result = pd.read_json('ranking_test.jsonl', lines=True, encoding='utf-8')

In [None]:
# Write the predicted value of every comment into the nested comment dicts,
# five consecutive predictions per post.
for post_idx in range(len(target) // 5):
    post_comments = test_result['comments'].iloc[post_idx]
    for comment_idx in range(5):
        post_comments[comment_idx]['score'] = target[post_idx * 5 + comment_idx][0]

## Выгрузка результатов

In [None]:
# Export the result in JSON Lines form: one record per line.
test_result.to_json('results.jsonl', orient='records', lines=True)