In [14]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_gpu_device_count
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Locations of the train, test and sample-submission CSV files
train_path = 'nlp-getting-started/train.csv'
test_path = 'nlp-getting-started/test.csv'
ss_path = 'nlp-getting-started/sample_submission.csv'

# Read the three files, in the same order as the paths above
train_data, test_data, ss_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, ss_path)
)

# Apply the same preparation to the train and test frames:
#   1. replace missing 'keyword' / 'location' values with the placeholder 'missing'
#   2. join keyword, location and text (space-separated) into a 'full_text' column
for _frame in (train_data, test_data):
    for _col in ('keyword', 'location'):
        _frame[_col] = _frame[_col].fillna('missing')
    _frame['full_text'] = _frame['keyword'] + ' ' + _frame['location'] + ' ' + _frame['text']

# Feature column (combined text) and target column of the labelled data
X = train_data['full_text']
y = train_data['target']

X_test = test_data['full_text']

# Hold out 20% of the labelled rows for validation; stratify on the target
# and fix the seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert each Series into a single-column DataFrame so the text feature
# can be referred to by its column name when building the pools.
X_train = X_train.to_frame(name='full_text')
X_val = X_val.to_frame(name='full_text')
X_test = X_test.to_frame(name='full_text')

# Build the CatBoost pools once; 'full_text' is declared as a text feature.
train_pool = Pool(data=X_train, label=y_train, text_features=['full_text'])
val_pool = Pool(data=X_val, label=y_val, text_features=['full_text'])
test_pool = Pool(data=X_test, text_features=['full_text'])

# Train on GPU when CatBoost reports at least one usable device; otherwise
# fall back to CPU so the cell still runs on machines without a GPU.
task_type = 'GPU' if get_gpu_device_count() > 0 else 'CPU'

# Model definition
model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,  # log every 100th iteration
    task_type=task_type,
)

# Fit on the training pool, evaluating on the validation pool; early stopping
# is configured with a patience of 50 rounds.
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

# Score the validation split: overall accuracy followed by the per-class report
val_preds = model.predict(val_pool)
val_accuracy = accuracy_score(y_val, val_preds)
val_report = classification_report(y_val, val_preds)
print("Accuracy:", val_accuracy)
print(val_report)

0:	learn: 0.6523299	test: 0.6468947	best: 0.6468947 (0)	total: 56.6ms	remaining: 1m 24s
100:	learn: 0.4385273	test: 0.4357737	best: 0.4352064 (92)	total: 4.86s	remaining: 1m 7s
bestTest = 0.4352064327
bestIteration = 92
Shrink model to first 93 iterations.
Accuracy: 0.8003939592908733
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       869
           1       0.81      0.70      0.75       654

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [15]:
# Predict labels for the test set
test_preds = model.predict(test_pool)

# Assemble the submission frame. The predictions are flattened and cast to int
# so the 'target' column is always a 1-D integer column, whatever array
# shape/dtype predict() returned.
submission = pd.DataFrame({
    'id': test_data['id'],
    'target': np.asarray(test_preds).ravel().astype(int)
})

# Create the output directory if it is missing; to_csv does not create it.
submission_path = Path('preds/NLwDT_02_pred_catboost.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
