In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_gpu_device_count
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Load the data
train_path = 'nlp-getting-started/train.csv'
test_path = 'nlp-getting-started/test.csv'
ss_path = 'nlp-getting-started/sample_submission.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
ss_data = pd.read_csv(ss_path)


def add_full_text(frame, placeholder='missing'):
    """Return a copy of ``frame`` with gaps filled and a ``full_text`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the ``keyword``, ``location`` and ``text`` columns.
    placeholder : str, default 'missing'
        Value substituted for missing ``keyword`` / ``location`` entries.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) in which ``keyword`` and
        ``location`` have no NaN and ``full_text`` is
        ``keyword + ' ' + location + ' ' + text``.
    """
    filled = frame.assign(
        keyword=frame['keyword'].fillna(placeholder),
        location=frame['location'].fillna(placeholder),
    )
    # A NaN in `text` would make the whole concatenation NaN for that row;
    # an empty string keeps the row usable without adding any token.
    text = filled['text'].fillna('')
    # NOTE(review): the placeholder becomes part of the text, so it is
    # indistinguishable from a genuine occurrence of the same word in a
    # tweet - confirm this is intended.
    return filled.assign(
        full_text=filled['keyword'] + ' ' + filled['location'] + ' ' + text
    )


# Same preprocessing for both frames, applied through one helper so the
# train and test pipelines cannot drift apart.
train_data = add_full_text(train_data)
test_data = add_full_text(test_data)

# Features and target
X = train_data['full_text']
y = train_data['target']

X_test = test_data['full_text']

# Hold out 20% of the training rows for validation, keeping the class
# proportions of `y` in both parts (stratify) and a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# TF-IDF vectorizer: unigrams + bigrams, English stop words removed,
# vocabulary limited via max_features=5000.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# The vectorizer is fitted on the training split only; the validation and
# test texts are transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts).toarray() for texts in (X_val, X_test)
)

# Wrap the dense matrices in DataFrames whose columns are the vectorizer's
# feature names (looked up once and shared by all three frames).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
X_train_tfidf_df, X_val_tfidf_df, X_test_tfidf_df = (
    pd.DataFrame(matrix, columns=tfidf_feature_names)
    for matrix in (X_train_tfidf, X_val_tfidf, X_test_tfidf)
)

# CatBoost Pool objects; the test pool carries no label.
train_pool = Pool(data=X_train_tfidf_df, label=y_train)
val_pool = Pool(data=X_val_tfidf_df, label=y_val)
test_pool = Pool(data=X_test_tfidf_df)



# Pick the training device: use the GPU only when CatBoost reports at least
# one, otherwise fall back to the CPU so the cell also runs on machines
# without a GPU.
task_type = 'GPU' if get_gpu_device_count() > 0 else 'CPU'

# Build the model
model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,  # log every 100th iteration
    task_type=task_type,
)

# Train the model; the validation pool is the eval set and training stops
# early after 50 rounds without improvement on it.
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

# Predictions and metrics on the validation split
val_preds = model.predict(val_pool)
print("Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))

0:	learn: 0.6787966	test: 0.6787842	best: 0.6787842 (0)	total: 190ms	remaining: 4m 44s
100:	learn: 0.5026431	test: 0.5238042	best: 0.5237142 (99)	total: 6.99s	remaining: 1m 36s
200:	learn: 0.4533127	test: 0.4965120	best: 0.4963683 (197)	total: 14s	remaining: 1m 30s
300:	learn: 0.4214785	test: 0.4871264	best: 0.4871264 (300)	total: 20.7s	remaining: 1m 22s
400:	learn: 0.3986060	test: 0.4835300	best: 0.4828245 (373)	total: 27.7s	remaining: 1m 15s
500:	learn: 0.3786636	test: 0.4800726	best: 0.4799721 (498)	total: 34.4s	remaining: 1m 8s
600:	learn: 0.3630932	test: 0.4776091	best: 0.4775965 (581)	total: 41.5s	remaining: 1m 2s
bestTest = 0.4774586708
bestIteration = 614
Shrink model to first 615 iterations.
Accuracy: 0.768220617202889
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       869
           1       0.77      0.65      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.75      0.76 

In [None]:
test_preds = model.predict(test_pool)

# Build the submission DataFrame. The predictions are flattened and cast to
# int so the `target` column is always a 1-D integer column, whatever
# array shape/dtype `predict` hands back.
submission = pd.DataFrame({
    'id': test_data['id'],
    'target': np.asarray(test_preds).ravel().astype(int),
})

submission_path = Path('preds/NLPwDT_04_pred_catboost_with_tokenizer.csv')
# to_csv does not create missing directories, so make sure `preds/` exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

# Report the path that was actually written.
print(f"Submission file created: {submission_path}")

Submission file created: submission.csv
