In [17]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost.text_processing import Tokenizer
import nltk
import os
from nltk.stem import WordNetLemmatizer

train_path = 'nlp-getting-started/train.csv'
test_path = 'nlp-getting-started/test.csv'
ss_path = 'nlp-getting-started/sample_submission.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
ss_data = pd.read_csv(ss_path)

# Заполнение пропусков
# train_data['keyword'] = train_data['keyword'].fillna('missing')
# train_data['location'] = train_data['location'].fillna('missing')

# test_data['keyword'] = test_data['keyword'].fillna('missing')
# test_data['location'] = test_data['location'].fillna('missing')

# Заполнение пропусков (теперь как пустые строки)
train_data['keyword'] = train_data['keyword'].fillna('')
train_data['location'] = train_data['location'].fillna('')

test_data['keyword'] = test_data['keyword'].fillna('')
test_data['location'] = test_data['location'].fillna('')

# Конкатенация текстовых столбцов
train_data['full_text'] = train_data['keyword'] + ' ' + train_data['location'] + ' ' + train_data['text']
test_data['full_text'] = test_data['keyword'] + ' ' + test_data['location'] + ' ' + test_data['text']

# Целевой столбец
X = train_data['full_text']
y = train_data['target']

X_test = test_data['full_text']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Tokenization setup
tokenizer = Tokenizer(
    lowercasing=True,
    separator_type='BySense',
    token_types=['Word', 'Number']
)


stop_words = set(('be', 'is', 'are', 'the', 'an', 'of', 'and', 'in'))
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def tokenize_texts(texts):
    return [tokenizer.tokenize(text) for text in texts]


def filter_stop_words(tokens):
    return list(filter(lambda x: x not in stop_words, tokens))


def lemmatize_tokens(tokens):
    return list(map(lambda t: lemmatizer.lemmatize(t), tokens))

# Text preprocessing function
def preprocess_texts(texts):
    tokenized_text = tokenize_texts(texts)
    tokenized_text_no_stop = [filter_stop_words(tokens) for tokens in tokenized_text]
    lemmatized_text = [" ".join(lemmatize_tokens(tokens)) for tokens in tokenized_text_no_stop]
    return lemmatized_text

# Apply preprocessing to all datasets
X_train_processed = preprocess_texts(X_train)
X_val_processed = preprocess_texts(X_val)
X_test_processed = preprocess_texts(X_test)

# Вывод примеров
print("Примеры предобработанного текста (обучающая выборка):")
for i in range(5):  # Выведем первые 5 примеров
    print(f"Оригинал: {X_train.iloc[i]}")
    print(f"Предобработка: {X_train_processed[i]}\n")


# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words=None
) # Stop words already removed


X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_processed).toarray()
X_val_tfidf = tfidf_vectorizer.transform(X_val_processed).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_processed).toarray()

X_train_tfidf_df = pd.DataFrame(X_train_tfidf, columns=tfidf_vectorizer.get_feature_names_out())
X_val_tfidf_df = pd.DataFrame(X_val_tfidf, columns=tfidf_vectorizer.get_feature_names_out())
X_test_tfidf_df = pd.DataFrame(X_test_tfidf, columns=tfidf_vectorizer.get_feature_names_out())


train_pool = Pool(data=X_train_tfidf_df, label=y_train)
val_pool = Pool(data=X_val_tfidf_df, label=y_val)
test_pool = Pool(data=X_test_tfidf_df)


# Создание модели
model = CatBoostClassifier(
    iterations=600,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    task_type='GPU',  # Замените на 'GPU', если доступно,
)

# Обучение модели
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

# Предсказания и метрики на валидационных данных
val_preds = model.predict(val_pool)
accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", accuracy)
print("Validation Classification Report:\n", classification_report(y_val, val_preds))
print('---------')
print(accuracy)
print("STOP")



[nltk_data] Downloading package wordnet to /home/bulat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Примеры предобработанного текста (обучающая выборка):
Оригинал: snowstorm South, USA Sassy city girl country hunk stranded in Smoky Mountain snowstorm #AoMS http://t.co/nkKcTttsD9 #ibooklove #bookboost
Предобработка: snowstorm south usa sassy city girl country hunk stranded smoky mountain snowstorm aoms http t co ibooklove bookboost

Оригинал: armageddon Worldwide God's Kingdom (Heavenly Gov't) will rule over all people on the earth after Armageddon.  http://t.co/8HGcBXUkz0  http://t.co/4kopkCyvTt
Предобработка: armageddon worldwide god's kingdom heavenly gov't will rule over all people on earth after armageddon http t co http t co

Оригинал: body%20bagging Cloud 9 Mopheme and Bigstar Johnson are a problem in this game body bagging niggas #VuzuHustle
Предобработка: body cloud 9 mopheme bigstar johnson a problem this game body bagging nigga vuzuhustle

Оригинал: whirlwind Sheff/Bangor/Salamanca/Madrid @VixMeldrew sounds like a whirlwind life!
Предобработка: whirlwind sheff bangor salama

In [11]:
# Предсказания на тестовых данных
test_preds = model.predict(test_pool)

# Создание DataFrame с предсказаниями для submission
submission_df = pd.DataFrame({'id': test_data['id'], 'target': test_preds})

# Сохранение в CSV
submission_df.to_csv('preds/NLPwDT_05_pred_catboost_with_preprocess_text.csv', index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Загрузка данных
train_path = 'nlp-getting-started/train.csv'
test_path = 'nlp-getting-started/test.csv'
ss_path = 'nlp-getting-started/sample_submission.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
ss_data = pd.read_csv(ss_path)

from catboost.text_processing import Tokenizer

simple_tokenizer = Tokenizer()

def tokenize_texts(texts):
    return [simple_tokenizer.tokenize(text) for text in texts]

simple_tokenized_text = tokenize_texts()
simple_tokenized_text