# Библиотеки

In [2]:
import numpy as np
import pandas as pd
import joblib
import nltk
import category_encoders as ce
import spacy
import re
import emoji
import spacy
import time
from tqdm import tqdm

# Данные

In [3]:
train_df = pd.read_csv("train_df.csv")
val_df = pd.read_csv("val_df.csv")
test_df = pd.read_csv("test_df.csv")

In [4]:
bert_predictions_train = np.load("predictions_train.npy")
bert_predictions_val = np.load("predictions_val.npy")
bert_predictions_test = np.load("predictions_test.npy")

In [5]:
bert_predictions_train_proba = np.load("predictions_train_proba.npy")
bert_predictions_val_proba = np.load("predictions_val_proba.npy")
bert_predictions_test_proba = np.load("predictions_test_proba.npy")

# Tfidf + svd

In [13]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])
if "lemmatizer" not in nlp.pipe_names:
    nlp.add_pipe("lemmatizer", name="lemmatizer")

URL_RE = re.compile(r'https?://\S+|www\.\S+')
USER_RE = re.compile(r'@\w+')
NUM_RE  = re.compile(r'\b\d+\b')

def spacy_tweet_analyzer(text: str):
    text = text.lower()
    text = URL_RE.sub(" URL ", text)
    text = USER_RE.sub(" MENTION ", text)
    text = NUM_RE.sub(" NUM ", text)
    text = emoji.demojize(text, language='en')                 
    text = re.sub(r':([a-z0-9_+\-]+):', r' EMOJI_\1 ', text) 
    text = re.sub(r'#([a-z0-9_]+)', r' hashtag_\1 ', text)     
    text = re.sub(r'\s+', ' ', text).strip()


    doc = nlp(text)

    tokens = []
    for t in doc:
        lemma = t.lemma_.lower()

        if lemma in {"mention", "url", "num"}:
            tokens.append(lemma)
            continue
        if lemma.startswith("hashtag_") or lemma.startswith("emoji_"):
            tokens.append(lemma)
            continue

        if t.is_alpha and not t.is_stop and len(lemma) > 1:
            tokens.append(lemma)

    return tokens


In [11]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer(
    max_features=20000,
    sublinear_tf=True,
    analyzer=spacy_tweet_analyzer,
    ngram_range=(1, 2)
)

In [17]:
tfidf_train = vectorizer.fit_transform(train_df['full_text'])



In [18]:
svd = TruncatedSVD(n_components=400, random_state=42)
svd_train = svd.fit_transform(tfidf_train)

In [19]:
tfidf_val = vectorizer.transform(val_df['full_text'])
svd_val = svd.transform(tfidf_val)

In [20]:
tfidf_test = vectorizer.transform(test_df['full_text'])
svd_test = svd.transform(tfidf_test)

In [40]:
np.save("svd_train.npy", svd_train)
np.save("svd_val.npy", svd_val)
np.save("svd_test.npy", svd_test)

In [8]:
svd_train = np.load("svd_train.npy")
svd_val = np.load("svd_val.npy")
svd_test = np.load("svd_test.npy")

In [26]:
joblib.dump(vectorizer, "vectorizer.joblib")
joblib.dump(tfidf_train, "tfidf_train.joblib")
joblib.dump(tfidf_val, "tfidf_val.joblib")
joblib.dump(tfidf_test, "tfidf_test.joblib")

['tfidf_test.joblib']

In [15]:
vectorizer = joblib.load("vectorizer.joblib")
tfidf_train = joblib.load("tfidf_train.joblib")
tfidf_val = joblib.load("tfidf_val.joblib")
tfidf_test = joblib.load("tfidf_test.joblib")

# Модели

## CatBoostClassifier

In [21]:
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from sklearn.datasets import make_classification
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score


In [18]:
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

In [19]:
y_train = train_df['type'].map(label_map).astype('int64')
y_val   = val_df['type'].map(label_map).astype('int64')
y_test   = test_df['type'].map(label_map).astype('int64')

In [39]:
print("\nЗапуск Hyperopt...")
start_time_hyperopt = time.time()

space = {
    'iterations': hp.quniform('iterations', 200, 1000, 100),
    'learning_rate':hp.uniform('learning_rate', 0.01, 0.9),
    'depth': hp.quniform('depth', 3, 5, 1),
    'l2_leaf_reg': hp.quniform('l2_leaf_reg', 0.1, 10.1, 1),
    'bagging_temperature': hp.quniform('bagging_temperature', 0.1, 1, 0.4)}


def objective(params):
    model_params = {
        'iterations': int(params['iterations']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': params['l2_leaf_reg'], 
        'bagging_temperature': params['bagging_temperature'],
        'verbose': False,
        'allow_writing_files': False,
        'random_seed': 42,
        'od_type': 'Iter',     
        'od_wait': 100,
        'thread_count': -1,
        'loss_function': 'MultiClass'
        
    }

    model = CatBoostClassifier(**model_params)

    scores = cross_val_score(model, svd_train, y_train, cv=5, scoring="f1_macro", n_jobs=-1, error_score='raise')

    hyperopt_pbar.update(1)

    return {'loss': -np.mean(scores), 'status': STATUS_OK, 'params': model_params}

n_iter_hyperopt = 30
hyperopt_pbar = tqdm(total=n_iter_hyperopt, desc="Hyperopt")

trials = Trials()
best_hyperopt = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter_hyperopt,
    trials=trials,
    verbose=0
)

hyperopt_pbar.close()

best_trial_idx = int(np.argmin([t['result']['loss'] for t in trials.trials]))
best_params_hyperopt1 = trials.trials[best_trial_idx]['result']['params']
best_score_hyperopt = -min([t['result']['loss'] for t in trials.trials])

time_hyperopt = time.time() - start_time_hyperopt

best_model_hyperopt = CatBoostClassifier(**best_params_hyperopt1)
best_model_hyperopt.fit(
    svd_train, y_train,
    verbose=False  
)

y_test_pred_hyperopt = best_model_hyperopt.predict(svd_val)
metrics_hyperopt = f1_score(y_val, y_test_pred_hyperopt, average='macro')

print(f"\nHyperopt завершен за {time_hyperopt:.2f} секунд")
print(f"Лучшие параметры: {best_params_hyperopt1}")
print(f"Лучший CV score: {best_score_hyperopt:.4f}")
print(f"F1 на валидации: {metrics_hyperopt:.4f}")


Hyperopt завершен за 3703.20 секунд
Лучшие параметры: {'iterations': 1000, 'depth': 5, 'learning_rate': 0.5708795132134128, 'l2_leaf_reg': 4.0, 'bagging_temperature': 0.0, 'verbose': False, 'allow_writing_files': False, 'random_seed': 42, 'od_type': 'Iter', 'od_wait': 100, 'thread_count': -1, 'loss_function': 'MultiClass'}
Лучший CV score: 0.8678
F1 на валидации: 0.8337


In [22]:
best_model_hyperopt.save_model("catboost_model_clf.cbm")


## XGBclf

In [28]:
from xgboost import XGBClassifier

In [38]:
print("\nЗапуск Hyperopt...")
start_time_hyperopt = time.time()

space = {
    'n_estimators':      hp.quniform('n_estimators', 200, 1000, 100),
    'learning_rate':     hp.uniform('learning_rate', 0.02, 0.4),
    'max_depth':         hp.choice('max_depth', [3, 4, 5]),
    'min_child_weight':  hp.quniform('min_child_weight', 1, 10, 1),
    'subsample':         hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree':  hp.uniform('colsample_bytree', 0.5, 1.0),
    'reg_alpha':         hp.uniform('reg_alpha', 0.0, 5.0),   
    'reg_lambda':        hp.uniform('reg_lambda', 0.0, 20.0), 
}

def objective(params):
    model_params = {
        'n_estimators':      int(params['n_estimators']),
        'max_depth':         int(params['max_depth']),
        'learning_rate':     float(params['learning_rate']),
        'min_child_weight':  int(params['min_child_weight']),
        'subsample':         float(params['subsample']),
        'colsample_bytree':  float(params['colsample_bytree']),
        'reg_alpha':         float(params['reg_alpha']),
        'reg_lambda':        float(params['reg_lambda']),
        'random_state':      42,
        'n_jobs':            -1,
        'verbosity':         0,             
        'tree_method':       'hist',        
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
    }

    model = XGBClassifier(**model_params)

    scores = cross_val_score(
        model,
        svd_train, y_train,
        cv=5,
        scoring="f1_macro",      
        n_jobs=-1,
        error_score='raise'
    )

    hyperopt_pbar.update(1)
    return {'loss': -float(np.mean(scores)), 'status': STATUS_OK, 'params': model_params}


n_iter_hyperopt = 20  
hyperopt_pbar = tqdm(total=n_iter_hyperopt, desc="Hyperopt")

trials = Trials()
best_hyperopt = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=n_iter_hyperopt,
    trials=trials,
    verbose=0
)

hyperopt_pbar.close()

best_trial_idx = int(np.argmin([t['result']['loss'] for t in trials.trials]))
best_params_hyperopt3 = trials.trials[best_trial_idx]['result']['params']
best_score_hyperopt = -min([t['result']['loss'] for t in trials.trials])
time_hyperopt = time.time() - start_time_hyperopt

best_model_xgb = XGBClassifier(**best_params_hyperopt3)
best_model_xgb.fit(svd_train, y_train, verbose=False)

y_test_pred_hyperopt = best_model_xgb.predict(svd_val)
metrics_hyperopt = f1_score(
    y_val, y_test_pred_hyperopt,
    average='macro'  
)

print(f"\nHyperopt завершен за {time_hyperopt:.2f} секунд")
print(f"Лучшие параметры: {best_params_hyperopt3}")
print(f"Лучший CV score (macro-F1): {best_score_hyperopt:.4f}")
print(f"F1 на тесте (macro): {metrics_hyperopt:.4f}")


Hyperopt завершен за 3703.20 секунд
Лучшие параметры: {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2975111730313384, 'min_child_weight': 7, 'subsample': 0.7244308832253615, 'colsample_bytree': 0.5630216689866965, 'reg_alpha': 2.991233271938252, 'reg_lambda': 3.7506588341945646, 'random_state': 42, 'n_jobs': -1, 'verbosity': 0, 'tree_method': 'hist', 'objective': 'multi:softprob', 'eval_metric': 'mlogloss'}
Лучший CV score (macro-F1): 0.8678
F1 на тесте (macro): 0.8789


In [34]:
best_model_xgb.save_model("xgb_model.json")


## ComplementNB

In [22]:
from sklearn.naive_bayes import ComplementNB, MultinomialNB


In [23]:
clf_nb = ComplementNB(alpha=0.5)              
clf_nb.fit(tfidf_train, y_train)
f1_score(y_val, clf_nb.predict(tfidf_val), average='macro')

0.7614923165782019

## LogReg

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
clf_lr = LogisticRegression(
    multi_class='multinomial',   
    solver='lbfgs',              
    max_iter=2000,
    n_jobs=-1,
    C=2.0,                     
    random_state=42
)

clf_lr.fit(svd_train, y_train)
f1_score(y_val, clf_lr.predict(svd_val), average='macro')



0.614787758341153

## Загрузка моделей

In [83]:
cat_clf = CatBoostClassifier()
cat_clf.load_model("catboost_model_clf.cbm")

<catboost.core.CatBoostClassifier at 0x2da92dfa990>

In [84]:
xgb_clf = XGBClassifier()
xgb_clf.load_model("xgb_model.json")

## Мета модель (CatBoostClassifier, XGBClassifier, ComplementNB, LogReg, Bert)

In [30]:
meta_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=2000,
    random_state=42
)

In [35]:
train_df_end = np.hstack([cat_clf.predict_proba(svd_train), xgb_clf.predict_proba(svd_train), clf_lr.predict_proba(svd_train)
                          , clf_nb.predict_proba(tfidf_train)
                          ,bert_predictions_train_proba])

In [37]:
val_df_end = np.hstack([cat_clf.predict_proba(svd_val), xgb_clf.predict_proba(svd_val), clf_lr.predict_proba(svd_val)
                          , clf_nb.predict_proba(tfidf_val)
                          ,bert_predictions_val_proba])

In [38]:
test_df_end = np.hstack([cat_clf.predict_proba(svd_test), xgb_clf.predict_proba(svd_test), clf_lr.predict_proba(svd_test)
                          , clf_nb.predict_proba(tfidf_test)
                          ,bert_predictions_test_proba])

In [39]:
meta_model.fit(train_df_end, y_train)



In [40]:
f1_score(y_val, meta_model.predict(val_df_end), average='macro')

0.9434812237285194

In [41]:
f1_score(y_test, meta_model.predict(test_df_end), average='macro')

0.9858924186351287

## Стакинг (CatBoostClassifier, XGBClassifier, ComplementNB, LogReg)

In [67]:
from sklearn.model_selection import StratifiedKFold

In [68]:
best_params_cat = {'iterations': 1000, 'depth': 5, 'learning_rate': 0.5708795132134128, 'l2_leaf_reg': 4.0, 'bagging_temperature': 0.0, 'verbose': False, 'allow_writing_files': False, 'random_seed': 42, 'od_type': 'Iter', 'od_wait': 100, 'thread_count': -1, 'loss_function': 'MultiClass'}

In [69]:
best_params_xgb =  {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2975111730313384, 'min_child_weight': 7, 'subsample': 0.7244308832253615, 'colsample_bytree': 0.5630216689866965, 'reg_alpha': 2.991233271938252, 'reg_lambda': 3.7506588341945646, 'random_state': 42, 'n_jobs': -1, 'verbosity': 0, 'tree_method': 'hist', 'objective': 'multi:softprob', 'eval_metric': 'mlogloss'}

In [70]:
model_sta = [CatBoostClassifier(**best_params_cat),
             XGBClassifier(**best_params_xgb),
             LogisticRegression(multi_class='multinomial',solver='lbfgs',max_iter=2000,random_state=42)]

In [71]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_text = train_df["full_text"].to_numpy()
meta_train = np.zeros((train_df.shape[0], 12), dtype=float)
for train_idx, val_idx in kf.split(X_text, y_train):
    X_tr, X_val = X_text[train_idx], X_text[val_idx]
    y_tr = y_train[train_idx]
    vectorizer = TfidfVectorizer(
        max_features=20000,
        sublinear_tf=True,
        analyzer=spacy_tweet_analyzer,
        ngram_range=(1, 2)
    )
    tfidf_tr = vectorizer.fit_transform(X_tr)
    svd = TruncatedSVD(n_components=400, random_state=42)
    svd_tr = svd.fit_transform(tfidf_tr)
    tfidf_va = vectorizer.transform(X_val)
    svd_va = svd.transform(tfidf_va)
    for i, a_model in enumerate(model_sta):
        a_model.fit(svd_tr, y_tr)
        meta_train[val_idx, i*3:(i+1)*3] = a_model.predict_proba(svd_va)
    clf_nb_s = ComplementNB(alpha=0.5)              
    clf_nb_s.fit(tfidf_tr, y_tr)
    meta_train[val_idx,  9:12] =  clf_nb_s.predict_proba(tfidf_va)



In [73]:
meta_val = np.zeros((val_df.shape[0], 12), dtype=float)
meta_test = np.zeros((test_df.shape[0], 12), dtype=float)

In [74]:
model_sta_vt = [CatBoostClassifier(**best_params_cat),
             XGBClassifier(**best_params_xgb),
             LogisticRegression(multi_class='multinomial',solver='lbfgs',max_iter=2000,random_state=42)]

In [75]:
for i, a_model in enumerate(model_sta_vt):
    a_model.fit(svd_train, y_train)
    meta_val[:,i*3:(i+1)*3] = a_model.predict_proba(svd_val)
    meta_test[:,i*3:(i+1)*3] = a_model.predict_proba(svd_test)



In [76]:
clf_nb_svt = ComplementNB(alpha=0.5)              
clf_nb_svt.fit(tfidf_train, y_train)
meta_val[:,  9:12] =  clf_nb_svt.predict_proba(tfidf_val)
meta_test[:,  9:12] =  clf_nb_svt.predict_proba(tfidf_test)

In [77]:
meta_model_2 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=2000,
    random_state=42
)

In [78]:
meta_model_2.fit(meta_train, y_train)



In [79]:
f1_score(y_val, meta_model_2.predict(meta_val), average='macro')

0.9101066803371006

In [80]:
f1_score(y_test, meta_model_2.predict(meta_test), average='macro')

0.9770118149204366

## Final results

In [92]:
f1_train = []
f1_val = []
f1_test = []
for a_model in [cat_clf, xgb_clf, clf_lr]:
    f1_train.append(f1_score(y_train, a_model.predict(svd_train), average='macro'))
    f1_val.append(f1_score(y_val, a_model.predict(svd_val), average='macro'))
    f1_test.append(f1_score(y_test, a_model.predict(svd_test), average='macro'))
    
f1_train.append(f1_score(y_train, clf_nb.predict(tfidf_train), average='macro'))
f1_val.append(f1_score(y_val, clf_nb.predict(tfidf_val), average='macro'))
f1_test.append(f1_score(y_test, clf_nb.predict(tfidf_test), average='macro'))
f1_train.append(f1_score(y_train, bert_predictions_train, average='macro'))
f1_val.append(f1_score(y_val, bert_predictions_val, average='macro'))
f1_test.append(f1_score(y_test, bert_predictions_test, average='macro'))

In [94]:
f1_train.append(f1_score(y_train, meta_model.predict(train_df_end), average='macro'))
f1_val.append(f1_score(y_val, meta_model.predict(val_df_end), average='macro'))
f1_test.append(f1_score(y_test, meta_model.predict(test_df_end), average='macro'))

In [95]:
f1_train.append(f1_score(y_train, meta_model_2.predict(meta_train), average='macro'))
f1_val.append(f1_score(y_val, meta_model_2.predict(meta_val), average='macro'))
f1_test.append(f1_score(y_test, meta_model_2.predict(meta_test), average='macro'))

In [97]:
df_res = pd.DataFrame({"Модели": ["CatBoostClassifier", "XGBClassifier", "LogisticRegression", "ComplementNB", "Bert", "Meta model(над всеми)", "Стакинг(без берта)"]
            , "F1_train": f1_train
            , "F1_val": f1_val
            , "F1_test": f1_test
             })

In [98]:
df_res.style.highlight_max(subset= ['F1_train', 'F1_val', "F1_test"], axis=0, color='palegreen')

Unnamed: 0,Модели,F1_train,F1_val,F1_test
0,CatBoostClassifier,0.976728,0.833725,0.961325
1,XGBClassifier,0.983492,0.878912,0.970076
2,LogisticRegression,0.620242,0.614788,0.653057
3,ComplementNB,0.802316,0.761492,0.817077
4,Bert,0.982866,0.936804,0.975663
5,Meta model(над всеми),0.988229,0.943481,0.985892
6,Стакинг(без берта),0.89515,0.910107,0.977012
