In [1]:
import pandas as pd
import numpy as np

In [2]:
from tqdm import tqdm

In [3]:
# Load the tab-separated reviews table and preview its first rows.
reviews = pd.read_csv("data/reviews.tsv", delimiter="\t")
reviews.head()

Unnamed: 0,id,text
0,43591,"Мармелад в целом неплохой, но цены завышены, м..."
1,43591,"Не нравится, что товар выложен открыто, слишко..."
2,43591,"Часто попадается сухой мармелад, дубовый впере..."
3,43591,"Персонал был одет в костюмы пиратов, а ассорти..."
4,43591,"Вкусный мармелад с широким ассортиментом форм,..."


In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used to vectorise the review texts.
model_name = 'intfloat/multilingual-e5-base'
model = SentenceTransformer(model_name)

# Preferred torch device string.
# NOTE(review): this value is not passed to SentenceTransformer in this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Accumulators filled by the per-place embedding loop:
# one vector per place in `all_vecs`, the matching place id in `ids`.
all_vecs, ids = [], []

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Number of components kept by the dimensionality reduction of the embeddings.
n_components = 96
# Maximum number of reviews used per place.
top_n = 10

In [6]:
# Build one embedding vector per place: the mean of the normalised embeddings
# of its `top_n` longest reviews.
#
# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of every place, which would silently misalign the
# stacked matrix with `ids` in the following cells.
all_vecs = []
ids = []

# Loop-invariant: size of the vectors produced by the embedding model.
embedding_dim = model.get_sentence_embedding_dimension()

for place_id, group in tqdm(reviews.groupby('id')):
    texts = group['text'].dropna().tolist()
    # Keep only the longest reviews.
    texts = sorted(texts, key=len, reverse=True)[:top_n]
    ids.append(place_id)

    if not texts:
        # A place whose reviews are all missing gets a zero vector.
        all_vecs.append(np.zeros(embedding_dim))
        continue

    # Each text is prefixed with "passage: " before encoding.
    passages = [f"passage: {t}" for t in texts]
    emb = model.encode(passages, normalize_embeddings=True)
    all_vecs.append(np.mean(emb, axis=0))

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 38770/38770 [22:19<00:00, 28.95it/s]


In [None]:
from sklearn.decomposition import PCA, TruncatedSVD


def _embedding_frame(matrix, place_ids):
    """Wrap a reduced matrix into a DataFrame with emb_<i> columns plus an id column."""
    frame = pd.DataFrame(matrix, columns=[f'emb_{i}' for i in range(n_components)])
    frame['id'] = place_ids
    return frame


# Stack the per-place vectors into a single 2-D array (one row per place).
X = np.vstack(all_vecs)

# Variant 1: PCA.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X)
emb_df = _embedding_frame(X_pca, ids)

# or

# Variant 2: TruncatedSVD.
# NOTE(review): both variants run when the cell is executed as a whole, so the
# SVD-based frame below is the one left in `emb_df`.
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_svd = svd.fit_transform(X)
emb_df = _embedding_frame(X_svd, ids)

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
# Score review sentiment per place with a 3-class Russian sentiment classifier
# and aggregate the class probabilities over the first `top_n` reviews.
model_name = 'blanchefort/rubert-base-cased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

batch_size = 32  # tunable; 16-64 works well

# Resolve the class indices from the checkpoint's own label mapping instead of
# assuming a neg/neu/pos order. The fallbacks follow the order documented for
# this checkpoint (0 = NEUTRAL, 1 = POSITIVE, 2 = NEGATIVE).
label_to_idx = {str(label).upper(): int(idx) for idx, label in model.config.id2label.items()}
neu_idx = label_to_idx.get('NEUTRAL', 0)
pos_idx = label_to_idx.get('POSITIVE', 1)
neg_idx = label_to_idx.get('NEGATIVE', 2)

sentiments = []
for place_id, group in tqdm(reviews.groupby('id')):
    texts = group['text'].dropna().tolist()[:top_n]
    if not texts:
        # No usable text for this place: emit an all-zero record.
        sentiments.append({'id': place_id, 'pos_mean': 0, 'neu_mean': 0, 'neg_mean': 0, 'sent_score': 0})
        continue

    pos, neu, neg = [], [], []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()

        neg.extend(probs[:, neg_idx])
        neu.extend(probs[:, neu_idx])
        pos.extend(probs[:, pos_idx])

    pos_mean = np.mean(pos)
    neu_mean = np.mean(neu)
    neg_mean = np.mean(neg)
    sentiments.append({
        'id': place_id,
        'pos_mean': pos_mean,
        'neu_mean': neu_mean,
        'neg_mean': neg_mean,
        # Net sentiment: mean positive probability minus mean negative probability.
        'sent_score': pos_mean - neg_mean
    })

100%|██████████| 38770/38770 [18:02<00:00, 35.82it/s]


In [10]:
# Convert the per-place sentiment records into a DataFrame.
# The later merge cell refers to this table as `sent_df`, so bind that name
# here; `sentiments` is kept as an alias for the cells that already use it.
sent_df = pd.DataFrame(sentiments)
sentiments = sent_df

In [11]:
# Persist both feature tables as zstd-compressed parquet files.
for frame, path in ((emb_df, 'emb_df.parquet'), (sentiments, "sentiments.parquet")):
    frame.to_parquet(path, compression="zstd")

In [None]:
# Load the tab-separated test table.
test = pd.read_csv("data/test.tsv", delimiter="\t")

In [None]:
# Preview the first three rows of the test table.
test.head(n=3)

Unnamed: 0,id,name,coordinates,category,address,traffic_300m,homes_300m,works_300m,female_300m,train_ticket_order_300m,...,doramas_1000m,computer_components_1000m,humor_1000m,car_market_1000m,no_higher_education_1000m,goods_for_moms_and_babies_1000m,age_25-34_1000m,male_1000m,phone_repair_1000m,mean_income_1000m
0,21472,Счастье,"[37.533334, 55.790246]",candy_shop,"Ходынский бул., 4, Москва",62672,4709.110524,4298.125296,38987.0,961.0,...,1706.0,5764.0,32190.0,12370.0,265916.0,221.0,226914.0,399582.0,2255.0,122883.795473
1,9837,O'STIN,"[37.886829, 55.751627]",baby_clothes,"Носовихинское ш., 45, Реутов",110226,12987.989255,15235.256665,96081.0,1346.0,...,1266.0,3930.0,17037.0,9044.0,233487.0,102.0,214465.0,320022.0,1801.0,113878.735454
2,41791,Дровосек,"[37.474419, 55.863549]",barbershop,"Беломорская ул., 18А, корп. 2, Москва",81080,9575.248571,9463.322898,57147.0,1506.0,...,1915.0,6549.0,32133.0,13745.0,298014.0,305.0,253803.0,431408.0,2426.0,119179.257929


In [None]:
# Apply the `prep` function to the test table.
# NOTE(review): `prep` is not defined in the visible part of this notebook.
test = test.pipe(prep)

In [None]:
# Attach the embedding and sentiment features by place id; left joins keep
# every test row even when a place has no matching feature row.
test = (
    test
    .merge(emb_df, on='id', how='left')
    .merge(sent_df, on='id', how='left')
)

In [None]:
# Assign each test row to a cluster of the existing `kmeans` model from its coordinates.
geo_coords = test[['lat', 'lon']]
test['geo_cluster'] = kmeans.predict(geo_coords)

In [None]:
# Mean sentiment score within each geo cluster, and each row's deviation from it.
cluster_sentiment = test.groupby('geo_cluster')['sent_score']
test['geo_sentiment_mean'] = cluster_sentiment.transform('mean')
test['geo_sentiment_diff'] = test['sent_score'].sub(test['geo_sentiment_mean'])

In [None]:
# Replace missing values in every text column with an empty string.
for text_col in text_cols:
    test[text_col] = test[text_col].fillna(value='')

In [None]:
# Transform each text column with its own vectorizer (fitted on train) and
# concatenate the resulting matrices column-wise into one CSR matrix.
tfidf_features_test = [
    tfidf_vectorizers[col].transform(test[col])
    for col in text_cols
]

X_test_tfidf = hstack(tfidf_features_test).tocsr()

In [None]:
# Numeric features: run them through `num_imputer` and store the result as sparse CSR.
num_array_test = csr_matrix(num_imputer.transform(test[num_cols]))

# Categorical features: one-hot encoder fitted on train.
ohe_array_test = ohe.transform(test[cat_cols])

# Final feature matrix for the non-CatBoost models.
feature_parts = [num_array_test, ohe_array_test, X_test_tfidf]
X_test_all = hstack(feature_parts).tocsr()

In [None]:
# Base-model predictions on the test set.
# NOTE(review): `X` must expose `.columns` here; elsewhere in this notebook `X`
# is bound to the output of np.vstack, so confirm which object is intended.
cat_inputs = test[X.columns]
pred_cat_test = cat_model.predict(cat_inputs)

# The remaining models all consume the assembled matrix `X_test_all`.
pred_xgb_test, pred_lgb_test, pred_knn_test = (
    estimator.predict(X_test_all)
    for estimator in (xgb_model, lgb_model, knn_model)
)

In [None]:
# One column per base model, one row per test sample; fed to the meta-model.
base_predictions = (pred_cat_test, pred_xgb_test, pred_lgb_test, pred_knn_test)
stack_test = np.column_stack(base_predictions)
y_pred_test = meta_model.predict(stack_test)

In [None]:
# Store the stacked prediction rounded to two decimal places.
test['target'] = np.around(y_pred_test, decimals=2)

In [None]:
# Write the submission file with only the id and target columns, without the index.
test.to_csv('stacking1.csv', columns=['id', 'target'], index=False)