#Modeling
На цьому етапі розглядаються різні підходи до побудови моделей для задачі визначення дублікатів запитань.

In [2]:
from google.colab import files
uploaded = files.upload()


Saving preprocessing.py to preprocessing.py
Saving evaluation.py to evaluation.py
Saving models.py to models.py


In [None]:
!ls

drive  evaluation.py  models.py  preprocessing.py  __pycache__	sample_data


In [3]:
import importlib

import evaluation
import preprocessing
import models

importlib.reload(evaluation)
importlib.reload(preprocessing)
importlib.reload(models)

from evaluation import evaluate_model, evaluate_with_threshold
from preprocessing import preprocess, fill_missing_questions, cosine_similarity_pairs, count_common_words,common_words_count_for_df,cosine_similarity_between_embeddings, normalize_embedding_vectors
from models import train_logistic_regression


In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score, confusion_matrix, classification_report
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier

In [5]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
train_df = pd.read_csv("/content/drive/MyDrive/Final_Project/quora_question_pairs_train_.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Final_Project/quora_question_pairs_test.csv")
pd.set_option('display.max_colwidth', None)

In [7]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,332278,459256,459257,The Iliad and the Odyssey in the Greek culture?,How do I prove that the pairs of three independent variables is also independent?,0
1,196656,297402,297403,What is practical management and what is strategic management?,What are the practical aspects of strategic management?,0
2,113125,184949,184950,How useful is MakeUseOf Answers?,"Is there any Q&A site that is not Yahoo answers, where hate speech is allowed?",0
3,266232,101283,163744,Which is the best place to reside in India and Why?,Which ia the best place to visit in India?,0
4,122738,17811,27517,Why do so many people ask questions on Quora that can be easily answered by any number of legitimate sources on the Web? Have they not heard of Google or Bing?,Why don't many people posting questions on Quora check Google first?,1


In [8]:
test_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,305985,429434,429435,Why is beef banned in India and not pork as well?,Is beef banned in india?,0
1,5193,10230,10231,At what valuation did Homejoy raise money in December of 2013?,"Should a wealthy founder self-fund his second startup then raise money at high valuation after getting traction, or raise money at low valuation before any traction?",0
2,123326,199422,199423,How do we judge?,How do I judge my love?,0
3,368557,327674,498931,Are Adderall and meth the same?,Are concerta and meth test the same?,0
4,369226,499645,499646,"If you had internet access to only one site for the rest of your life, which site would you pick?",Why is there .co.uk for British internet sites but only .fr for French ones?,0


In [9]:
print("train_df:", train_df.shape)
print("test_df :", test_df.shape)

train_df: (323432, 6)
test_df : (80858, 6)


In [10]:
train_df = fill_missing_questions(train_df)
test_df  = fill_missing_questions(test_df)

In [11]:
print("train_df:", train_df.shape)
print("test_df :", test_df.shape)

train_df: (323432, 6)
test_df : (80858, 6)


In [12]:
train_df.isna().sum(), test_df.isna().sum()

(id              0
 qid1            0
 qid2            0
 question1       0
 question2       0
 is_duplicate    0
 dtype: int64,
 id              0
 qid1            0
 qid2            0
 question1       0
 question2       0
 is_duplicate    0
 dtype: int64)

In [13]:
train_texts = train_df["question1"] + " " + train_df["question2"]
test_texts  = test_df["question1"]  + " " + test_df["question2"]


#Logistic Regression (Logistic Regression + багато ознак TF-IDF)


In [14]:
vectorizer = TfidfVectorizer(
    tokenizer=preprocess,
    token_pattern=None,
    ngram_range=(1, 2),
    max_features=50_000,
    min_df=2,
    max_df=0.95
)


In [15]:
X_train = vectorizer.fit_transform(train_texts)
X_test  = vectorizer.transform(test_texts)

y_train = train_df["is_duplicate"]
y_test  = test_df["is_duplicate"]

print("X_train:", X_train.shape)
print("X_test :", X_test.shape)

X_train: (323432, 50000)
X_test : (80858, 50000)


In [16]:
clf = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    C=1.0,
    max_iter=3000,
    class_weight="balanced",
    random_state=42
)

clf.fit(X_train, y_train)

In [None]:
probs = clf.predict_proba(X_test)[:, 1]
pred_05 = (probs >= 0.5).astype(int)

print("LogLoss:", log_loss(y_test, probs))
print("F1 @0.5:", f1_score(y_test, pred_05))
print("")
print("Confusion matrix @0.5:\n", confusion_matrix(y_test, pred_05))
print("")
print(classification_report(y_test, pred_05, digits=4))

LogLoss: 0.469893157461519
F1 @0.5: 0.7118330194601381

Confusion matrix @0.5:
 [[39817 11188]
 [ 7174 22679]]

              precision    recall  f1-score   support

           0     0.8473    0.7806    0.8126     51005
           1     0.6696    0.7597    0.7118     29853

    accuracy                         0.7729     80858
   macro avg     0.7585    0.7702    0.7622     80858
weighted avg     0.7817    0.7729    0.7754     80858



In [None]:
best_f1 = 0
best_threshold = 0

for t in [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]:
    preds = (probs >= t).astype(int)
    f1 = f1_score(y_test, preds)

    print("threshold:", t, "F1:", f1)

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print("\nBest threshold:", best_threshold)
print("Best F1:", best_f1)


threshold: 0.2 F1: 0.6386735908014278
threshold: 0.25 F1: 0.6611391407637395
threshold: 0.3 F1: 0.6786886876927669
threshold: 0.35 F1: 0.6926050596959975
threshold: 0.4 F1: 0.7027734097772088
threshold: 0.45 F1: 0.710546903830541
threshold: 0.5 F1: 0.7118330194601381
threshold: 0.55 F1: 0.7056073986277815
threshold: 0.6 F1: 0.6915854549341695

Best threshold: 0.5
Best F1: 0.7118330194601381


##Висновок
Модель Logistic Regression з TF-IDF ознаками працює стабільно і не потребує ручного підбору порогу. Стандартний поріг 0.5 забезпечує найкращий баланс між precision та recall, що свідчить про хорошу калібровку ймовірностей і достатню інформативність використаних ознак.

#Logistic Regression + TF-IDF + cosine similarity

In [88]:
vectorizer_cos = TfidfVectorizer(
    tokenizer=preprocess,
    token_pattern=None,
    ngram_range=(1, 2),
    max_features=50_000,
    min_df=2,
    max_df=0.95,
    norm="l2"
)


In [89]:
vectorizer_cos.fit(
    list(train_df["question1"]) + list(train_df["question2"])
)


In [90]:
X_q1_train = vectorizer_cos.transform(train_df["question1"])
X_q2_train = vectorizer_cos.transform(train_df["question2"])

X_q1_test  = vectorizer_cos.transform(test_df["question1"])
X_q2_test  = vectorizer_cos.transform(test_df["question2"])


In [91]:
cos_train = cosine_similarity_pairs(X_q1_train, X_q2_train)
cos_test  = cosine_similarity_pairs(X_q1_test,  X_q2_test)

In [92]:
cos_train = cos_train.reshape(-1, 1)
cos_test  = cos_test.reshape(-1, 1)

In [93]:
cos_train.shape, cos_test.shape

((323432, 1), (80858, 1))

In [None]:
X_train_cos = hstack([X_train, cos_train])
X_test_cos  = hstack([X_test,  cos_test])

print("X_train_cos:", X_train_cos.shape)
print("X_test_cos :", X_test_cos.shape)

X_train_cos: (323432, 50001)
X_test_cos : (80858, 50001)


In [None]:
model_cos = train_logistic_regression(X_train_cos, y_train)

In [None]:
results = evaluate_model(model_cos, X_test_cos, y_test)

In [None]:
print("LogLoss:", results["log_loss"])
print("F1:", results["f1"])
print("Confusion matrix:\n", results["confusion_matrix"])
print("Classification report:\n", results["classification_report"])


LogLoss: 0.4260844035044139
F1: 0.7086075190092268
Confusion matrix:
 [[44258  6747]
 [ 9770 20083]]
Classification report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84     51005
           1       0.75      0.67      0.71     29853

    accuracy                           0.80     80858
   macro avg       0.78      0.77      0.78     80858
weighted avg       0.79      0.80      0.79     80858



In [None]:
y_prob_cos = model_cos.predict_proba(X_test_cos)[:, 1]

NameError: name 'model_cos' is not defined

In [None]:
thresholds = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

for t in thresholds:
    f1 = evaluate_with_threshold(y_test, y_prob_cos, t)
    print("threshold:", t, "F1:", f1)

NameError: name 'y_prob_cos' is not defined

In [None]:
results_the_best_treshhold = evaluate_model(model_cos, X_test_cos, y_test, threshold=0.35)

In [None]:
print("LogLoss:", results_the_best_treshhold["log_loss"])
print("F1:", results_the_best_treshhold["f1"])
print("Confusion matrix:\n", results_the_best_treshhold["confusion_matrix"])
print("Classification report:\n", results_the_best_treshhold["classification_report"])

LogLoss: 0.4260844035044139
F1: 0.7389974537071914
Confusion matrix:
 [[39011 11994]
 [ 5329 24524]]
Classification report:
               precision    recall  f1-score   support

           0       0.88      0.76      0.82     51005
           1       0.67      0.82      0.74     29853

    accuracy                           0.79     80858
   macro avg       0.78      0.79      0.78     80858
weighted avg       0.80      0.79      0.79     80858



##Висновок
При стандартному порозі 0.5 модель демонструвала F1 ≈ 0.71. Зниження порогу до 0.35 дозволило збільшити F1-score до ≈ 0.74 завдяки суттєвому зростанню recall класу дублікатів, при незмінному log loss, що свідчить про стабільну калібровку ймовірностей.

Порівняння двох моделей логістичної регресії показало, що додавання cosine similarity між TF-IDF представленнями пари питань суттєво покращує якість класифікації. Оптимально налаштована модель з cosine similarity досягла F1-score ≈ 0.74 порівняно з ≈ 0.71 для моделі без cosine, а також продемонструвала нижчий log loss, що свідчить про кращу калібровку ймовірностей. Основний приріст якості досягається за рахунок підвищення recall класу дублікатів при збереженні прийнятного рівня precision.

#XGBOOST

In [None]:
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)

In [None]:
xgb_model.fit(X_train, y_train)

In [None]:
y_prob = xgb_model.predict_proba(X_test)[:, 1]

In [None]:
for t in thresholds:
    f1 = evaluate_with_threshold(y_test, y_prob, t)
    print("threshold:", t, "F1:", f1)

threshold: 0.2 F1: 0.616410212510968
threshold: 0.25 F1: 0.6433691340555336
threshold: 0.3 F1: 0.6649289220049335
threshold: 0.35 F1: 0.6731201334030497
threshold: 0.4 F1: 0.6603449918993615
threshold: 0.45 F1: 0.6297976524807374
threshold: 0.5 F1: 0.5750526811104973
threshold: 0.55 F1: 0.49925047854062404
threshold: 0.6 F1: 0.4109051031598276


In [None]:
results_xgb = evaluate_model(xgb_model, X_test, y_test, threshold=0.3)

In [None]:
print("LogLoss:", results_xgb["log_loss"])
print("F1:", results_xgb["f1"])
print("Confusion matrix:\n", results_xgb["confusion_matrix"])
print("\nClassification report:\n", results_xgb["classification_report"])

LogLoss: 0.5153539361073165
F1: 0.6649289220049335
Confusion matrix:
 [[28359 22646]
 [ 3706 26147]]

Classification report:
               precision    recall  f1-score   support

           0       0.88      0.56      0.68     51005
           1       0.54      0.88      0.66     29853

    accuracy                           0.67     80858
   macro avg       0.71      0.72      0.67     80858
weighted avg       0.76      0.67      0.68     80858



# XGBOOST + cosine_similarity + len_diff + common_words_count

In [94]:
# 1) Різниця довжин (len_diff)

train_len_q1 = train_df["question1"].astype(str).str.len()
train_len_q2 = train_df["question2"].astype(str).str.len()

test_len_q1 = test_df["question1"].astype(str).str.len()
test_len_q2 = test_df["question2"].astype(str).str.len()

train_len_diff = (train_len_q1 - train_len_q2).abs()
test_len_diff  = (test_len_q1 - test_len_q2).abs()


# 2) TF-IDF матриці для question1 та question2 окремо (для cosine)

X_train_q1 = vectorizer.transform(train_df["question1"].astype(str))
X_train_q2 = vectorizer.transform(train_df["question2"].astype(str))

X_test_q1 = vectorizer.transform(test_df["question1"].astype(str))
X_test_q2 = vectorizer.transform(test_df["question2"].astype(str))


# 3) Cosine similarity для кожної пари

train_cos = cosine_similarity_pairs(X_train_q1, X_train_q2)
test_cos  = cosine_similarity_pairs(X_test_q1, X_test_q2)


# 4) Кількість спільних слів (common_words_count)

train_common_words = common_words_count_for_df(train_df)
test_common_words  = common_words_count_for_df(test_df)


In [None]:
print(len(train_len_diff), len(train_cos), len(train_common_words), len(train_df))
print(len(test_len_diff), len(test_cos), len(test_common_words), len(test_df))

323432 323432 323432 323432
80858 80858 80858 80858


In [None]:
train_num = np.vstack([
    train_cos,
    train_len_diff.values,
    train_common_words.values
]).T

test_num = np.vstack([
    test_cos,
    test_len_diff.values,
    test_common_words.values
]).T

X_train_num = csr_matrix(train_num)
X_test_num  = csr_matrix(test_num)

X_train_full = hstack([X_train, X_train_num])
X_test_full  = hstack([X_test, X_test_num])

print("X_train_full:", X_train_full.shape)
print("X_test_full :", X_test_full.shape)


X_train_full: (323432, 50003)
X_test_full : (80858, 50003)


In [None]:
xgb_tfidf_plus_num = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)


In [None]:
xgb_tfidf_plus_num.fit(X_train_full, y_train)

In [None]:
y_prob_xgb_tfidf_plus_num = xgb_tfidf_plus_num.predict_proba(X_test_full)[:, 1]

In [None]:
best_thr = None
best_f1 = -1

for t in thresholds:
    f1 = evaluate_with_threshold(y_test, y_prob_xgb_tfidf_plus_num, t)
    print("threshold:", t, "F1:", f1)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = t


threshold: 0.2 F1: 0.6797713910175917
threshold: 0.25 F1: 0.6949746595835623
threshold: 0.3 F1: 0.7071575786875652
threshold: 0.35 F1: 0.7157341444542277
threshold: 0.4 F1: 0.7212924583020923
threshold: 0.45 F1: 0.7172205068631348
threshold: 0.5 F1: 0.6930696416497634
threshold: 0.55 F1: 0.638227039581922
threshold: 0.6 F1: 0.5321531791907514


In [None]:
results_xgb_tfidf_plus_num = evaluate_model(xgb_tfidf_plus_num, X_test_full, y_test,threshold= 0.4)


In [None]:
print("LogLoss:", results_xgb_tfidf_plus_num["log_loss"])
print("F1:", results_xgb_tfidf_plus_num["f1"])
print("Confusion matrix:\n", results_xgb_tfidf_plus_num["confusion_matrix"])
print("\nClassification report:\n", results_xgb_tfidf_plus_num["classification_report"])

LogLoss: 0.44892099419790277
F1: 0.7212924583020923
Confusion matrix:
 [[35664 15341]
 [ 4360 25493]]

Classification report:
               precision    recall  f1-score   support

           0       0.89      0.70      0.78     51005
           1       0.62      0.85      0.72     29853

    accuracy                           0.76     80858
   macro avg       0.76      0.78      0.75     80858
weighted avg       0.79      0.76      0.76     80858



#Висновок

У ході експериментів було протестовано як лінійні, так і ансамблеві моделі.

Логістична регресія з TF-IDF та cosine similarity показала найкращі результати за F1-score та LogLoss.
Хоча XGBoost із додатковими числовими ознаками покращив якість у порівнянні з базовою версією, він поступився логістичній регресії за всіма ключовими метриками.

# Sentence-BERT embeddings + класифікатор

In [17]:
!pip -q install sentence-transformers


In [18]:
from sentence_transformers import SentenceTransformer

In [19]:
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
train_q1_list = train_df["question1"].astype(str).tolist()
train_q2_list = train_df["question2"].astype(str).tolist()

test_q1_list = test_df["question1"].astype(str).tolist()
test_q2_list = test_df["question2"].astype(str).tolist()

In [21]:
train_emb_q1 = sbert_model.encode(
    train_q1_list,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/5054 [00:00<?, ?it/s]

In [22]:

train_emb_q2 = sbert_model.encode(
    train_q2_list,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/5054 [00:00<?, ?it/s]

In [23]:
test_emb_q1 = sbert_model.encode(
    test_q1_list,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

In [24]:

test_emb_q2 = sbert_model.encode(
    test_q2_list,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

In [25]:
print("train_emb_q1:", train_emb_q1.shape)
print("train_emb_q2:", train_emb_q2.shape)
print("test_emb_q1 :", test_emb_q1.shape)
print("test_emb_q2 :", test_emb_q2.shape)

train_emb_q1: (323432, 384)
train_emb_q2: (323432, 384)
test_emb_q1 : (80858, 384)
test_emb_q2 : (80858, 384)


In [26]:
train_sbert_cosine_similarity = cosine_similarity_between_embeddings(
    train_emb_q1,
    train_emb_q2
)

test_sbert_cosine_similarity = cosine_similarity_between_embeddings(
    test_emb_q1,
    test_emb_q2
)

print(train_sbert_cosine_similarity.shape)
print(test_sbert_cosine_similarity.shape)


(323432,)
(80858,)


In [27]:
train_abs_diff = np.abs(train_emb_q1 - train_emb_q2)
test_abs_diff  = np.abs(test_emb_q1 - test_emb_q2)

print(train_abs_diff.shape)
print(test_abs_diff.shape)


(323432, 384)
(80858, 384)


In [28]:
X_train_sbert = np.hstack([
    train_sbert_cosine_similarity.reshape(-1, 1),
    train_abs_diff
])

X_test_sbert = np.hstack([
    test_sbert_cosine_similarity.reshape(-1, 1),
    test_abs_diff
])


In [27]:
sbert_logreg_model = LogisticRegression(max_iter=2000)
sbert_logreg_model.fit(X_train_sbert, y_train)

In [28]:
y_prob_sbert = sbert_logreg_model.predict_proba(X_test_sbert)[:, 1]

thresholds = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

best_threshold_sbert = None
best_f1_sbert = -1

for threshold in thresholds:
    f1 = evaluate_with_threshold(y_test, y_prob_sbert, threshold)
    print("threshold:", threshold, "F1:", f1)

    if f1 > best_f1_sbert:
        best_f1_sbert = f1
        best_threshold_sbert = threshold

print("\nBEST threshold:", best_threshold_sbert)
print("BEST F1:", best_f1_sbert)


threshold: 0.2 F1: 0.7311120570669349
threshold: 0.25 F1: 0.7402614799335584
threshold: 0.3 F1: 0.7453666398066076
threshold: 0.35 F1: 0.7465594512414797
threshold: 0.4 F1: 0.7433479740364868
threshold: 0.45 F1: 0.7365575435580904
threshold: 0.5 F1: 0.7236437405928932
threshold: 0.55 F1: 0.706534081134088
threshold: 0.6 F1: 0.6820008364092587

BEST threshold: 0.35
BEST F1: 0.7465594512414797


In [29]:
results_sbert_logreg = evaluate_model(
    sbert_logreg_model,
    X_test_sbert,
    y_test,
    threshold=best_threshold_sbert
)
print("LogLoss:", results_sbert_logreg["log_loss"])
print("F1:", results_sbert_logreg["f1"])
print("Confusion matrix:\n", results_sbert_logreg["confusion_matrix"])
print("\nClassification report:\n", results_sbert_logreg["classification_report"])

LogLoss: 0.41088977341047517
F1: 0.7465594512414797
Confusion matrix:
 [[37368 13637]
 [ 3950 25903]]

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.73      0.81     51005
           1       0.66      0.87      0.75     29853

    accuracy                           0.78     80858
   macro avg       0.78      0.80      0.78     80858
weighted avg       0.81      0.78      0.79     80858



#Sentence-BERT + features

In [29]:
train_embeddings_q1_normalized = normalize_embedding_vectors(train_emb_q1)
train_embeddings_q2_normalized = normalize_embedding_vectors(train_emb_q2)

test_embeddings_q1_normalized = normalize_embedding_vectors(test_emb_q1)
test_embeddings_q2_normalized = normalize_embedding_vectors(test_emb_q2)


In [30]:
train_cosine_similarity = cosine_similarity_between_embeddings(
    train_embeddings_q1_normalized,
    train_embeddings_q2_normalized
)

test_cosine_similarity = cosine_similarity_between_embeddings(
    test_embeddings_q1_normalized,
    test_embeddings_q2_normalized
)


In [31]:
train_abs_diff_mean = np.mean(np.abs(train_embeddings_q1_normalized - train_embeddings_q2_normalized), axis=1)
test_abs_diff_mean  = np.mean(np.abs(test_embeddings_q1_normalized - test_embeddings_q2_normalized), axis=1)

train_abs_diff_max = np.max(np.abs(train_embeddings_q1_normalized - train_embeddings_q2_normalized), axis=1)
test_abs_diff_max  = np.max(np.abs(test_embeddings_q1_normalized - test_embeddings_q2_normalized), axis=1)

train_abs_diff_l2 = np.linalg.norm(train_embeddings_q1_normalized - train_embeddings_q2_normalized, axis=1)
test_abs_diff_l2  = np.linalg.norm(test_embeddings_q1_normalized - test_embeddings_q2_normalized, axis=1)

train_prod_mean = np.mean(train_embeddings_q1_normalized * train_embeddings_q2_normalized, axis=1)
test_prod_mean  = np.mean(test_embeddings_q1_normalized * test_embeddings_q2_normalized, axis=1)

train_prod_max = np.max(train_embeddings_q1_normalized * train_embeddings_q2_normalized, axis=1)
test_prod_max  = np.max(test_embeddings_q1_normalized * test_embeddings_q2_normalized, axis=1)


In [None]:
# #X_train_bert = np.hstack([
#     train_cosine_similarity.reshape(-1, 1),
#     train_absolute_difference,
#     train_elementwise_product
# ]).astype(np.float32)

# X_test_bert = np.hstack([
#     test_cosine_similarity.reshape(-1, 1),
#     test_absolute_difference,
#     test_elementwise_product
# ]).astype(np.float32)


# print("X_train_bert shape:", X_train_bert.shape)
# print("X_test_bert shape:", X_test_bert.shape)


In [32]:
X_train_bert = np.column_stack([
    train_cosine_similarity,
    train_abs_diff_mean,
    train_abs_diff_max,
    train_abs_diff_l2,
    train_prod_mean,
    train_prod_max
]).astype(np.float32)

X_test_bert = np.column_stack([
    test_cosine_similarity,
    test_abs_diff_mean,
    test_abs_diff_max,
    test_abs_diff_l2,
    test_prod_mean,
    test_prod_max
]).astype(np.float32)

print("X_train_bert shape:", X_train_bert.shape)
print("X_test_bert shape:", X_test_bert.shape)


X_train_bert shape: (323432, 6)
X_test_bert shape: (80858, 6)


In [63]:
logistic_model_bert = LogisticRegression(
    solver="lbfgs",
    penalty="l2",
    C=0.5,
    max_iter=600,
    tol=1e-4,
    random_state=42

)

In [64]:
logistic_model_bert.fit(X_train_bert, y_train)

In [65]:
y_test_proba_bert = logistic_model_bert.predict_proba(X_test_bert)[:, 1]


In [66]:
thresholds_ = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

best_threshold_bert = None
best_f1_bert = -1

for threshold in thresholds_:
    f1 = evaluate_with_threshold(
        y_test,
        y_test_proba_bert,
        threshold
    )

    print("threshold:", threshold, "F1:", f1)

    if f1 > best_f1_bert:
        best_f1_bert = f1
        best_threshold_bert = threshold

print("\nBEST threshold:", best_threshold_bert)
print("BEST F1:", best_f1_bert)


threshold: 0.2 F1: 0.725976923567285
threshold: 0.25 F1: 0.7339225845778726
threshold: 0.3 F1: 0.7374960484901797
threshold: 0.35 F1: 0.7367488135399394
threshold: 0.4 F1: 0.73208008103438
threshold: 0.45 F1: 0.7230716532245508
threshold: 0.5 F1: 0.710477715854675
threshold: 0.55 F1: 0.6919161418341817
threshold: 0.6 F1: 0.6643828087788077

BEST threshold: 0.3
BEST F1: 0.7374960484901797


In [67]:
results_bert_logreg = evaluate_model(logistic_model_bert,X_test_bert,y_test,threshold=0.3)

In [68]:
print("LogLoss:", results_bert_logreg["log_loss"])
print("F1:", results_bert_logreg["f1"])
print("Confusion matrix:\n", results_bert_logreg["confusion_matrix"])
print("\nClassification report:\n", results_bert_logreg["classification_report"])


LogLoss: 0.42210850677204914
F1: 0.7374960484901797
Confusion matrix:
 [[34930 16075]
 [ 3024 26829]]

Classification report:
               precision    recall  f1-score   support

           0       0.92      0.68      0.79     51005
           1       0.63      0.90      0.74     29853

    accuracy                           0.76     80858
   macro avg       0.77      0.79      0.76     80858
weighted avg       0.81      0.76      0.77     80858



In [69]:
from sklearn.decomposition import PCA

In [70]:
pca = PCA(n_components=64, random_state=42)

In [71]:
train_embeddings_all = np.vstack([
    train_embeddings_q1_normalized,
    train_embeddings_q2_normalized
]).astype(np.float32)

In [72]:
pca.fit(train_embeddings_all)

In [73]:
train_q1_pca = pca.transform(train_embeddings_q1_normalized.astype(np.float32))
train_q2_pca = pca.transform(train_embeddings_q2_normalized.astype(np.float32))

test_q1_pca = pca.transform(test_embeddings_q1_normalized.astype(np.float32))
test_q2_pca = pca.transform(test_embeddings_q2_normalized.astype(np.float32))

In [74]:
train_cosine_similarity = cosine_similarity_between_embeddings(train_q1_pca, train_q2_pca)
test_cosine_similarity  = cosine_similarity_between_embeddings(test_q1_pca, test_q2_pca)

train_abs_diff = np.abs(train_q1_pca - train_q2_pca)
test_abs_diff  = np.abs(test_q1_pca - test_q2_pca)

train_prod = train_q1_pca * train_q2_pca
test_prod  = test_q1_pca * test_q2_pca


In [75]:
X_train_bert = np.hstack([
    train_cosine_similarity.reshape(-1, 1),
    train_abs_diff,
    train_prod
]).astype(np.float32)

X_test_bert = np.hstack([
    test_cosine_similarity.reshape(-1, 1),
    test_abs_diff,
    test_prod
]).astype(np.float32)

print("X_train_bert shape:", X_train_bert.shape)
print("X_test_bert shape:", X_test_bert.shape)


X_train_bert shape: (323432, 129)
X_test_bert shape: (80858, 129)


In [82]:
logistic_model_bert = LogisticRegression(
    solver="lbfgs",
    max_iter=800,
    C=0.7,
    random_state=42
)

In [83]:
logistic_model_bert.fit(X_train_bert, y_train)

In [84]:
y_test_proba_bert = logistic_model_bert.predict_proba(X_test_bert)[:, 1]

In [85]:
thresholds_ = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

best_threshold_bert = None
best_f1_bert = -1

for threshold in thresholds_:
    f1 = evaluate_with_threshold(
        y_test,
        y_test_proba_bert,
        threshold
    )

    print("threshold:", threshold, "F1:", f1)

    if f1 > best_f1_bert:
        best_f1_bert = f1
        best_threshold_bert = threshold

print("\nBEST threshold:", best_threshold_bert)
print("BEST F1:", best_f1_bert)

threshold: 0.2 F1: 0.7375014559525812
threshold: 0.25 F1: 0.746392028355514
threshold: 0.3 F1: 0.7516092493853082
threshold: 0.35 F1: 0.7548258138206739
threshold: 0.4 F1: 0.7563867534003548
threshold: 0.45 F1: 0.753829623944743
threshold: 0.5 F1: 0.7462509802666325
threshold: 0.55 F1: 0.7333804809052333
threshold: 0.6 F1: 0.7092251712328768

BEST threshold: 0.4
BEST F1: 0.7563867534003548


In [86]:
results_bert_logreg = evaluate_model(logistic_model_bert,X_test_bert,y_test,threshold=0.4)

In [87]:
print("LogLoss:", results_bert_logreg["log_loss"])
print("F1:", results_bert_logreg["f1"])
print("Confusion matrix:\n", results_bert_logreg["confusion_matrix"])
print("\nClassification report:\n", results_bert_logreg["classification_report"])

LogLoss: 0.39784918577695516
F1: 0.7563867534003548
Confusion matrix:
 [[38799 12206]
 [ 4272 25581]]

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.76      0.82     51005
           1       0.68      0.86      0.76     29853

    accuracy                           0.80     80858
   macro avg       0.79      0.81      0.79     80858
weighted avg       0.82      0.80      0.80     80858



In [101]:
X_train_bert = np.hstack([
    X_train_bert,
    cos_train.reshape(-1, 1),
    train_common_words.values.reshape(-1, 1)
]).astype(np.float32)

X_test_bert = np.hstack([
    X_test_bert,
    cos_test.reshape(-1, 1),
    test_common_words.values.reshape(-1, 1)
]).astype(np.float32)

print("X_train_bert shape:", X_train_bert.shape)
print("X_test_bert shape:", X_test_bert.shape)


X_train_bert shape: (323432, 133)
X_test_bert shape: (80858, 133)


In [102]:
logistic_model_bert = LogisticRegression(
    solver="lbfgs",
    max_iter=800,
    C=1.0,
    random_state=42
)

logistic_model_bert.fit(X_train_bert, y_train)


In [103]:
y_test_proba_bert = logistic_model_bert.predict_proba(X_test_bert)[:, 1]

In [104]:
thresholds_ = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

best_threshold_bert = None
best_f1_bert = -1

for threshold in thresholds_:
    f1 = evaluate_with_threshold(
        y_test,
        y_test_proba_bert,
        threshold
    )

    print("threshold:", threshold, "F1:", f1)

    if f1 > best_f1_bert:
        best_f1_bert = f1
        best_threshold_bert = threshold

print("\nBEST threshold:", best_threshold_bert)
print("BEST F1:", best_f1_bert)

threshold: 0.2 F1: 0.7415054101084638
threshold: 0.25 F1: 0.7514186270128252
threshold: 0.3 F1: 0.7588379418970949
threshold: 0.35 F1: 0.7626011786681339
threshold: 0.4 F1: 0.764040900776635
threshold: 0.45 F1: 0.7610768106250292
threshold: 0.5 F1: 0.7563857683965166
threshold: 0.55 F1: 0.7451751190334989
threshold: 0.6 F1: 0.7270292946784012

BEST threshold: 0.4
BEST F1: 0.764040900776635


In [105]:
results_bert_logreg = evaluate_model(logistic_model_bert,X_test_bert,y_test,threshold=0.4)

In [106]:
print("LogLoss:", results_bert_logreg["log_loss"])
print("F1:", results_bert_logreg["f1"])
print("Confusion matrix:\n", results_bert_logreg["confusion_matrix"])
print("\nClassification report:\n", results_bert_logreg["classification_report"])

LogLoss: 0.3875337546650748
F1: 0.764040900776635
Confusion matrix:
 [[39640 11365]
 [ 4373 25480]]

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.78      0.83     51005
           1       0.69      0.85      0.76     29853

    accuracy                           0.81     80858
   macro avg       0.80      0.82      0.80     80858
weighted avg       0.82      0.81      0.81     80858



#Висновок
У роботі було протестовано кілька підходів до задачі визначення схожості запитань.

Базові моделі дали прийнятний, але обмежений результат і були використані як відправна точка.

Використання Sentence-BERT embeddings суттєво покращило якість, однак повні векторні ознаки виявилися надто важкими для стабільного навчання.

Агрегування embeddingів спростило модель, але призвело до зниження якості.

Найкращі результати показала модель, яка поєднує Sentence-BERT embeddings зі зменшенням розмірності за допомогою PCA та додатковими лексичними ознаками (TF-IDF cosine similarity і кількість спільних слів).

Цей підхід забезпечив найнижчий LogLoss і найвищий F1 серед усіх протестованих моделей, зберігаючи стабільність і простоту реалізації.