# псевдомаркировка


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, ZeroPadding2D, Input
from tensorflow.keras.applications import VGG16
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers

# Fetch the MNIST train/test splits.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()


def _pad_and_scale(images):
    """Zero-pad 2 pixels on each spatial side, add a channel axis, divide by 255."""
    padded = np.pad(images, ((0, 0), (2, 2), (2, 2)), mode='constant')
    return padded[..., np.newaxis].astype('float32') / 255


# Pad the images to 32x32 (the size fed to VGG16 below) and rescale them.
train_images = _pad_and_scale(train_images)
test_images = _pad_and_scale(test_images)

# Keep half of the training set as labeled data; the other half plays the
# role of the unlabeled pool.
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    train_images,
    train_labels,
    test_size=0.5,
    random_state=42,
)

# Grayscale 32x32 input.
input_shape = (32, 32, 1)
input_layer = Input(shape=input_shape)

# The ImageNet-pretrained base is fed 3 channels, so the single grayscale
# channel is repeated three times along the last axis.
stacked_input = tf.keras.layers.Concatenate(axis=-1)([input_layer] * 3)

# VGG16 convolutional base with ImageNet weights and no classification head;
# its weights stay frozen so only the new head is trained.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_tensor=stacked_input,
)
base_model.trainable = False

# New classification head: flatten -> 128-unit ReLU -> 10-way softmax.
flatten_layer = Flatten()(base_model.output)
dense_layer = Dense(128, activation='relu')(flatten_layer)
output_layer = Dense(10, activation='softmax')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# Integer class labels are used directly, hence the sparse loss.
model.compile(
    optimizer=optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the initial model on the labeled half only.
model.fit(X_labeled, y_labeled, epochs=5, batch_size=32, validation_split=0.2)

# Record the supervised-only baseline so the effect of pseudo-labeling can
# actually be measured against it.
baseline_loss, baseline_acc = model.evaluate(test_images, test_labels, verbose=2)
print(f"Test accuracy before pseudo-labeling: {baseline_acc}")

# Generate pseudo-labels for the held-out unlabeled pool. The test set must
# not be used here: retraining on test images and then evaluating on those
# same images would make the reported test accuracy unreliable.
pseudo_labels = model.predict(X_unlabeled)
pseudo_labels = np.argmax(pseudo_labels, axis=1)

# Combine the labeled data with the pseudo-labeled pool.
X_combined = np.concatenate([X_labeled, X_unlabeled])
y_combined = np.concatenate([y_labeled, pseudo_labels])

# Keras takes the validation split from the LAST samples, before shuffling.
# Without this permutation the validation set would consist solely of
# pseudo-labeled samples.
shuffle_order = np.random.default_rng(42).permutation(len(X_combined))
X_combined = X_combined[shuffle_order]
y_combined = y_combined[shuffle_order]

# Retrain the model on combined labeled and pseudo-labeled data.
model.fit(X_combined, y_combined, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on the actual test set.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print(f"Test accuracy with pseudo-labeling: {test_acc}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 121s - loss: 0.1160 - accuracy: 0.9618 - 121s/epoch - 387ms/step
Test accuracy with pseudo-labeling: 0.9617999792098999


# вывод
Псевдомаркировка использует предсказания модели на немеченых данных, чтобы дополнить набор размеченных данных. После дообучения с псевдометками точность на тестовой выборке составила 96,18 %. Чтобы подтвердить, что метод улучшает обобщающую способность модели на новых данных, эту величину нужно сравнить с точностью модели, обученной только на размеченных данных.

# улучшение качества разметки релевантности
Повышение точности и релевантности аннотированных или маркированных данных — необходимый шаг к улучшению качества разметки релевантности, особенно в контексте поисковых систем или поиска информации.
Можно использовать следующие методы:
#### Улучшенные правила аннотирования:
Наличие четких правил гарантирует, что аннотаторы полностью понимают задачу, что позволяет получить более точные и последовательные аннотации.
#### Меры контроля качества:
Принятие мер контроля качества во время процесса аннотирования и использование нескольких аннотаторов для независимой маркировки одних и тех же данных и оценки их согласия помогает выявить ошибки или конфликты.
#### Активное обучение
Методы активного обучения позволяют повысить производительность модели при меньших затратах на аннотирование: предварительная модель обучается на небольшом наборе помеченных данных, а затем используется для прогнозирования релевантности немеченых данных и интеллектуального выбора образцов для аннотирования.


In [None]:
max_iterations = 10  # Upper bound on the number of active-learning rounds
samples_per_iteration = 100  # Annotation budget per round
iteration = 0

while len(X_unlabeled) > 0 and iteration < max_iterations:
    iteration += 1

    # Class probabilities for every sample still in the unlabeled pool.
    unlabeled_predictions = model.predict(X_unlabeled)

    # Predictive entropy per sample; the small epsilon avoids log(0).
    uncertainty = -np.sum(unlabeled_predictions * np.log(unlabeled_predictions + 1e-10), axis=1)

    # Pick the samples the model is least sure about (or all that remain).
    num_samples_to_annotate = min(samples_per_iteration, len(X_unlabeled))
    indices_to_annotate = np.argsort(uncertainty)[-num_samples_to_annotate:]

    # "Annotate" the selected samples with their ground-truth labels. The
    # model's own predictions are least reliable exactly on these samples,
    # so they must not be used as labels. Only the selected labels are added,
    # which keeps X_labeled and y_labeled the same length.
    # The new samples are prepended because Keras takes the validation split
    # from the end of the arrays; appended samples would never be trained on.
    X_labeled = np.concatenate((X_unlabeled[indices_to_annotate], X_labeled))
    y_labeled = np.concatenate((y_unlabeled[indices_to_annotate], y_labeled))

    # Remove the annotated samples (and their labels) from the unlabeled pool
    # so the two arrays stay aligned.
    X_unlabeled = np.delete(X_unlabeled, indices_to_annotate, axis=0)
    y_unlabeled = np.delete(y_unlabeled, indices_to_annotate, axis=0)

    # Retrain the model with the updated labeled dataset.
    model.fit(X_labeled, y_labeled, epochs=2, batch_size=32, validation_split=0.2)



Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2

In [None]:
import numpy as np

# Relevance scores for Google and Yandex search results, one per position.
# The scores were obtained from http://www.analyzethis.ru/
relevance_scores_google = [
    0.914, 0.983, 0.791, 0.584, 0.564,
    0.769, 0.985, 0.834, 0.998, 0.428,
]
relevance_scores_yandex = [
    0.998, 0.939, 1.000, 0.943, 0.938,
    0.953, 0.719, 0.984, 0.993, 0.926,
]

def calculate_dcg(relevance_scores):
    """Return the Discounted Cumulative Gain of a ranked list.

    Uses the form DCG = rel_1 + sum(rel_k / log2(k)) for k = 2..n, where k is
    the 1-based rank; the first two positions are therefore not discounted.

    Args:
        relevance_scores: Sequence of relevance values in ranking order.

    Returns:
        The DCG as a float; 0.0 for an empty sequence.
    """
    # len() rather than truthiness so NumPy arrays are accepted as well.
    if len(relevance_scores) == 0:
        return 0.0
    first, *rest = relevance_scores
    discounted = sum(score / np.log2(rank) for rank, score in enumerate(rest, start=2))
    return float(first + discounted)

def calculate_pfound(relevance_scores, p_break=0.15):
    """Return the pFound score of a ranked result list.

    pFound models a user who reads results from the top and stops either
    because the current result satisfied them or because they gave up:

        pFound  = sum_i pLook_i * pRel_i
        pLook_1 = 1
        pLook_i = pLook_{i-1} * (1 - pRel_{i-1}) * (1 - p_break)

    The previous implementation only checked whether each score was positive
    and normalised by the number of such scores, so every list of positive
    scores got the same value, and a list with no positive score raised
    ZeroDivisionError.

    Args:
        relevance_scores: Relevance values in ranking order. Each value is
            used directly as the probability that the result satisfies the
            user, so values are expected to lie in [0, 1].
        p_break: Probability that the user abandons the list after an
            unsatisfying result.

    Returns:
        The pFound score as a float; 0.0 for an empty sequence.
    """
    pfound = 0.0
    p_look = 1.0  # Probability that the user reaches the current position
    for p_rel in relevance_scores:
        pfound += p_look * p_rel
        # The next position is seen only if this result did not satisfy the
        # user and the user did not give up.
        p_look *= (1.0 - p_rel) * (1.0 - p_break)
    return float(pfound)

# DCG for each search engine (Google first, then Yandex).
dcg_google, dcg_yandex = (
    calculate_dcg(scores)
    for scores in (relevance_scores_google, relevance_scores_yandex)
)

# PFound for each search engine, in the same order.
pfound_google, pfound_yandex = (
    calculate_pfound(scores)
    for scores in (relevance_scores_google, relevance_scores_yandex)
)

# Report the search quality metrics, one per line.
print(
    f"DCG for Google: {dcg_google}",
    f"DCG for Yandex: {dcg_yandex}",
    f"PFound for Google: {pfound_google}",
    f"PFound for Yandex: {pfound_yandex}",
    sep="\n",
)

# ttest_rel is not imported anywhere else in this cell; without this import
# the call below fails with NameError on a fresh kernel.
from scipy.stats import ttest_rel

# Paired (dependent-samples) two-sided t-test: the scores are compared
# position by position, Google minus Yandex.
t_statistic, p_value = ttest_rel(relevance_scores_google, relevance_scores_yandex)

# Significance level
alpha = 0.05

# Print the t-statistic, p-value, and decision based on significance level
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")

if p_value < alpha:
    # The test is two-sided, so the direction of the difference has to be
    # read from the sign of the statistic: positive means Google's scores are
    # higher on average, negative means Yandex's are.
    better, worse = ("Google", "Yandex") if t_statistic > 0 else ("Yandex", "Google")
    print(f"Reject null hypothesis: {better} relevance is statistically significantly better than {worse} relevance.")
else:
    print("Fail to reject null hypothesis: No significant difference in relevance between Google and Yandex.")


DCG for Google: 4.3009956870695225
DCG for Yandex: 4.988198456502561
PFound for Google: 0.5354170637728515
PFound for Yandex: 0.5354170637728515
t-statistic: -2.161842716615156
p-value: 0.058890539530887895
Fail to reject null hypothesis: No significant difference in relevance between Google and Yandex.
