# pseudolabeling


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical

# Load MNIST dataset
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Preprocess data and split into labeled and unlabeled data
train_images = train_images.reshape((60000, 28, 28, 1)).astype('float32') / 255
test_images = test_images.reshape((10000, 28, 28, 1)).astype('float32') / 255

X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(train_images, train_labels, test_size=0.5, random_state=42)

# Define and train the initial model on labeled data
model = Sequential([
    Flatten(input_shape=(28, 28, 1)),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_labeled, y_labeled, epochs=5, batch_size=32, validation_split=0.2)

# Generate pseudo-labels for the test data using the trained model
pseudo_labels = model.predict(test_images)
pseudo_labels = np.argmax(pseudo_labels, axis=1)

# Retrain the model on combined labeled and pseudo-labeled data
X_combined = np.concatenate([X_labeled, test_images])
y_combined = np.concatenate([y_labeled, pseudo_labels])

model.fit(X_combined, y_combined, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on the actual test set
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print(f"Test accuracy with pseudo-labeling: {test_acc}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 1s - loss: 0.1141 - accuracy: 0.9709 - 645ms/epoch - 2ms/step
Test accuracy with pseudo-labeling: 0.9708999991416931


# search quality metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming 'model' is the trained model from the previous example

# Making predictions on the test set
predictions = model.predict(test_images)
predicted_labels = np.argmax(predictions, axis=1)

# Calculating evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')

# Displaying evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9709
Precision: 0.9711697838943246
Recall: 0.9709
F1 Score: 0.970900654112268


# improving the quality of relevance markup
Enhancing the accuracy and relevance of annotated or labeled data is a necessary step towards improving the quality of relevance markup, particularly in the context of search engines or information retrieval.
The following techniques can be used:
#### Improved Annotation Guidelines:
Having clear rules ensures that annotators fully comprehend the task, which produces annotations that are more accurate and consistent.
####  Quality Control Measures:
Putting quality control measures in place while the annotation process is underway and Using several annotators to independently label the same data and gauge their agreement helps to spot errors or conflicts.
#### Active Learning
The performance of the model is enhanced with less annotation work when active learning techniques are used to intelligently choose samples for annotation, train a preliminary model on a small labeled dataset, and then leverage this model to predict the relevance of unlabeled data.


In [None]:
max_iterations = 10  # Set a maximum number of iterations for the active learning loop
iteration = 0

while len(X_unlabeled) > 0 and iteration < max_iterations:
    iteration += 1

    # Make predictions on the unlabeled data
    unlabeled_predictions = model.predict(X_unlabeled)

    # Calculate uncertainty using entropy
    uncertainty = -np.sum(unlabeled_predictions * np.log(unlabeled_predictions + 1e-10), axis=1)

    # Choose samples with highest uncertainty for annotation
    num_samples_to_annotate = min(100, len(X_unlabeled))  # Annotate 100 samples or remaining if less
    indices_to_annotate = np.argsort(uncertainty)[-num_samples_to_annotate:]

    # Add the selected samples to the labeled dataset
    pseudo_labels = np.argmax(unlabeled_predictions, axis=1)
    X_labeled = np.concatenate((X_labeled, X_unlabeled[indices_to_annotate]))
    y_labeled = np.concatenate((y_labeled, pseudo_labels))

    # Remove the annotated samples from the unlabeled dataset
    X_unlabeled = np.delete(X_unlabeled, indices_to_annotate, axis=0)

    # Retrain the model with the updated labeled dataset
    model.fit(X_labeled, y_labeled, epochs=2, batch_size=32, validation_split=0.2)



Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [6]:
import numpy as np

# Relevance scores for Google and Yandex search results
#the relevance scores were results were achieved from http://www.analyzethis.ru/
relevance_scores_google = [0.914, 0.983, 0.791, 0.584, 0.564, 0.769, 0.985, 0.834, 0.998, 0.428]
relevance_scores_yandex = [0.998, 0.939, 1.000, 0.943, 0.938, 0.953, 0.719, 0.984, 0.993, 0.926]

# Function to calculate DCG (Discounted Cumulative Gain)
def calculate_dcg(relevance_scores):
    # DCG calculation formula: DCG = rel_1 + sum(rel_i / log2(i+1)) for i in range(1, len(relevance_scores))
    dcg = relevance_scores[0] + sum(score / np.log2(i + 2) for i, score in enumerate(relevance_scores[1:]))
    return dcg

# Function to calculate PFound
def calculate_pfound(relevance_scores, p_break=0.15):
    # PFound calculation logic
    pfound = 0
    num_relevant = sum(1 for score in relevance_scores if score > 0)  # Count the number of relevant documents
    for i, rel_score in enumerate(relevance_scores):
        pfound += ((1 - p_break) ** i) * (rel_score > 0)
    pfound *= 1 / num_relevant  # Normalize by the number of relevant documents
    return pfound

# Calculate DCG for Google and Yandex
dcg_google = calculate_dcg(relevance_scores_google)
dcg_yandex = calculate_dcg(relevance_scores_yandex)

# Calculate PFound for Google and Yandex
pfound_google = calculate_pfound(relevance_scores_google)
pfound_yandex = calculate_pfound(relevance_scores_yandex)

# Displaying calculated search quality metrics
print(f"DCG for Google: {dcg_google}")
print(f"DCG for Yandex: {dcg_yandex}")
print(f"PFound for Google: {pfound_google}")
print(f"PFound for Yandex: {pfound_yandex}")

# Calculate the t-statistic and p-value using a paired t-test
t_statistic, p_value = ttest_rel(relevance_scores_google, relevance_scores_yandex)

# Assuming alpha (significance level) is set to 0.05
alpha = 0.05

# Print the t-statistic, p-value, and decision based on significance level
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")

if p_value < alpha:
    print("Reject null hypothesis: Google relevance is statistically significantly better than Yandex relevance.")
else:
    print("Fail to reject null hypothesis: No significant difference in relevance between Google and Yandex.")


DCG for Google: 4.3009956870695225
DCG for Yandex: 4.988198456502561
PFound for Google: 0.5354170637728515
PFound for Yandex: 0.5354170637728515
t-statistic: -2.161842716615156
p-value: 0.058890539530887895
Fail to reject null hypothesis: No significant difference in relevance between Google and Yandex.
