In [2]:
import gensim
from sklearn.datasets import fetch_20newsgroups
from elasticsearch import Elasticsearch
import numpy as np

# Load the Word2Vec model (make sure to download and specify the appropriate path)
model_path = 'E:\IR\GoogleNews-vectors-negative300.bin'
word_vectors_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

# Load the dataset (e.g., 20 Newsgroups dataset)
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the documents
preprocessed_docs = []
for doc in newsgroups.data:
    # Tokenize the document
    tokens = gensim.utils.simple_preprocess(doc.lower())
    # Remove stop words and stem the tokens
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    # Join the stemmed tokens back into a string
    preprocessed_doc = ' '.join(stemmed_tokens)
    preprocessed_docs.append(preprocessed_doc)

# Initialize Elasticsearch client
es = Elasticsearch("https://localhost:9200/", ca_certs="E:\Elastic-stack\elasticsearch-8.10.4\config\certs\http_ca.crt", basic_auth=("elastic", "xVG7=GFVFC1-aQ_bOtv7"))

# Delete the index if it already exists
index_name = 'vector_index'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index with appropriate mappings
index_mappings = {
    'mappings': {
        'properties': {
            'text': {
                'type': 'text'
            },
            'vector': {
                'type': 'dense_vector',
                'dims': 300  # Adjust the dimension to match your model
            }
        }
    }
}
es.indices.create(index=index_name, body=index_mappings)

# Iterate over preprocessed documents and generate vectors
for i, doc in enumerate(preprocessed_docs):
    # Split the preprocessed document into tokens
    tokens = doc.split()
    # Initialize an array to store the word vectors
    word_vectors = [word_vectors_model[token] for token in tokens if token in word_vectors_model]
    if word_vectors:
        # Compute the average vector for the document
        doc_vector = np.mean(word_vectors, axis=0)
        # Store the document and its vector in the Elasticsearch index
        es.index(index=index_name, id=i, body={'text': doc, 'vector': doc_vector.tolist()})

# Get a user query
user_query = "terrorism"

# Preprocess the user query
query_tokens = gensim.utils.simple_preprocess(user_query.lower())
stemmed_query_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in query_tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
preprocessed_query = ' '.join(stemmed_query_tokens)

# Convert the query to a vector (similar to the document vectorization step)
# Initialize an array to store the word vectors
query_word_vectors = []

# Compute vectors for query tokens and aggregate them
for token in preprocessed_query.split():
    if token in word_vectors_model:  # Check if the token is in the Word2Vec model's vocabulary
        word_vector = word_vectors_model[token]  # Retrieve the vector for the token
        query_word_vectors.append(word_vector)

# Calculate the query vector by averaging the word vectors
if query_word_vectors:
    query_vector = sum(query_word_vectors) / len(query_word_vectors)
    print(f'Query vector: {query_vector}')

# Use Elasticsearch to retrieve similar documents
search_body = {
    'query': {
        'script_score': {
            'query': {
                'match_all': {}
            },
            'script': {
                'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                'params': {
                    'query_vector': query_vector.tolist()
                }
            }
        }
    },
    '_source': {
        'includes': ['text']
    }
}
search_results = es.search(index=index_name, body=search_body)['hits']['hits']

# Print the top relevant documents
for i, hit in enumerate(search_results[:10]):
    print(f'{i+1}. {hit["_source"]["text"]}')


Query vector: [ 0.07617188  0.08007812 -0.10986328  0.26171875 -0.36328125 -0.05664062
 -0.20410156  0.03491211  0.31054688  0.19726562  0.24511719  0.14941406
 -0.07324219  0.40039062 -0.13769531  0.09423828 -0.03588867  0.3203125
 -0.01171875 -0.01757812 -0.12695312  0.03662109  0.00238037  0.12353516
  0.26953125  0.09912109  0.16210938 -0.46679688  0.48046875 -0.01220703
  0.25390625 -0.06542969 -0.1328125   0.16503906 -0.50390625 -0.11035156
 -0.25390625  0.2421875   0.31640625  0.09472656  0.16601562 -0.09814453
  0.03857422 -0.09863281  0.01373291  0.06445312  0.20703125 -0.12988281
 -0.28125     0.12304688  0.01245117  0.09716797 -0.13085938  0.19628906
  0.05615234 -0.3046875  -0.08642578 -0.07910156 -0.19042969 -0.09082031
 -0.09228516  0.02954102  0.18261719 -0.06030273  0.00405884 -0.21289062
 -0.14648438 -0.11132812  0.02490234 -0.03271484 -0.04321289  0.35546875
  0.15820312  0.01806641 -0.140625   -0.58203125  0.2734375  -0.13964844
 -0.07666016 -0.12792969  0.140625    

In [3]:
from elasticsearch import Elasticsearch
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
import numpy as np

# Initialize Elasticsearch client
es = Elasticsearch("https://localhost:9200/", ca_certs="E:\Elastic-stack\elasticsearch-8.10.4\config\certs\http_ca.crt", basic_auth=("elastic", "xVG7=GFVFC1-aQ_bOtv7"))

# Define your Elasticsearch index name
index_name = 'vector_index'

# Define your queries and relevance judgments
queries = [
    'cars',
    'computer',
    'terrorism',
    'india',
    'Health',
    'education'
]

relevance_judgments = [
    [1, 2, 3, 4, 5],  # Relevance judgments for the first query
    [2, 4, 6, 8, 10],  # Relevance judgments for the second query
    [1, 2, 5, 7, 9],  # Relevance judgments for the third query
    [3, 4, 6, 8, 10],  # Relevance judgments for the fourth query
    [1, 3, 5, 7, 9],  # Relevance judgments for the fifth query
    [2, 4, 6, 8, 10]  # Relevance judgments for the sixth query
]

# Initialize lists to store evaluation results
precision_results = []
recall_results = []
f1_results = []
ap_results = []
ndcg_results = []

# Iterate through the queries
for i, query in enumerate(queries):
    # Preprocess the query and retrieve the query_vector as shown in your reference

    # Search for similar documents using Elasticsearch
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }
    search_results = es.search(index=index_name, body=search_body)['hits']['hits']

    # Retrieve relevant documents using relevance judgments
    y_true = np.zeros(len(search_results))
    for relevance_index in relevance_judgments[i]:
        if relevance_index - 1 < len(y_true):
            y_true[relevance_index - 1] = 1

    # Calculate IR metrics
    y_pred = np.zeros(len(search_results))
    for j, hit in enumerate(search_results):
        if j in relevance_judgments[i]:
            y_pred[j] = 1
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    ap = average_precision_score(y_true, y_pred)
    ndcg = ndcg_score([y_true], [y_pred])

    # Append results to lists
    precision_results.append(precision)
    recall_results.append(recall)
    f1_results.append(f1)
    ap_results.append(ap)
    ndcg_results.append(ndcg)

    # Print the results
    print(f'Query: {query}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1: {f1:.4f}')
    print(f'MAP: {ap:.4f}')
    print(f'NDCG: {ndcg:.4f}')
    print()

    # Print relevant documents
    print(f'Similar documents for query: {query}')
    for hit in search_results:
        print(f'Document id: {hit["_id"]}, Text: {hit["_source"]["text"]}')
    print()


Query: cars
Precision: 0.8000
Recall: 0.8000
F1: 0.8000
MAP: 0.7400
NDCG: 0.9082

Similar documents for query: cars
Document id: 10647, Text: lfoard hopper virginia edu lawrenc foard subject bad press islam reciev organ itc uva commun access unix internet project line articl buddha du cc iastat edu buddha iastat edu scott vann write recent read articl local paper written islam person upset wai islam portrai western media terrorist action take place middl east plai islam terrorist serbian terrorist attack croation christian terrorist terrorist tri explain close friend believ press islam ti violenc time hear thing like valu human life like wonder suggest chang imag help friend hype appreci suggest comment mail interest hear right press easili pictur crime scene perpetr christian terrorist countri convinc talk victim christian terror brutal act terror inspir christian propoganda recent commit campu simpl religi extremist religion valu human life christian islam fundamentalist advanc relig