In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# Sample documents and query
documents = [
    "Information retrieval is the process of obtaining information from a large repository",
    "BM25 is a probabilistic model for information retrieval",
    "Language models estimate the probability of a sequence of words",
    "This model is based on the vector space model for IR",
    "Dirichlet smoothing improves retrieval effectiveness in language models",
]

query = "probabilistic model for information retrieval"

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents)
query_vector = vectorizer.transform([query])

# Calculate Cosine Similarity
cosine_similarities = cosine_similarity(query_vector, doc_vectors).flatten()
vsm_ranking = np.argsort(-cosine_similarities)  # Ranking based on similarity scores

print("VSM Ranking (Cosine Similarity):", vsm_ranking)
print("VSM Scores:", cosine_similarities[vsm_ranking])

# Tokenize documents for BM25
tokenized_docs = [doc.split(" ") for doc in documents]
bm25 = BM25Okapi(tokenized_docs)

# Tokenize the query and calculate BM25 scores
query_tokens = query.split(" ")
bm25_scores = bm25.get_scores(query_tokens)
bm25_ranking = np.argsort(-bm25_scores)

print("BM25 Ranking:", bm25_ranking)
print("BM25 Scores:", bm25_scores[bm25_ranking])

VSM Ranking (Cosine Similarity): [1 3 0 4 2]
VSM Scores: [0.83745063 0.33232178 0.30631303 0.09360362 0.        ]
BM25 Ranking: [1 3 0 4 2]
BM25 Scores: [2.5208062  0.78137254 0.4912689  0.22284355 0.        ]


In [13]:
import numpy as np
from collections import Counter
import math

# Sample documents and query
documents = [
    "Information retrieval is the process of obtaining information.",
    "Information retrieval systems are used to find documents.",
    "Information and documents can be retrieved by using search engines.",
]
query = "retrieving information from documents"

# Tokenize documents and query
tokenized_docs = [doc.lower().split() for doc in documents]
tokenized_query = query.lower().split()

# BM25 Parameters
k1 = 1.5  # Controls term frequency saturation
b = 0.75  # Controls length normalization
avg_doc_len = np.mean([len(doc) for doc in tokenized_docs])


# Build term frequencies and document frequencies
def compute_bm25_scores(tokenized_docs, query, k1=1.5, b=0.75, avg_doc_len=None):
    # Calculate document frequencies for each word
    doc_freqs = Counter(word for doc in tokenized_docs for word in set(doc))
    N = len(tokenized_docs)  # Total number of documents
    scores = []

    for doc in tokenized_docs:
        score = 0
        doc_len = len(doc)
        term_freqs = Counter(doc)

        for term in query:
            if term in term_freqs:
                # Compute BM25 term weight
                idf = math.log(
                    (N - doc_freqs[term] + 0.5) / (doc_freqs[term] + 0.5) + 1
                )
                tf = term_freqs[term]
                term_score = idf * (
                    (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
                )
                score += term_score
        scores.append(score)

    return scores


# Compute BM25 scores
bm25_scores = compute_bm25_scores(tokenized_docs, tokenized_query, k1, b, avg_doc_len)

# Rank documents based on BM25 score
ranked_docs_bm25 = sorted(
    list(enumerate(bm25_scores)), key=lambda x: x[1], reverse=True
)
print("Ranked Documents (BM25):", ranked_docs_bm25)

Ranked Documents (BM25): [(2, 1.0422077980770672), (0, 0.138319370846119), (1, 0.138319370846119)]


In [11]:
from collections import defaultdict
import numpy as np

# Document word counts
doc_words = [
    ["information", "retrieval", "information", "retrieval", "process"],
    ["information", "retrieval", "systems", "documents", "find"],
    ["information", "documents", "retrieved", "search", "engines"],
]

# Vocabulary and query
vocab = set(word for doc in doc_words for word in doc)
query_words = ["information", "retrieval", "documents"]


# Dirichlet Smoothing Function
def dirichlet_smoothing(doc, query, alpha=2000):
    doc_length = len(doc)
    term_freqs = defaultdict(int)
    for word in doc:
        term_freqs[word] += 1

    # Calculate smoothed probability
    probabilities = []
    for word in query:
        term_prob = (term_freqs[word] + alpha * (1 / len(vocab))) / (doc_length + alpha)
        probabilities.append(term_prob)

    # Return log-probability for ranking
    return np.sum(np.log(probabilities))


# Rank documents based on Dirichlet-smoothed log-probability
scores = [dirichlet_smoothing(doc, query_words) for doc in doc_words]
ranked_docs_dirichlet = sorted(
    list(enumerate(scores)), key=lambda x: x[1], reverse=True
)
print("Ranked Documents (Dirichlet):", ranked_docs_dirichlet)

Ranked Documents (Dirichlet): [(0, -6.581244889861476), (1, -6.585694656785863), (2, -6.590184562058717)]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Sample Data: Create a DataFrame representing user-specific search queries and click history
data = {
    "query_id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "user_id": [101, 101, 101, 102, 102, 102, 103, 103, 103],
    "document_id": [1001, 1002, 1003, 1001, 1004, 1005, 1003, 1006, 1007],
    "relevance_score": [5, 3, 2, 4, 5, 1, 3, 2, 4],  # Target variable, to be predicted
    "click_count": [10,2,1,5,8,1,3,2,4,],  # Number of times a document was clicked
    "user_pref_score": [0.9,0.3,0.2,0.8,0.85,0.15,0.4,0.3,0.6,],  # Simulated user preference score
}

df = pd.DataFrame(data)A

# Feature Engineering
# Create a combined feature using click_count and user preference score
df["combined_score"] = df["user_pref_score"] * np.log1p(df["click_count"])

# Define features and target variable
X = df[
    ["click_count", "user_pref_score", "combined_score"]
]  # Use only non-leakage features
y = df["relevance_score"]  # Target variable (relevance score)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model: Learning-to-Rank using Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict ranking scores for test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

# Simulate ranking for a new user query
new_query_data = pd.DataFrame(
    {"click_count": [7, 1, 12], "user_pref_score": [0.75, 0.2, 0.85]}
)
new_query_data["combined_score"] = new_query_data["user_pref_score"] * np.log1p(
    new_query_data["click_count"]
)

# Predict and rank results
new_query_scores = model.predict(new_query_data)
new_query_data["predicted_rank_score"] = new_query_scores
new_query_data = new_query_data.sort_values(by="predicted_rank_score", ascending=False)
print("\nRanked Results for New Query:")
print(new_query_data[["click_count", "user_pref_score", "predicted_rank_score"]])

Mean Squared Error: 0.3869

Ranked Results for New Query:
   click_count  user_pref_score  predicted_rank_score
2           12             0.85                  4.91
0            7             0.75                  4.43
1            1             0.20                  1.81


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import wordnet
import nltk

# Download WordNet
# nltk.download("wordnet")

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))
df = pd.DataFrame({"title": newsgroups.data, "category": newsgroups.target})
df["category"] = df["category"].apply(lambda x: newsgroups.target_names[x])
df = df.head(2000)

In [2]:
def reformulate_query(query):
    expanded_query = set(query.split())
    for term in query.split():
        for syn in wordnet.synsets(term)[:3]:  # Get up to 3 synonyms
            expanded_query.add(syn.lemmas()[0].name().lower())
    return " ".join(expanded_query)


def facet_search(df, category=None):
    return df[df["category"].str.contains(category, case=False)] if category else df


def diversify_results(results, n=3):
    diversified = []
    for category in results["category"].unique():
        category_results = results[results["category"] == category].head(n)
        diversified += category_results.to_dict("records")
    return diversified


def search_engine(query, reformulate=False, diversify=False, facet=None):
    filtered_df = df

    if reformulate:
        query = reformulate_query(query)

    filtered_df = facet_search(filtered_df, category=facet)

    vectorizer = TfidfVectorizer(stop_words="english")
    vectors = vectorizer.fit_transform(filtered_df["title"])
    scores = cosine_similarity(vectorizer.transform([query]), vectors).flatten()

    filtered_df["score"] = scores
    sorted_results = filtered_df[filtered_df["score"] > 0].sort_values(
        by="score", ascending=False
    )

    return diversify_results(sorted_results) if diversify else sorted_results.head(5)


# Test the search engine
query = "Microsoft tech support needs new people"
print("Query Reformulation Example:")
reformulated_query = reformulate_query(query)
print(f"Original Query: {query}\nReformulated Query: {reformulated_query}\n")

print("Search Results with Query Reformulation and Faceted Search:")
results = search_engine(query, reformulate=True, facet="sci.electronics")
print(results[["title", "score"]].head())

print("\nSearch Results with Diversification:")
results_diversified = search_engine(query, diversify=True)
for result in results_diversified:
    print(result["title"], result["category"])

Query Reformulation Example:
Original Query: Microsoft tech support needs new people
Reformulated Query: Microsoft needs tech fresh support people technical_school motivation citizenry need raw new

Search Results with Query Reformulation and Faceted Search:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["score"] = scores


                                                  title     score
1970  \n\nAre you sure that he needs a two way conve...  0.145563
379   The subject line says it all. I'm working on a...  0.122619
718   Last week I asked for help in getting an old h...  0.088446
565   \n\n\n\n\n\n\nWHAT??!!!!\n\nYou can't remove i...  0.072802
407   There is a cartridge capping upgrade for older...  0.059586

Search Results with Diversification:

I got one from Microsoft tech support.
 comp.os.ms-windows.misc
This probably is in a FAQ somewhere, but....

I'm looking for Microsoft's internal speaker sound driver for Windows.

Should be at Microsoft's FTP site, but I can't remember the name of the site... comp.os.ms-windows.misc

Apparently, Microsoft came out with a new product: MS-Braille it is suppose 
to be "WYTIWIG".  :-)

No offense.
  comp.os.ms-windows.misc

     There's only one car that really fits your needs. It's spelled: rec.autos

To avoid paperwork associated with re-certification as a br

In [4]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

# Sample data: User-item interactions (ratings) and social network
ratings_data = {
    "user": [1, 1, 2, 2, 3, 3, 4, 4, 5],
    "item": [101, 102, 101, 103, 102, 104, 103, 105, 101],
    "rating": [5, 4, 4, 3, 5, 2, 4, 5, 5],
}

social_data = {"user": [1, 1, 2, 2, 3, 4, 5], "friend": [2, 3, 4, 5, 5, 5, 4]}

ratings_df = pd.DataFrame(ratings_data)
social_df = pd.DataFrame(social_data)

# 1. Construct User-Item Interaction Matrix
user_item_matrix = ratings_df.pivot(
    index="user", columns="item", values="rating"
).fillna(0)

# 2. Compute User Similarity (Collaborative Filtering)
user_similarity = cosine_similarity(user_item_matrix)

# 3. Construct Social Network Graph
G = nx.Graph()
for _, row in social_df.iterrows():
    G.add_edge(row["user"], row["friend"])

# 4. Social Influence Propagation (PageRank or similar algorithm)
social_influence = nx.pagerank(G)


# 5. Recommendation Function
def recommend(user_id, user_item_matrix, user_similarity, social_influence, top_n=2):
    user_idx = user_id - 1  # Adjust index for 0-based indexing
    user_ratings = user_item_matrix.loc[user_id]

    # User similarity score for the user
    sim_scores = user_similarity[user_idx]

    # Compute weighted recommendation score (Collaborative + Social)
    recommendation_score = (
        np.dot(sim_scores, user_item_matrix) * social_influence[user_id]
    )

    # Mask already rated items
    recommendation_score[user_ratings > 0] = -1

    # Get top N recommendations
    recommended_items = np.argsort(recommendation_score)[::-1][:top_n]

    return user_item_matrix.columns[recommended_items]


user_id = 1
recommended_items = recommend(
    user_id, user_item_matrix, user_similarity, social_influence
)
print(f"Recommended items for user {user_id}: {recommended_items}")

Recommended items for user 1: Index([103, 104], dtype='int64', name='item')


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import nltk

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

# Sample document collection (replace this with your dataset)
documents = [
    "The cat sat on the mat.",
    "Dogs are great companions.",
    "Cats and dogs are both popular pets.",
    "I love to watch movies on weekends.",
    "Reading books is a wonderful pastime.",
]

# Create a DataFrame for the documents
doc_df = pd.DataFrame({"document": documents})


# Function to clean and tokenize documents
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"\W+", " ", text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize
    tokens = [
        word for word in tokens if word not in stopwords.words("english")
    ]  # Remove stopwords
    return " ".join(tokens)


# Clean the documents
doc_df["cleaned"] = doc_df["document"].apply(clean_text)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doc_df["cleaned"])


# Pseudo-Relevance Feedback Expansion
def pseudo_relevance_feedback(query, tfidf_matrix, doc_df, top_n=2):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][
        ::-1
    ]  # Top N relevant documents

    # Retrieve top documents for query expansion
    top_docs = doc_df.iloc[top_indices]["cleaned"].values
    expanded_terms = set()

    for doc in top_docs:
        expanded_terms.update(doc.split())

    # Combine original query terms with expanded terms
    expanded_query = " ".join(set(query.split()).union(expanded_terms))
    return expanded_query


# WordNet-based Expansion
def wordnet_expansion(query):
    expanded_terms = set(query.split())
    for term in query.split():
        synonyms = wordnet.synsets(term)
        for syn in synonyms:
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name().lower())  # Add synonyms
    return " ".join(expanded_terms)


# Contextual Query Expansion
def contextual_expansion(query, tfidf_matrix, doc_df, top_n=2):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][
        ::-1
    ]  # Top N relevant documents

    # Retrieve top documents for query expansion
    top_docs = doc_df.iloc[top_indices]["cleaned"].values
    expanded_terms = set()

    for doc in top_docs:
        expanded_terms.update(doc.split())

    # Combine original query terms with expanded terms
    expanded_query = " ".join(set(query.split()).union(expanded_terms))
    return expanded_query


# Thesaurus-Based Expansion
def thesaurus_expansion(query, thesaurus):
    expanded_terms = set(query.split())
    for term in query.split():
        if term in thesaurus:
            expanded_terms.update(thesaurus[term])
    return " ".join(expanded_terms)


# Example Thesaurus (a simple dictionary)
thesaurus = {
    "cat": ["feline", "kitty", "pussycat"],
    "dog": ["canine", "puppy"],
    "great": ["excellent", "fantastic"],
    "love": ["adore", "like"],
}

# Example Query
input_query = "cats"
print("Original Query:", input_query)

# Apply Pseudo-Relevance Feedback
expanded_query_prf = pseudo_relevance_feedback(input_query, tfidf_matrix, doc_df)
print("Expanded Query (Pseudo-Relevance Feedback):", expanded_query_prf)

# Apply WordNet Expansion
expanded_query_wn = wordnet_expansion(input_query)
print("Expanded Query (WordNet):", expanded_query_wn)

# Apply Contextual Expansion
expanded_query_ce = contextual_expansion(input_query, tfidf_matrix, doc_df)
print("Expanded Query (Contextual):", expanded_query_ce)

# Apply Thesaurus-Based Expansion
expanded_query_tb = thesaurus_expansion(input_query, thesaurus)
print("Expanded Query (Thesaurus-Based):", expanded_query_tb)

# Function to retrieve documents based on expanded query
def retrieve_documents(expanded_query):
    query_vec = vectorizer.transform([expanded_query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-3:][::-1]  # Retrieve top 3 documents
    return doc_df.iloc[top_indices]

# Retrieve documents using expanded queries
print("\nDocuments for Pseudo-Relevance Feedback Expanded Query:")
print(retrieve_documents(expanded_query_prf)["document"])
print("\nDocuments for WordNet Expanded Query:")
print(retrieve_documents(expanded_query_wn)["document"])
print("\nDocuments for Contextual Expanded Query:")
print(retrieve_documents(expanded_query_ce)["document"])
print("\nDocuments for Thesaurus-Based Expanded Query:")
print(retrieve_documents(expanded_query_tb)["document"])

Original Query: cats
Expanded Query (Pseudo-Relevance Feedback): cats popular reading wonderful pastime dogs pets books
Expanded Query (WordNet): true_cat cast vomit khat big_cat hombre purge african_tea vomit_up retch puke cats arabian_tea qat regurgitate ct spew caterpillar computerized_axial_tomography chuck spue sick barf honk cat-o'-nine-tails kat quat computerized_tomography cat computed_axial_tomography computed_tomography upchuck disgorge throw_up guy bozo regorge be_sick
Expanded Query (Contextual): cats popular reading wonderful pastime dogs pets books
Expanded Query (Thesaurus-Based): cats

Documents for Pseudo-Relevance Feedback Expanded Query:
4    Reading books is a wonderful pastime.
2     Cats and dogs are both popular pets.
1               Dogs are great companions.
Name: document, dtype: object

Documents for WordNet Expanded Query:
0                  The cat sat on the mat.
2     Cats and dogs are both popular pets.
4    Reading books is a wonderful pastime.
Name: do

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
import random

# Sample data for demonstration
documents = [
    {
        "id": 1,
        "title": "Python Programming",
        "tags": ["python", "programming", "coding"],
    },
    {"id": 2, "title": "Learning Java", "tags": ["java", "programming"]},
    {
        "id": 3,
        "title": "Data Science with Python",
        "tags": ["data", "science", "python"],
    },
    {"id": 4, "title": "Web Development", "tags": ["html", "css", "web"]},
    {"id": 5, "title": "Machine Learning", "tags": ["machine learning", "data"]},
]


# Function to reformulate queries (example: synonyms)
def reformulate_query(query):
    synonyms = {
        "python": ["py", "python programming"],
        "java": ["javascript"],
        "data": ["information"],
        "web": ["website"],
    }
    for word in query.split():
        if word.lower() in synonyms:
            query += " " + " ".join(synonyms[word.lower()])
    return query


# Function for result diversification
def diversify_results(results):
    unique_tags = set(tag for doc in results for tag in doc["tags"])
    diversified_results = []
    for tag in unique_tags:
        filtered_docs = [doc for doc in results if tag in doc["tags"]]
        diversified_results.append(random.choice(filtered_docs))
    return diversified_results


# Faceted search based on tags
def faceted_search(query):
    filtered_docs = [doc for doc in documents if query.lower() in doc["title"].lower()]
    return diversify_results(filtered_docs)


# Example usage
original_query = "python"
reformulated_query = reformulate_query(original_query)
print(f"Reformulated Query: {reformulated_query}")

search_results = faceted_search(original_query)
print("Search Results:")
for result in search_results:
    print(f" - {result['title']} (Tags: {', '.join(result['tags'])})")

Reformulated Query: python py python programming
Search Results:
 - Python Programming (Tags: python, programming, coding)
 - Python Programming (Tags: python, programming, coding)
 - Data Science with Python (Tags: data, science, python)
 - Python Programming (Tags: python, programming, coding)
 - Data Science with Python (Tags: data, science, python)
