In [1]:
import sqlite3

# Function to create the similarity_scores table
def create_similarity_table():
    conn = sqlite3.connect("analysis.db")
    cursor = conn.cursor()
    
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS similarity_scores (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            method TEXT NOT NULL,
            score REAL NOT NULL
        )
    """)
    
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS final_conclusion (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            conclusion TEXT NOT NULL
        )
    """)

    conn.commit()
    conn.close()
    print("✅ SQLite tables for similarity analysis created successfully!")

# Run the function
create_similarity_table()


✅ SQLite tables for similarity analysis created successfully!


In [1]:
import sqlite3
def get_bible_text():
    conn = sqlite3.connect("bible.db")
    cursor = conn.cursor()
    
    cursor.execute("SELECT text FROM bible_verses")
    result = cursor.fetchall()
    
    conn.close()
    res = []
    for i in result:
        res.append(i[0])
    return res

#get_bible_text()

In [2]:
import sqlite3
def get_shakespeare_text():
    conn = sqlite3.connect("shakespeare.db")
    cursor = conn.cursor()
    
    cursor.execute("SELECT sentence FROM shakespeare_sentences")
    result = cursor.fetchall()
    
    conn.close()
    res = []
    for i in result:
        res.append(i[0])
    return res

# get_shakespeare_text()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_tfidf_similarity(bible_texts, shakespeare_texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(bible_texts + shakespeare_texts)

    bible_tfidf = tfidf_matrix[:len(bible_texts)]
    shakespeare_tfidf = tfidf_matrix[len(bible_texts):]

    cosine_scores = []
    total = len(bible_texts)
    
    for i, (b, s) in enumerate(zip(bible_tfidf, shakespeare_tfidf), start=1):
        cosine_scores.append(cosine_similarity(b, s)[0][0])
        
        # Print progress every 1000 steps
        if i % 1000 == 0 or i == total:
            percent_done = (i / total) * 100
            print(f"🔄 TF-IDF Similarity: Processed {i}/{total} pairs ({percent_done:.2f}% done)")

    return sum(cosine_scores) / len(cosine_scores)

bible_texts = get_bible_text()
shakespeare_texts = get_shakespeare_text()

# Run
average_cosine_score = compute_tfidf_similarity(bible_texts, shakespeare_texts)


🔄 TF-IDF Similarity: Processed 1000/31102 pairs (3.22% done)
🔄 TF-IDF Similarity: Processed 2000/31102 pairs (6.43% done)
🔄 TF-IDF Similarity: Processed 3000/31102 pairs (9.65% done)
🔄 TF-IDF Similarity: Processed 4000/31102 pairs (12.86% done)
🔄 TF-IDF Similarity: Processed 5000/31102 pairs (16.08% done)
🔄 TF-IDF Similarity: Processed 6000/31102 pairs (19.29% done)
🔄 TF-IDF Similarity: Processed 7000/31102 pairs (22.51% done)
🔄 TF-IDF Similarity: Processed 8000/31102 pairs (25.72% done)
🔄 TF-IDF Similarity: Processed 9000/31102 pairs (28.94% done)
🔄 TF-IDF Similarity: Processed 10000/31102 pairs (32.15% done)
🔄 TF-IDF Similarity: Processed 11000/31102 pairs (35.37% done)
🔄 TF-IDF Similarity: Processed 12000/31102 pairs (38.58% done)
🔄 TF-IDF Similarity: Processed 13000/31102 pairs (41.80% done)
🔄 TF-IDF Similarity: Processed 14000/31102 pairs (45.01% done)
🔄 TF-IDF Similarity: Processed 15000/31102 pairs (48.23% done)
🔄 TF-IDF Similarity: Processed 16000/31102 pairs (51.44% done)
🔄 TF

In [4]:
print(f"📊 Average Cosine Similarity Score: {average_cosine_score:.4f}")

📊 Average Cosine Similarity Score: 0.0238


In [7]:
from pymilvus import MilvusClient, DataType

CLUSTER_ENDPOINT = "http://localhost:19530"
TOKEN = "root:Milvus"

client = MilvusClient(
    uri=CLUSTER_ENDPOINT,
    token=TOKEN 
)




In [15]:
from pymilvus import connections, utility

# Connect to Milvus
connections.connect(alias="default", host="localhost", port="19530")

# List all collections
collections = utility.list_collections()
print(collections)

bible_collection = Collection("bible_verses")
shakespeare_collection = Collection("shakespeare_sentences")

# Get schema information
schema = bible_collection.schema

# Print field names
field_names = [field.name for field in schema.fields]
print("Fields in the collection:", field_names)

schema = shakespeare_collection.schema

# Print field names
field_names = [field.name for field in schema.fields]
print("Fields in the collection:", field_names)


['customized_setup_2', 'customized_setup_3', 'customized_setup_4', 'quick_setup', 'bible_verses', 'shakespeare_sentences', 'customized_setup_1']
Fields in the collection: ['id', 'ref', 'text', 'embedding']
Fields in the collection: ['id', 'ref', 'sentence', 'embedding']


In [None]:
import ollama
import numpy as np

# Function to get   embeddings using Ollama (with error handling)
def get_embedding(text):
    try:
        embedding_data = ollama.embeddings(model="mxbai-embed-large", prompt=text)
        embedding_data = embedding_data["embedding"]  # Extract embedding
        embedding_data = np.array(embedding_data)
        return embedding_data
    except Exception as e:
        print(f"❌ Unexpected error generating embedding: {e}")
    return None  # Return None if an error occurs


In [16]:
import numpy as np
import sqlite3
from pymilvus import Collection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import nltk
from nltk.util import ngrams

nltk.download('punkt')

# Load Bible & Shakespeare collections from Milvus
bible_collection = Collection("bible_verses")
shakespeare_collection = Collection("shakespeare_sentences")

# Load texts from both sources
bible_texts = get_bible_text()
shakespeare_texts = get_shakespeare_text()

# Compute TF-IDF Cosine Similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bible_texts + shakespeare_texts)

bible_tfidf = tfidf_matrix[:len(bible_texts)]
shakespeare_tfidf = tfidf_matrix[len(bible_texts):]

cosine_scores = cosine_similarity(bible_tfidf, shakespeare_tfidf)
average_cosine_score = np.mean(cosine_scores)

# Compute Jaccard Similarity
def jaccard_similarity(text1, text2):
    set1, set2 = set(text1.lower().split()), set(text2.lower().split())
    return len(set1 & set2) / len(set1 | set2)

jaccard_scores = [jaccard_similarity(b, s) for b, s in zip(bible_texts, shakespeare_texts)]
average_jaccard_score = np.mean(jaccard_scores)

# Compute N-Gram Overlap
def ngram_overlap(text1, text2, n=3):
    ngrams1, ngrams2 = set(ngrams(text1.split(), n)), set(ngrams(text2.split(), n))
    return len(ngrams1 & ngrams2) / max(len(ngrams1 | ngrams2), 1)

ngram_scores = [ngram_overlap(b, s) for b, s in zip(bible_texts, shakespeare_texts)]
average_ngram_score = np.mean(ngram_scores)

# Compute Levenshtein Distance Similarity
def levenshtein_similarity(text1, text2):
    return SequenceMatcher(None, text1, text2).ratio()

levenshtein_scores = [levenshtein_similarity(b, s) for b, s in zip(bible_texts, shakespeare_texts)]
average_levenshtein_score = np.mean(levenshtein_scores)

# Compute Vector Similarity using Milvus
def milvus_vector_similarity(text1, collection, field="text"):
    query_embedding = get_embedding(text1)
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param=search_params,
        limit=5,
        output_fields=[field]
    )
    return np.mean([hit.distance for hit in results[0]])

vector_scores = [milvus_vector_similarity(b, shakespeare_collection, "sentence") for b in bible_texts]
average_vector_score = np.mean(vector_scores)

# Aggregate all scores
scores = {
    "TF-IDF Cosine Similarity": average_cosine_score,
    "Jaccard Similarity": average_jaccard_score,
    "N-Gram Overlap": average_ngram_score,
    "Levenshtein Similarity": average_levenshtein_score,
    "Milvus Vector Similarity": average_vector_score
}

print("\n📊 **Aggregated Similarity Scores:**")
for method, score in scores.items():
    print(f"{method}: {score:.4f}")


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
2025-03-09 23:01:23,368 [ERROR][handler]: RPC error: [search], <MilvusException: (code=101, message=failed to search: collection not loaded[collection=456493849065998355])>, <Time:{'RPC start': '2025-03-09 23:01:23.364301', 'RPC error': '2025-03-09 23:01:23.368323'}> (decorators.py:140)


MilvusException: <MilvusException: (code=101, message=failed to search: collection not loaded[collection=456493849065998355])>