In [7]:
import numpy as np

# Step 1: Small dataset
documents = [
    "Book: The Great Gatsby, Genre: Fiction",
    "Book: To Kill a Mockingbird, Genre: Literature",
    "Book: 1984, Genre: Horror",
    "Book: Ghost in the machine, Genre: Philosphy"
]

# Step 2: Simple embedding function (word frequency)
def create_embedding(text):
    # Split text into words, lowercase
    words = text.lower().split()
    # Common words for vector (fixed vocabulary for simplicity)
    vocab = ['book', 'genre', 'fiction', 'dystopian', 'great', 'gatsby', 'mockingbird', '1984']
    # Create vector: count occurrences of each vocab word
    vector = np.zeros(len(vocab))
    for i, word in enumerate(vocab):
        vector[i] = words.count(word)
    # Normalize vector
    norm = np.linalg.norm(vector) or 1  # Avoid division by zero
    return vector / norm

# Create embeddings for all documents
doc_vectors = [create_embedding(doc) for doc in documents]

# Step 3: Process query
query = "What is the genre of 1984?"
query_vector = create_embedding(query)

print(query_vector)
# Step 4: Retrieve most relevant document (cosine similarity)
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) or 1)

similarities = [cosine_similarity(query_vector, doc_vec) for doc_vec in doc_vectors]
relevant_doc_index = np.argmax(similarities)
relevant_doc = documents[relevant_doc_index]

# Step 5: Generate answer (simple text parsing)
def generate_answer(query, doc):
    if "genre" in query.lower():
        # Extract genre from document (e.g., "Genre: Dystopian Fiction")
        parts = doc.split("Genre: ")
        if len(parts) > 1:
            return parts[1].strip()
    return "I don't know."

answer = generate_answer(query, relevant_doc)
print(f"Question: {query}")
print(f"Answer: {answer}")

query = "What is the genre of Harry Potter"
answer = generate_answer(query, relevant_doc)
print(f"Question: {query}")
print(f"Answer: {answer}")

query = "What is the genre of Ghost in the machine"
answer = generate_answer(query, relevant_doc)
print(f"Question: {query}")
print(f"Answer: {answer}")

Question: What is the genre of 1984?
Answer: Fiction
Question: What is the genre of Harry Potter
Answer: Fiction
Question: What is the genre of Ghost in the machine
Answer: Fiction
