In [3]:
import wikipediaapi
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# ---------------------------
# 1️⃣ Extract Wikipedia Text
# ---------------------------
wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='Chandan-WikiBot/1.0 (https://github.com/chandan11248)'
)

topics = ['Physics', 'Mathematics', 'Biology']
corpus = ""

for topic in topics:
    page = wiki.page(topic)
    if page.exists():
        corpus += page.text[:10000] + " "  # first 10k chars per topic

print("Corpus length:", len(corpus))

# ---------------------------
# 2️⃣ Tokenize & Prepare Sequences
# ---------------------------
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
words = corpus.split()

for i in range(1, len(words)):
    n_gram_seq = words[:i+1]
    seq = tokenizer.texts_to_sequences([" ".join(n_gram_seq)])[0]
    input_sequences.append(seq)

max_seq_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

X = input_sequences[:,:-1]
y = input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# ---------------------------
# 3️⃣ Build LSTM Model
# ---------------------------
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.build(input_shape=(None, 56))
model.summary()


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# ---------------------------
# 4️⃣ Train Model
# ---------------------------
print("Training LSTM... this may take a while")
model.fit(X, y, epochs=20, batch_size=128)

# ---------------------------
# 5️⃣ Mini QA Bot Function
# ---------------------------
def answer_question(seed_text, next_words=50):
    """
    Generates an answer based on Wikipedia corpus using LSTM.
    seed_text: the question or starting text
    next_words: number of words to generate
    """
    result = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([result])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted_index = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        
        # get word from index
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
                
        result += " " + output_word
    return result

# ---------------------------
# 6️⃣ Ask a Question
# ---------------------------
question = "What is physics?"
answer = answer_question(question, next_words=50)
print("\nQuestion:", question)
print("\nAnswer generated:\n", answer)

Corpus length: 30003


2025-10-24 18:26:25.334163: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-10-24 18:26:25.334187: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-10-24 18:26:25.334190: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.88 GB
2025-10-24 18:26:25.334226: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-24 18:26:25.334234: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Training LSTM... this may take a while
Epoch 1/20


2025-10-24 18:26:25.800573: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 787ms/step - accuracy: 0.0586 - loss: 6.7660
Epoch 2/20
[1m23/35[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m9s[0m 785ms/step - accuracy: 0.0470 - loss: 6.1896 

KeyboardInterrupt: 