In [19]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [20]:
import re
from nltk.tokenize import sent_tokenize
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset  # For built-in dataset

In [21]:
# Step 1: Load the GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()  # Set the model to evaluation mode

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [23]:
# Ensure pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [24]:
# Step 2: Load a built-in dataset
def load_and_sample_dataset():
    """
    Load and sample data from a built-in dataset for text generation.
    Returns:
        str: A sample prompt from the dataset.
    """
    dataset = load_dataset("ag_news", split="train")  # News dataset for conversational context
    sample = dataset.shuffle(seed=42).select(range(1))  # Select a random sample
    return sample[0]["text"]

In [25]:
# Step 3: Preprocess text data
def preprocess_text(text):
    """
    Clean and tokenize text input.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove non-alphanumeric characters
    sentences = sent_tokenize(text)
    return sentences

In [26]:
# Step 4: Generate conversational responses
def generate_response(prompt, max_length=150, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.2):
    """
    Generate a conversational response using GPT-2.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        do_sample=True,
        early_stopping=True,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


In [27]:
# Step 5: Conversational system
def conversational_system():
    """
    Runs the multi-turn conversational system with built-in dataset support.
    """
    print("Conversational AI System (GPT-2)")
    print("Type 'exit' to end the conversation.\n")

    # Load a sample prompt from a dataset
    dataset_prompt = load_and_sample_dataset()
    print(f"Sample prompt loaded from dataset: {dataset_prompt}\n")

    conversation_context = f"Dataset Prompt: {dataset_prompt}\n"

    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Conversation ended. Goodbye!")
            break

        # Append user input to context
        conversation_context += f"User: {user_input}\nBot: "

        # Generate bot response
        bot_response = generate_response(conversation_context)
        print(f"Bot: {bot_response}")

        # Append bot response to context
        conversation_context += bot_response + "\n"


In [32]:
# Step 6: Main program
if __name__ == "__main__":
    conversational_system()

Conversational AI System (GPT-2)
Type 'exit' to end the conversation.

Sample prompt loaded from dataset: Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.

You: tell me more about bangladesh
Bot: Dataset Prompt: Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.
User: tell me more about bangladesh
Bot: พ้-ɐtʒnĪmār·́ (Bengal) [ edit ]

"Bangalore". "Sri Lanka" is an English word for India or Sri Lankan landmass which means 'land of abundance'. In this context, it was used as if Bangalore were on Indian soil with its huge population being estimated around 11 billion according TOI's figures from 2014."A large part...of our nation comes out here because
You: exit
Conversation ended. Goodbye!
