In [1]:
!pip install transformers datasets accelerate


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [3]:
from datasets import load_dataset, Dataset
import json

with open('/content/two_nation_theory_dataset.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)

In [15]:
dataset[0]

{'text': 'Yasir: What was the Two-Nation Theory?\nAI: The Two-Nation Theory stated that Muslims and Hindus were two distinct nations with their own customs, religion, and traditions, and therefore Muslims should have a separate homeland.\n'}

In [25]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    encoding = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
# tokenized = tokenized.remove_columns(["text"])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [26]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [27]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir = "./Historical_GPT_finetuned",
     report_to="none",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    warmup_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,5.1629
20,1.0685
30,0.5572
40,0.3324
50,0.1773
60,0.1351
70,0.0674
80,0.0401
90,0.0595
100,0.0374


TrainOutput(global_step=750, training_loss=0.1233032022913297, metrics={'train_runtime': 303.3965, 'train_samples_per_second': 4.944, 'train_steps_per_second': 2.472, 'total_flos': 97984512000000.0, 'train_loss': 0.1233032022913297, 'epoch': 3.0})

In [29]:
trainer.save_model("./historical-gpt2-finetuned")
tokenizer.save_pretrained("./historical-gpt2-finetuned")


('./historical-gpt2-finetuned/tokenizer_config.json',
 './historical-gpt2-finetuned/special_tokens_map.json',
 './historical-gpt2-finetuned/vocab.json',
 './historical-gpt2-finetuned/merges.txt',
 './historical-gpt2-finetuned/added_tokens.json')

# **Push Model to HuggingFace**

In [None]:
from huggingface_hub import login

# You'll be prompted for your Hugging Face token (get it from your HF profile > Settings > Access Tokens)
# login(token="hugging face token")


In [33]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_dir = "./historical-gpt2-finetuned"

model = GPT2LMHeadModel.from_pretrained(model_dir)
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)


In [34]:
model.push_to_hub("historical-gpt2-finetuned")       # Will create a repo at https://huggingface.co/YOUR_USERNAME/yasir-gpt2-finetuned
tokenizer.push_to_hub("historical-gpt2-finetuned")


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jamyasir/historical-gpt2-finetuned/commit/bd75aebdf23a041e1fb63b9d1c24793f9a9b38b0', commit_message='Upload tokenizer', commit_description='', oid='bd75aebdf23a041e1fb63b9d1c24793f9a9b38b0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jamyasir/historical-gpt2-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='jamyasir/historical-gpt2-finetuned'), pr_revision=None, pr_num=None)

# **Use the FineTuned Model**

In [8]:
from transformers import pipeline

custom_generator = pipeline("text-generation", model="./historical-gpt2-finetuned", tokenizer="./historical-gpt2-finetuned")

prompt = "How did cultural differences between Hindus and Muslims contribute to the Two-Nation Theory?"
print(custom_generator(prompt, max_length=100, do_sample=True)[0]['generated_text'])


In [11]:
from transformers import pipeline

generator = pipeline("text-generation", model="jamyasir/historical-gpt2-finetuned")

prompt = "Yasir: Which events led to the failure of Hindu-Muslim unity?\nAI:"
output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)[0]['generated_text']
print(output)



Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Yasir: Which events led to the failure of Hindu-Muslim unity?
AI: The Two-Nation Theory stated that Muslims and Hindus were two distinct nations with their own customs, religion, and traditions, and therefore Muslims should have a separate homeland.

