In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: Traceback (most recent call last):
  File "/Users/deepanshu.kandpal/Library/Caches/pypoetry/virtualenvs/llm-HD5

# Generate dummy data

In [1]:
import random

# Dummy dataset generation for film reviews
def generate_film_reviews(num_reviews=100, max_review_length=200):
    positive_adjectives = ["amazing", "fantastic", "captivating", "outstanding", "excellent"]
    negative_adjectives = ["disappointing", "boring", "predictable", "mediocre", "unimpressive"]
    movies = ["The Midnight Star", "Dreamscape", "Eternal Echo", "Lost Horizon", "Whispering Shadows"]

    reviews = []

    for _ in range(num_reviews):
        movie = random.choice(movies)
        rating = random.randint(1, 10)
        adjective = random.choice(positive_adjectives) if rating > 5 else random.choice(negative_adjectives)
        review_text = f"{movie} is {adjective}! I would give it a {rating}/10. "
        review_text += " ".join(["This movie", "exceeded my expectations.", "Highly recommended!", "A must-watch!"] * random.randint(1, 3))
        review_text += "\n\n"

        # Trim the review if it exceeds the max length
        review_text = review_text[:max_review_length]

        reviews.append(review_text)

    return reviews

# Save the dummy dataset to a file
dummy_reviews = generate_film_reviews()
dataset_path = "film_reviews_dataset.txt"

with open(dataset_path, "w", encoding="utf-8") as file:
    file.writelines(dummy_reviews)

print(f"Dummy dataset saved to {dataset_path}")


Dummy dataset saved to film_reviews_dataset.txt


# Train dummy model

In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load your custom film reviews dataset
dataset_path = "./movie_reviews_dataset/reviews_1.txt"
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./film_reviews_fine_tuned_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./film_reviews_fine_tuned_v2")
tokenizer.save_pretrained("./film_reviews_fine_tuned_v2")




Step,Training Loss


('./film_reviews_fine_tuned_v2/tokenizer_config.json',
 './film_reviews_fine_tuned_v2/special_tokens_map.json',
 './film_reviews_fine_tuned_v2/vocab.json',
 './film_reviews_fine_tuned_v2/merges.txt',
 './film_reviews_fine_tuned_v2/added_tokens.json')

# Prediction : Text Generation

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./film_reviews_fine_tuned_v1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Prompt for text generation
prompt = "The movie I watched yesterday was"

# Generate film review
generated_review = text_generator(prompt, max_length=150, num_return_sequences=1, temperature=0.1)[0]['generated_text']

# Print the generated review
print("Generated Film Review:")
print(generated_review)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Film Review:
The movie I watched yesterday was a bit of a disappointment, but it's still a great movie. It's a very good movie, and I'm glad that it's been made.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie


# Prediction : Conversation Bot

In [24]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./film_reviews_fine_tuned_v2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Start a conversation with the user
print("Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.")

while True:
    # Get user input
    user_input = input("You: ")

    # Check for exit condition
    if user_input.lower() == 'exit':
        print("Movie Bot: Goodbye!")
        break

    # Generate response
    generated_response = text_generator(user_input, top_k=50, top_p=0.95, max_length=150, num_return_sequences=3, temperature=0.6)[0]['generated_text']

    # Print the bot's response
    print("Movie Bot:", generated_response)


Movie Bot: Hi there! Let's talk about movies. You can type 'exit' to end the conversation.


You:  give your review of animal movie


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Movie Bot: give your review of animal movie. (You can also use our review guidelines here.)

The film has a lot to offer, but it is also a little bit of a shame that the main character is a man. He is, after all, a man who has been raised by his mother and father, but who has been raised by his father in an abusive household. He is also, as one of the main characters points out, a man who has been raised as a child.

The plot is a bit predictable, but it is a bit of a disappointment. The main character is a child and he is not a father, but rather a man who is given an ultimatum: he will kill his father or he will


You:  give your thought's on ranbir kapoor's performance


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Movie Bot: give your thought's on ranbir kapoor's performance, or maybe you don't even know what a runbir kapoor is.

Kapoor is a very unique character. He is a bit like a rock star, but at the same time, he is a man who has his own unique story. His role as a father figure is not only important but also something that is very important to the story.

Kapoor is a man who has a lot of baggage. The fact that he is a father figure is a huge part of his identity. It is important to know that he is a man who has become a father figure for his son. This is a story where K. is the only


You:  exit


Movie Bot: Goodbye!


# DIALOGPT 

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")


tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 2.93kB/s]
config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 286kB/s]
vocab.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 1.18MB/s]
merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 774kB/s]
pytorch_model.bin: 100%|██████████████████████████████████████████████████████████████████████████████████| 1.75G/1.75G [02:33<00:00, 11.4MB/s]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 19.6kB/s]


In [14]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User: hi


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: hi


>> User: what are you good at


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i can play guitar


>> User: do you know about animal movie


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i do


>> User: what is the story


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i have no idea


>> User: awesome


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am glad
