In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [9]:
from huggingface_hub import notebook_login

In [10]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Generate dummy data

In [1]:
import random

# Dummy dataset generation for film reviews
def generate_film_reviews(num_reviews=100, max_review_length=200):
    positive_adjectives = ["amazing", "fantastic", "captivating", "outstanding", "excellent"]
    negative_adjectives = ["disappointing", "boring", "predictable", "mediocre", "unimpressive"]
    movies = ["The Midnight Star", "Dreamscape", "Eternal Echo", "Lost Horizon", "Whispering Shadows"]

    reviews = []

    for _ in range(num_reviews):
        movie = random.choice(movies)
        rating = random.randint(1, 10)
        adjective = random.choice(positive_adjectives) if rating > 5 else random.choice(negative_adjectives)
        review_text = f"{movie} is {adjective}! I would give it a {rating}/10. "
        review_text += " ".join(["This movie", "exceeded my expectations.", "Highly recommended!", "A must-watch!"] * random.randint(1, 3))
        review_text += "\n\n"

        # Trim the review if it exceeds the max length
        review_text = review_text[:max_review_length]

        reviews.append(review_text)

    return reviews

# Save the dummy dataset to a file
dummy_reviews = generate_film_reviews()
dataset_path = "film_reviews_dataset.txt"

with open(dataset_path, "w", encoding="utf-8") as file:
    file.writelines(dummy_reviews)

print(f"Dummy dataset saved to {dataset_path}")


Dummy dataset saved to film_reviews_dataset.txt


# Train GPT2 model on dummy txt

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load your custom film reviews dataset
dataset_path = "./movie_reviews_dataset/reviews_1.txt"

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./film_reviews_fine_tuned_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./film_reviews_fine_tuned_v2")
tokenizer.save_pretrained("./film_reviews_fine_tuned_v2")



ValueError: Input file path ./movie_reviews_dataset/reviews_1.txt not found

# Train GPT2 on proper datasets 

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import TrainerCallback

dataset_path = "./datasets/All data_1.txt"
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# *    num_train_epochs=3,              # total number of training epochs*
# *    per_device_train_batch_size=16,  # batch size per device during training*
# *    per_device_eval_batch_size=16,   # batch size for evaluation*
# *    warmup_steps=50,                 # number of warmup steps for learning rate scheduler*
# *    weight_decay=0.01,               # strength of weight decay*
# *    logging_dir='./logs',            # directory for storing logs*
# *    logging_steps=20,*
# *    evaluation_strategy="steps"*        
# Define training arguments
training_args = TrainingArguments(
    output_dir="./anmialGPTV1",
    overwrite_output_dir=True,
    num_train_epochs=300,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',            # directory for storing logs*
)

class CustomCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.log_history:
            print(f"Step {state.global_step}, Loss: {state.log_history[-1]['loss']:.4f}")
        else:
            pass

# Create Trainer instance with the custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    callbacks=[CustomCallback()],
)

# Fine-tune the model
trainer.train()


# Save the fine-tuned model
model.save_pretrained("./anmialGPTV1")
tokenizer.save_pretrained("./anmialGPTV1")

Step,Training Loss
500,2.4774
1000,0.2768
1500,0.0683
2000,0.0407
2500,0.0304
3000,0.025
3500,0.0222
4000,0.0211
4500,0.0178
5000,0.0185


Step 501, Loss: 2.4774
Step 502, Loss: 2.4774
Step 503, Loss: 2.4774
Step 504, Loss: 2.4774
Step 505, Loss: 2.4774
Step 506, Loss: 2.4774
Step 507, Loss: 2.4774
Step 508, Loss: 2.4774
Step 509, Loss: 2.4774
Step 510, Loss: 2.4774
Step 511, Loss: 2.4774
Step 512, Loss: 2.4774
Step 513, Loss: 2.4774
Step 514, Loss: 2.4774
Step 515, Loss: 2.4774
Step 516, Loss: 2.4774
Step 517, Loss: 2.4774
Step 518, Loss: 2.4774
Step 519, Loss: 2.4774
Step 520, Loss: 2.4774
Step 521, Loss: 2.4774
Step 522, Loss: 2.4774
Step 523, Loss: 2.4774
Step 524, Loss: 2.4774
Step 525, Loss: 2.4774
Step 526, Loss: 2.4774
Step 527, Loss: 2.4774
Step 528, Loss: 2.4774
Step 529, Loss: 2.4774
Step 530, Loss: 2.4774
Step 531, Loss: 2.4774
Step 532, Loss: 2.4774
Step 533, Loss: 2.4774
Step 534, Loss: 2.4774
Step 535, Loss: 2.4774
Step 536, Loss: 2.4774
Step 537, Loss: 2.4774
Step 538, Loss: 2.4774
Step 539, Loss: 2.4774
Step 540, Loss: 2.4774
Step 541, Loss: 2.4774
Step 542, Loss: 2.4774
Step 543, Loss: 2.4774
Step 544, L

('./anmialGPTV1/tokenizer_config.json',
 './anmialGPTV1/special_tokens_map.json',
 './anmialGPTV1/vocab.json',
 './anmialGPTV1/merges.txt',
 './anmialGPTV1/added_tokens.json')

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()

# Prediction : Text Generation

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./film_reviews_fine_tuned_v1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Prompt for text generation
prompt = "The movie I watched yesterday was"

# Generate film review
generated_review = text_generator(prompt, max_length=150, num_return_sequences=1, temperature=0.1)[0]['generated_text']

# Print the generated review
print("Generated Film Review:")
print(generated_review)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Film Review:
The movie I watched yesterday was a bit of a disappointment, but it's still a great movie. It's a very good movie, and I'm glad that it's been made.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie.

The movie is a bit of a mess, but it's still a great movie


# Prediction : Conversation Bot

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch
torch.manual_seed(100)
# Load the fine-tuned model and tokenizer
model_path = "./anmialGPTV1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Start a conversation with the user
print("Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.")

# while True:
#     # Get user input
#     user_input = input("You: ")

#     # Check for exit condition
#     if user_input.lower() == 'exit':
#         print("Movie Bot: Goodbye!")
#         break

#     # Generate response
#     generated_response = text_generator(user_input, top_k=10, top_p=0.99, max_length=150, num_return_sequences=3, temperature=0.9)[0]['generated_text']

#     # Print the bot's response
#     print("Movie Bot:", generated_response)


Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.


## Streamer 

In [3]:
from transformers import TextStreamer

In [4]:
model.generation_config

GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50
}

In [42]:
inputs = tokenizer(["Should filmmakers be held accountable"], return_tensors="pt")
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=100)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Should filmmakers be held accountable for misleading viewers if they do not want to watch
	if there is something that you forget you are a writer and an actor and then you are presented with a script that you cannot ignore
	if it is not a vision that can communicate you won't use your own judgment. I think that's where I join issue with switch on the gay spot are people with all kinds of cases when you you are submitting your heart your point of view to the public say that this is Mike I know that


## Beam Search 

In [29]:
beam_output = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    early_stopping=True
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal washes wetter'n hand of advertising in messaging in our society and I'm really happy that you decided to join us is a with some exceptions understand everything in this room is really allowed within that framework we are here. Society in that space technical deal with on his show stop the jumping off point challenges misogynistic is disrespectful of women high school because he feels his just to send a message disputable speech and expression in a Saturn Street workout partner weather freedom of speech has to mean anything because freedom of


In [30]:
beam_output = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal was the fight scene where Bobby Deol Abrar Haq has just murdered someone on the day of his third wedding
fourth I don't even know but his face is spread with blood and he forcefully starts having sex with his newly wed wife whose pregnant by the way in front of everyone
once he is done he calls for his other wives in his haram at the knife point where he says Kuchh Sari Duniya
and if you're not looking and taking things are looking at


In [31]:
# set return_num_sequences > 1
beam_outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: Give me your favourite scene from Animal washes wetter'n hand of advertising in messaging in our society and I'm really happy that you decided to join us is a with some exceptions understand everything in this room is really allowed within that framework we are here. Society in that space technical deal with on his show stop the jumping off point challenges misogynistic is disrespectful of women high school because he feels his just to send a message disputable speech and expression in a Saturn Street workout partner weather freedom of speech has to mean anything because freedom to
1: Give me your favourite scene from Animal washes wetter'n hand of advertising in messaging in our society and I'm really happy that you decided to join us is a with some exceptions understand everything in this room is really allowed within that framework we are here. Society in that space techni

## Sampling

In [32]:
# set seed to reproduce results. Feel free to change the seed though to get different results
from transformers import set_seed
set_seed(42)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal was simple but effective at the same time that we were making money because this is a film where the villain is a character and the film is about violence
violence only problem I see in films like this one writer or this one director and all of them have a purpose for existence and in the end they are just
people sitting in the dark room and thinking about what it would have done to
people around you what the breaking of this Chashma in the bunch of the wall would have done to


In [33]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal was simple but effective as a poem
line a mujhe nahin Pata ismein main Sandeep Reddy Vanga ko game Karun ya use theatre ke crowd ko Pratiksha from our entertainment team has this one really
line in one line but more than that he's telling us that we must all try to find the answer to this question
line in our entertainment team is this a question that must be asked all the time we must be really creative people we


In [34]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal was simple but effective at the same time that we were making money because this is a film where the villain is a character and the film is about violence
violence only problem I see in films like this one writer or this one director and all of them have a purpose for existence and in the end they are just
people sitting in the dark room and thinking about what it would have done to
people around you what the breaking of this Chashma in the bunch of the wall would have done to


In [35]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Give me your favourite scene from Animal was the closing scene you know I'm going to like it if somebody is treating you badly you were going to say something you're not going to
	if it's if something bothers you you say something freedom of speech in in in in that scenario this is Life gay comment section below Cowboy say Google how does respectful imagine live order the impunity with which men on the internet operate don't make faces W telling
	use binder with freaking fair this is on somebody telling me don't make faces life


In [36]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: Give me your favourite scene from Animal was simple but effective at the same time that we were making money because this is a film where the villain is a character and the film is about violence
violence only problem I see in films like this one writer or this one director and all of them have a purpose for existence and in the end they are just
people sitting in the dark room and thinking about what it would have done to
people around you what the breaking of this Chashma in the bunch of the wall would have done to
1: Give me your favourite scene from Animal to give you something to look at and say that is why I would have you want to watch it if you wanted
	’s outstanding performance makes Har
	Daytona feel like a real person and not just a pawn in his game which is trying to look like a cool person to ride because he's been doing it repeatedly while he speaks about it wha

In [46]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.92,
    top_k=0,   
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Should filmmakers be held accountable for their work if they don't like it I'm just from this whole conversation dating this one thing of a mr. Heater just called film critics mother's telling children. I will be your mother I think I mean feed the question that you asked about the time I believe eventually this discourse is important but you must be allowed to have this is the issue it feels like these are people making and expressing themselves freely in that particular
	if it's if something bothers you you say something freedom of


# DIALOGPT 

In [13]:
Dfrom transformers import AutoModelForCausalLM, AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")


tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 2.93kB/s]
config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 286kB/s]
vocab.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 1.18MB/s]
merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 774kB/s]
pytorch_model.bin: 100%|██████████████████████████████████████████████████████████████████████████████████| 1.75G/1.75G [02:33<00:00, 11.4MB/s]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 19.6kB/s]


In [14]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User: hi


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: hi


>> User: what are you good at


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i can play guitar


>> User: do you know about animal movie


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i do


>> User: what is the story


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i have no idea


>> User: awesome


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am glad
