In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: Traceback (most recent call last):
  File "/Users/deepanshu.kandpal/Library/Caches/pypoetry/virtualenvs/llm-r-8

# Generate dummy data

In [1]:
import random

# Dummy dataset generation for film reviews
def generate_film_reviews(num_reviews=100, max_review_length=200):
    positive_adjectives = ["amazing", "fantastic", "captivating", "outstanding", "excellent"]
    negative_adjectives = ["disappointing", "boring", "predictable", "mediocre", "unimpressive"]
    movies = ["The Midnight Star", "Dreamscape", "Eternal Echo", "Lost Horizon", "Whispering Shadows"]

    reviews = []

    for _ in range(num_reviews):
        movie = random.choice(movies)
        rating = random.randint(1, 10)
        adjective = random.choice(positive_adjectives) if rating > 5 else random.choice(negative_adjectives)
        review_text = f"{movie} is {adjective}! I would give it a {rating}/10. "
        review_text += " ".join(["This movie", "exceeded my expectations.", "Highly recommended!", "A must-watch!"] * random.randint(1, 3))
        review_text += "\n\n"

        # Trim the review if it exceeds the max length
        review_text = review_text[:max_review_length]

        reviews.append(review_text)

    return reviews

# Save the dummy dataset to a file
dummy_reviews = generate_film_reviews()
dataset_path = "film_reviews_dataset.txt"

with open(dataset_path, "w", encoding="utf-8") as file:
    file.writelines(dummy_reviews)

print(f"Dummy dataset saved to {dataset_path}")


Dummy dataset saved to film_reviews_dataset.txt


# Train dummy model

In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load your custom film reviews dataset
dataset_path = "./movie_reviews_dataset/reviews_1.txt"
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./film_reviews_fine_tuned_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./film_reviews_fine_tuned_v2")
tokenizer.save_pretrained("./film_reviews_fine_tuned_v2")




Step,Training Loss


('./film_reviews_fine_tuned_v2/tokenizer_config.json',
 './film_reviews_fine_tuned_v2/special_tokens_map.json',
 './film_reviews_fine_tuned_v2/vocab.json',
 './film_reviews_fine_tuned_v2/merges.txt',
 './film_reviews_fine_tuned_v2/added_tokens.json')

# Prediction : Text Generation

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

pipe = pipeline("text-generation", model="dkandpalz/animalGPT2")
tokenizer = AutoTokenizer.from_pretrained("dkandpalz/animalGPT2")
model = AutoModelForCausalLM.from_pretrained("dkandpalz/animalGPT2")

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Prompt for text generation
prompt = "The movie I watched yesterday was"

# Generate film review
generated_review = text_generator(prompt, max_length=150, num_return_sequences=1, temperature=0.1)[0]['generated_text']

# Print the generated review
print("Generated Film Review:")
print(generated_review)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Film Review:
The movie I watched yesterday was as a kid I remember feeling like what a scene it was yeah now you know it has
become a huge success at the box office yeah yeah we are seeing a little bit of this so what draws you to this
flawed hero uh you were attracted to that because um what I figured out even
this this question I asked myself also what I figured out from the childhood
whatever films I liked were all flawed man I don't know for some reason can can
you name a couple of films that you like IND Chu Kam shatru is not a flawed man Wes character he's a little angry young
man there's a lot actually in Hindi to there's a lot in English to there's a lot


# Prediction : Conversation Bot

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dkandpalz/animalGPT2")
model = AutoModelForCausalLM.from_pretrained("dkandpalz/animalGPT2")

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Start a conversation with the user
print("Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.")

while True:
    # Get user input
    user_input = input("You: ")

    # Check for exit condition
    if user_input.lower() == 'exit':
        print("Movie Bot: Goodbye!")
        break

    # Generate response
    inputs = tokenizer(user_input, return_tensors="pt")
    generated_response = model.generate(**inputs, max_new_tokens=100, do_sample=True, top_p=0.92, top_k=0,num_beams=5,no_repeat_ngram_size=2,num_return_sequences=5,)
    output = tokenizer.decode(generated_response[0], skip_special_tokens=True)
    # Print the bot's response
    print("Movie Bot:", output)


Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.


You:  Who is Sandeep Reddy Vanga


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Movie Bot: Who is Sandeep Reddy Vanga Context: Sandep is an Indian film director, screenwriter and editor who makes Blockbuster free films. He is one of the highest-paid actors of Hindi cinema and has featured in Forbes India's Celebrity  list since. Kapoor is the recipient of several awards, including six Filmfare Awards. The son of actors Rishi Kapur and Neetu Singh, and the grandson of actor-director Raja Harish Chandra, Kapwin pursued filmmaking and method acting at the School of Visual Arts


You:  which is your favourite scene from Animal


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Movie Bot: which is your favourite scene from Animal Park yet another great
hug and papa that's the uh the first time we've seen a human being cut open by an ex it's fine
you know that kind of stuff whoever wants to tell their story tells the story the way they want to
tell it and uh while we're watching the film we always have this tendency of thinking what's in the market already so
and um even when the hero Reveals His Name these are two very specific decisions that you've
taken


You:  Give your Animal review


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Movie Bot: Give your Animal review
do something about it if you complain about the film then I can point to all the so to speak woke uh films that you have
made and I could say why is this okay and why are you not doing this right and that's where we are at and we need to kind of view all
of these films from the lens of unfortunately how they're going to be consumed by a society that anywh at this
point of time believes that we have reached a point where this is no longer


# POC 2 , SCRAPE DATA OF 2023 bollywood block busters (>200 cr box office)

In [10]:
# import requests
# from bs4 import BeautifulSoup
# import os

# # Define a list of example URLs (replace these with your actual URLs)
# example_urls = [
#     'https://timesofindia.indiatimes.com/entertainment/hindi/movie-reviews/animal/movie-review/105651233.cms',
#     'https://www.theguardian.com/film/2023/dec/02/animal-review-ranbir-kapoor-plays-one-of-the-vilest-protagonists-in-cinema-history',
#     # Add more URLs as needed
# ]

# # Create a directory to store the scraped data
# output_directory = 'movie_reviews_dataset'
# os.makedirs(output_directory, exist_ok=True)

# def scrape_movie_reviews(url):
#     # Fetch the web page content
#     response = requests.get(url)

#     if response.status_code == 200:
#         # Parse the HTML content
#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Extract movie reviews (modify based on the website structure)
#         reviews = []
#         review_elements = soup.find_all('div', class_='section1')  # Adjust based on HTML structure
        
#         # soup = BeautifulSoup(review_elements, 'html.parser')
#         # Extract all text from the HTML
#         print(review_elements)
#         review_text = soup.get_text()
#         reviews.append(review_text)
#         # for review_element in review_elements:
#         #     review_text = review_element.find('p', class_='Review:').text  # Adjust based on HTML structure
#         #     reviews.append(review_text)

#         return reviews
#     else:
#         print(f"Failed to fetch the page. Status code: {response.status_code}")
#         return None

# # Iterate through each URL and scrape reviews
# for i, url in enumerate(example_urls, start=1):
#     print(f"Scraping data from {url}...")

#     movie_reviews = scrape_movie_reviews(url)

#     if movie_reviews:
#         # Save reviews to a text file
#         output_file_path = os.path.join(output_directory, f'reviews_{i}.txt')

#         with open(output_file_path, 'w', encoding='utf-8') as file:
#             for review in movie_reviews:
#                 file.write(review + '\n')

#         print(f"Reviews saved to {output_file_path}\n")
#     else:
#         print("Skipping...\n")

# print("Scraping complete.")


# DIALOGPT 

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")


tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 2.93kB/s]
config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 286kB/s]
vocab.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 1.18MB/s]
merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 774kB/s]
pytorch_model.bin: 100%|██████████████████████████████████████████████████████████████████████████████████| 1.75G/1.75G [02:33<00:00, 11.4MB/s]
generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:00, 19.6kB/s]


In [14]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User: hi


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: hi


>> User: what are you good at


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i can play guitar


>> User: do you know about animal movie


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i do


>> User: what is the story


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i have no idea


>> User: awesome


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: i am glad
