In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
from huggingface_hub import notebook_login

In [5]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Generate dummy data

In [None]:
import random

# Dummy dataset generation for film reviews
def generate_film_reviews(num_reviews=100, max_review_length=200):
    positive_adjectives = ["amazing", "fantastic", "captivating", "outstanding", "excellent"]
    negative_adjectives = ["disappointing", "boring", "predictable", "mediocre", "unimpressive"]
    movies = ["The Midnight Star", "Dreamscape", "Eternal Echo", "Lost Horizon", "Whispering Shadows"]

    reviews = []

    for _ in range(num_reviews):
        movie = random.choice(movies)
        rating = random.randint(1, 10)
        adjective = random.choice(positive_adjectives) if rating > 5 else random.choice(negative_adjectives)
        review_text = f"{movie} is {adjective}! I would give it a {rating}/10. "
        review_text += " ".join(["This movie", "exceeded my expectations.", "Highly recommended!", "A must-watch!"] * random.randint(1, 3))
        review_text += "\n\n"

        # Trim the review if it exceeds the max length
        review_text = review_text[:max_review_length]

        reviews.append(review_text)

    return reviews

# Save the dummy dataset to a file
dummy_reviews = generate_film_reviews()
dataset_path = "film_reviews_dataset.txt"

with open(dataset_path, "w", encoding="utf-8") as file:
    file.writelines(dummy_reviews)

print(f"Dummy dataset saved to {dataset_path}")


# Train GPT2 model on dummy txt

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load your custom film reviews dataset
dataset_path = "./movie_reviews_dataset/reviews_1.txt"

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./film_reviews_fine_tuned_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./film_reviews_fine_tuned_v2")
tokenizer.save_pretrained("./film_reviews_fine_tuned_v2")

# Train GPT2 on proper datasets 

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import TrainerCallback

dataset_path = "./datasets/All data_1.txt"
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=128,  # Adjust the block size according to your dataset
)

# Use the default data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# *    num_train_epochs=3,              # total number of training epochs*
# *    per_device_train_batch_size=16,  # batch size per device during training*
# *    per_device_eval_batch_size=16,   # batch size for evaluation*
# *    warmup_steps=50,                 # number of warmup steps for learning rate scheduler*
# *    weight_decay=0.01,               # strength of weight decay*
# *    logging_dir='./logs',            # directory for storing logs*
# *    logging_steps=20,*
# *    evaluation_strategy="steps"*        
# Define training arguments
training_args = TrainingArguments(
    output_dir="./anmialGPTV1",
    overwrite_output_dir=True,
    num_train_epochs=300,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',            # directory for storing logs*
)

class CustomCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.log_history:
            print(f"Step {state.global_step}, Loss: {state.log_history[-1]['loss']:.4f}")
        else:
            pass

# Create Trainer instance with the custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    callbacks=[CustomCallback()],
)

# Fine-tune the model
trainer.train()


# Save the fine-tuned model
model.save_pretrained("./anmialGPTV1")
tokenizer.save_pretrained("./anmialGPTV1")

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()

# Prediction : Text Generation

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./film_reviews_fine_tuned_v1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Prompt for text generation
prompt = "The movie I watched yesterday was"

# Generate film review
generated_review = text_generator(prompt, max_length=150, num_return_sequences=1, temperature=0.1)[0]['generated_text']

# Print the generated review
print("Generated Film Review:")
print(generated_review)


# Prediction : Conversation Bot

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch
torch.manual_seed(100)
# Load the fine-tuned model and tokenizer
model_path = "./anmialGPTV1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Define a text generation pipeline
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Start a conversation with the user
print("Movie Bot: Hi there! Let's talk about Animal. You can type 'exit' to end the conversation.")

# while True:
#     # Get user input
#     user_input = input("You: ")

#     # Check for exit condition
#     if user_input.lower() == 'exit':
#         print("Movie Bot: Goodbye!")
#         break

#     # Generate response
#     generated_response = text_generator(user_input, top_k=10, top_p=0.99, max_length=150, num_return_sequences=3, temperature=0.9)[0]['generated_text']

#     # Print the bot's response
#     print("Movie Bot:", generated_response)


## Streamer 

In [3]:
from transformers import TextStreamer

In [4]:
model.generation_config

NameError: name 'model' is not defined

In [None]:
inputs = tokenizer(["Should filmmakers be held accountable"], return_tensors="pt")
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=100)

## Beam Search 

In [None]:
beam_output = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    early_stopping=True
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
beam_output = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

In [None]:
# set return_num_sequences > 1
beam_outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))


## Sampling

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
from transformers import set_seed
set_seed(42)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))


In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


In [None]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.92,
    top_k=0,   
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

# DIALOGPT 

In [None]:
Dfrom transformers import AutoModelForCausalLM, AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")


In [None]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

## Adding BQ

In [1]:
import os 
os.environ.clear()

In [2]:
!poetry add python-dotenv

('Configuration file exists at /Users/deepanshu.kandpal/Library/Application Support/pypoetry, reusing this directory.\n\nConsider moving TOML configuration files to /Users/deepanshu.kandpal/Library/Preferences/pypoetry, as support for the legacy directory will be removed in an upcoming release.',)
[33mThe currently activated Python version 3.8.18 is not supported by the project (^3.9,<3.12).
Trying to find and use a compatible version.[39m 
Using [36mpython3[39m (3.9.6)
The following packages are already present in the pyproject.toml and will be skipped:

  • [36mpython-dotenv[39m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [3]:
!poetry show

('Configuration file exists at /Users/deepanshu.kandpal/Library/Application Support/pypoetry, reusing this directory.\n\nConsider moving TOML configuration files to /Users/deepanshu.kandpal/Library/Preferences/pypoetry, as support for the legacy directory will be removed in an upcoming release.',)
[33mThe currently activated Python version 3.8.18 is not supported by the project (^3.9,<3.12).
Trying to find and use a compatible version.[39m 
Using [36mpython3[39m (3.9.6)
[36maccelerate                   [39m [39;1m0.25.0      [39;22m Accelerate
[36maiohttp                      [39m [39;1m3.9.1       [39;22m Async http client/server framewo...
[36maiosignal                    [39m [39;1m1.3.1       [39;22m aiosignal: a list of registered ...
[36malembic                      [39m [39;1m1.13.0      [39;22m A database migration tool for SQ...
[36mantlr4-python3-runtime       [39m [39;1m4.9.3       [39;22m ANTLR 4.9.3 runtime for Python 3.7
[36manyio                 

In [4]:
from dotenv import load_dotenv
load_dotenv("animal.env")

True

In [5]:
for key, value in os.environ.items():
    print(f"{key}: {value}")

type: service_account
project_id: placeapi-333910
private_key_id: cba7da27e82d1a1abf43f648bb8ffa4bf504b415
private_key: MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDT0Lf4+t2wMr7kDQTHLaxWeUTMNCDnkk1bqwgIGcsGJU4KXtNbS5uPkhCinh/y5KuiNLFqcIxdKl3ZdVWDrNqVeNno1c78bYhmSf7lql97VOBbZU4jIgaKPnc3DuCiKQR9FP1unqT/zV5413KYmRXRpiH7qdx4yy9sGq8Z4hJhhbQCPze016b2NOkhXypBnvT1U8ujhSDC+tCTLMYR+BNMzkuN3cdwbbTPTXlij9bdHOfDpZ1C71TIWOn9h+DEsultWAPptSHBvup22pl2c4MK2OmUSrLZxtp2hp7wYxNYhQpz29EcaOC9NdKjP2vcZucdbbIRTzewhsjMLTQC22LBAgMBAAECggEAC1TVIrq49V4hnC2eHdGxsIaYSZxYTksjXE4QBe7rT47bV+6uu+mVjt6viSOQbfwRuoG7JAiVWb6UereuNXcf0nM+pRgbFKH/dAltW3f716s4vWrz/Skq1GZ3Fjjzh3lORirDTBGsngqsjSZpeQRniGms07vIkaW9Cbl7uJTBivn1NdyiCVnLyRgnJ/cBcK9Ic0TPM8fi2AWPBNkeFFK3QNG37EzrTWuY0ygtTNs5jR0FbTsBru6FQz01AOeRSbrOvH7kgN3JsqTOCBj6YckUNuZSLltYi3Y+aKkpTg9eMLHDDD2vL7Bw8f7d4aZ8jUDHz0rxUX7PC1Qebwx2M6jkaQKBgQDtdmFcN//V0kWGOlrV+MEMz+EWdakuuTTuJYHrKk2F58O6sinDI1PkBqb5VuneUi4JC3QOvUtS1boTF0xjynOMbDPcf1EusIbNziGrefNpTfUlaP3Kj7elJmSi8EX4ARes

In [None]:
from  animal_gpt.utils.config import (
    default_bq_cliente_mail,
    default_bq_private_key,
    default_bq_token_uri,
    default_project_id,
    default_dialect,
    default_private_key,
    format_bqprivatekey,
)
import json

In [None]:
bqlogin = (
        "{"
        + '"project_id": "'
        + default_project_id
        + '", '
        + '"private_key": "-----BEGIN PRIVATE KEY-----\\n'
        + format_bqprivatekey(default_bq_private_key)
        + '\\n-----END PRIVATE KEY-----\\n", '
        + '"client_email": "'
        + default_bq_cliente_mail
        + '", '
        + '"token_uri": "'
        + default_bq_token_uri
        + '"'
        + "}"
)

In [None]:
try:
    _ = json.loads(bqlogin)
except Exception as e:
    raise e("BQLOGIN must be a valid json string")

with open(default_private_key, "wt") as f:
    f.write(bqlogin)

In [2]:
from google.oauth2 import service_account
from google.cloud import bigquery

In [None]:
credentials = service_account.Credentials.from_service_account_file(default_private_key, )
client = bigquery.Client.from_service_account_json(default_private_key)

## Logging

In [None]:
import logging
from logging.handlers import RotatingFileHandler
from flask import Flask, render_template, request
import time
import uuid
import pandas as pd


class MyLogger:
    def __init__(self, log_filename):
        self.log_formatter = logging.Formatter('%(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        self.log_handler = RotatingFileHandler(log_filename, maxBytes=10*1024*1024, backupCount=5)
        self.log_handler.setFormatter(self.log_formatter)
        self.logger = logging.getLogger('my_logger')
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(self.log_handler)

    def log_info(self, log_entry):
        self.logger.info(log_entry)

    def read_log_file(self, log_filename):
        log_entries = []
        with open(log_filename, 'r') as log_file:
            for line in log_file:
                line = line.strip()
                if line:
                    try:
                        log_entry = eval(line)
                        log_entries.append(log_entry)
                    except Exception as e:
                        print(f"Error processing log entry: {e}")
        return pd.DataFrame(log_entries)

# Create an instance of MyLogger
log_filename = f'logs/app_{time.strftime("%Y-%m-%d")}.log'
logger_instance = MyLogger(log_filename)

act_id = str(uuid.uuid4())
timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
user_input = "who is sandeep reddy vanga"
thumbs_value = "1"
log_entry = {
    'log_level': 'INFO',
    'activity_id': act_id,
    'timestamp': timestamp,
    'user_input': user_input,
    'thumbs_value': thumbs_value,
    'response': 'Bot\'s response goes here'
}
logger_instance.log_info(log_entry)



In [None]:
logs = logger_instance.read_log_file(log_filename)

In [None]:
logs

In [None]:
logs['timestamp'].max()

In [7]:
from animal_gpt.utils.bq_class import BQ

bq=  BQ()
table = "placeapi-333910.animalgpt.animalgpt_logs"

In [None]:
bq.to_bq(df=logs, table=table)

## Prediction

In [None]:

!poetry add python-dotenv==1.0.1

In [1]:
!poetry add "google-cloud-bigquery[pandas]"

('Configuration file exists at /Users/deepanshu.kandpal/Library/Application Support/pypoetry, reusing this directory.\n\nConsider moving TOML configuration files to /Users/deepanshu.kandpal/Library/Preferences/pypoetry, as support for the legacy directory will be removed in an upcoming release.',)
Using version [39;1m^3.17.1[39;22m for [36mgoogle-cloud-bigquery[39m

[34mUpdating dependencies[39m
[2K[34mResolving dependencies...[39m [39;2m(83.3s)[39;22m[34mResolving dependencies...[39m [39;2m(0.1s)[39;22m[34mResolving dependencies...[39m [39;2m(4.9s)[39;22m[34mResolving dependencies...[39m [39;2m(7.6s)[39;22m[34mResolving dependencies...[39m [39;2m(7.7s)[39;22m[34mResolving dependencies...[39m [39;2m(9.9s)[39;22m[34mResolving dependencies...[39m [39;2m(11.2s)[39;22m[34mResolving dependencies...[39m [39;2m(11.5s)[39;22m[34mResolving dependencies...[39m [39;2m(13.0s)[39;22m[34mResolving dependencies...[39m [39;2m(13.9s)[39;22m[34mResolving de

In [21]:
import faiss
import numpy as np


In [20]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util


# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
# model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [41]:
# Generate embeddings for sentences and store in Faiss index
def create_embedding_index(sentences):
    embeddings = model.encode(sentences)
    
    # Normalize embeddings before adding to the index
    embeddings = np.array(embeddings)
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Create Faiss index
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype(np.float32))
    
    return index

# Perform sentence similarity search using Faiss
def find_similar_sentences(query, index, sentences, top_k=5):
    query_embedding = model.encode(query)
    query_embedding /= np.linalg.norm(query_embedding)

    # Perform similarity search using Faiss
    i, similar_indices = index.search(np.array([query_embedding]).astype(np.float32), top_k)
    print(i,similar_indices)
    # Retrieve and return similar sentences
    similar_sentences = [sentences[i] for i in similar_indices[0]]
    probabilities = i[0]
    return similar_sentences,probabilities

In [39]:
# Sample sentences
sentences = ["Give me the complete cast of Animal",
"Who has made Animal",
"Who is the director of Animal?",
"Who is Ranbir Kapoor?",
"Who is Sandeep Reddy Vanga?",
"Give your review of Animal",
"Do you think Animal is a bad film?",
"Why is Animal getting so much hate?",
"is Animal a misogynist film?",
"Do films carry any responsibililty for morality?",
"Should filmmakers be held accountable?",
"Give me your favourite scene from Animal",
"Give me theories for Animal Sequel",
"Tell me about Animal Sequel",
"What is Animal?",
"Give me Animal review?",
"who is the Main hero of Animal?",
"who all worked in animal"]

In [42]:
index = create_embedding_index(sentences)

# Example query
user_input = "Why is Animal getting so much hate?"

# Find similar sentences
similar_sentences, probabilities = find_similar_sentences(user_input, index, sentences)

# Print the results
print("Query:", user_input)
print("Similar Sentences:", similar_sentences)
print("probability:",probability )

[[1.         0.5423268  0.4972879  0.4949548  0.46516395]] [[ 7 14  6  1  5]]
Query: Why is Animal getting so much hate?
Similar Sentences: ['Why is Animal getting so much hate?', 'What is Animal?', 'Do you think Animal is a bad film?', 'Who has made Animal', 'Give your review of Animal']
probability: [1.         0.5423268  0.4972879  0.4949548  0.46516395]


In [28]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch
torch.manual_seed(100)
# Load the fine-tuned model and tokenizer
model_path = "./anmialGPTV1"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model_predict = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
# model.eval()

In [None]:
inputs = tokenizer(["Give me your favourite scene from Animal",], return_tensors="pt")

In [None]:
from transformers import set_seed
set_seed(42)
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model_predict.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.92,
    top_k=0,   
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

In [6]:
read_table = "placeapi-333910.animalgpt.animal_prompt_db"


In [8]:
query_for_index =  f'''SELECT query as queries FROM `{read_table}`'''
prompts = bq.read_bq(query = query_for_index)

In [16]:
list(prompts["queries"])

['Give me the complete cast of Animal',
 'who is the Main hero of Animal?',
 'Do you think Animal is a bad film?',
 'Why is Animal getting so much hate?',
 'Who has made Animal',
 'Who is the director of Animal?',
 'Give your review of Animal',
 'What is Animal?',
 'is Animal a misogynist film?',
 'Give me your favourite scene from Animal',
 'Do films carry any responsibililty for morality?',
 'Should filmmakers be held accountable?',
 'Who is Ranbir Kapoor?',
 'who all worked in animal',
 'Who is Sandeep Reddy Vanga?']

In [81]:
query_for_prompt = f'''SELECT *  FROM `{read_table}`'''
prompt_mapping = bq.read_bq(query = query_for_prompt)

In [82]:
prompt_mapping

Unnamed: 0,query,prompt
0,Give me the complete cast of Animal,Ranbir Kapoor in a dual role as Ranvijay
1,who is the Main hero of Animal?,Ranbir Kapoor
2,Do you think Animal is a bad film?,I think it's a mixed bag
3,Why is Animal getting so much hate?,I know that some of you are
4,Who has made Animal,"Sandeep Reddy Vanga (pronounced),"
5,Who is the director of Animal?,"Sandeep Reddy Vanga (pronounced),"
6,Give your review of Animal,Critical response The film received mixed reviews
7,What is Animal?,Animal is a 2023 Indian Hindi-language action ...
8,is Animal a misogynist film?,I think so because while I was in a few minute...
9,Give me your favourite scene from Animal,was the fight scene where Bobby Deol Abrar Haq...


In [83]:
prompt_mapping.query("query == 'Give me your favourite scene from Animal' ")

Unnamed: 0,query,prompt
9,Give me your favourite scene from Animal,was the fight scene where Bobby Deol Abrar Haq...


In [84]:
index = create_embedding_index(list(prompt_mapping["query"]))


In [85]:
from transformers import set_seed
# set seed to reproduce results. Feel free to change the seed though to get different results

set_seed(42)

def create_prediction(query, index, sentence):
     print(query)
     similar_sentences, probabilities = find_similar_sentences(query, index, prompt_mapping["query"])
     print(similar_sentences)
     prompt = prompt_mapping.query(f"query == '{similar_sentences[0]}'")
     inputs = tokenizer([prompt["prompt"].iloc[0],], return_tensors="pt")
     sample_output = model_predict.generate(
     **inputs,
     max_new_tokens=100,
     do_sample=True,
     top_p=0.92,
     top_k=0,   
     num_beams=5,
     no_repeat_ngram_size=2,
     num_return_sequences=5,
     )
     output = tokenizer.decode(sample_output[0], skip_special_tokens=True)
     return output


In [86]:
prompt_mapping.query("query == 'Give me your favourite scene from Animal' ")

Unnamed: 0,query,prompt
9,Give me your favourite scene from Animal,was the fight scene where Bobby Deol Abrar Haq...


In [87]:
output = create_prediction(query=input, index=index,sentence=prompt_mapping)
output

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Give me your favourite scene from Animal
[[0.9999999  0.5932654  0.52839327 0.5119614  0.50351584]] [[9 0 6 1 4]]
['Give me your favourite scene from Animal', 'Give me the complete cast of Animal', 'Give your review of Animal', 'who is the Main hero of Animal?', 'Who has made Animal']


"was the fight scene where Bobby Deol Abrar Haq has just murdered someone on the day of his third wedding\nfourth I don't even know but his face is spread with blood and he forcefully starts having sex with his newly wed wife whose pregnant by the way in front of everyone\nonce he is done he calls for his other wives in his haram at the knife point where he says Kuchh main Tumhen yah Nahin Pata ismein main Sandeep Reddy Vanga ko game Karun ya use theatre ke crowd ko Pratiksha from our entertainment team has this"

# Final results

In [1]:
import os 
os.environ.clear()
from dotenv import load_dotenv
load_dotenv("animal.env")
for key, value in os.environ.items():
    print(f"{key}: {value}")

type: service_account
project_id: placeapi-333910
private_key_id: cba7da27e82d1a1abf43f648bb8ffa4bf504b415
private_key: MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDT0Lf4+t2wMr7kDQTHLaxWeUTMNCDnkk1bqwgIGcsGJU4KXtNbS5uPkhCinh/y5KuiNLFqcIxdKl3ZdVWDrNqVeNno1c78bYhmSf7lql97VOBbZU4jIgaKPnc3DuCiKQR9FP1unqT/zV5413KYmRXRpiH7qdx4yy9sGq8Z4hJhhbQCPze016b2NOkhXypBnvT1U8ujhSDC+tCTLMYR+BNMzkuN3cdwbbTPTXlij9bdHOfDpZ1C71TIWOn9h+DEsultWAPptSHBvup22pl2c4MK2OmUSrLZxtp2hp7wYxNYhQpz29EcaOC9NdKjP2vcZucdbbIRTzewhsjMLTQC22LBAgMBAAECggEAC1TVIrq49V4hnC2eHdGxsIaYSZxYTksjXE4QBe7rT47bV+6uu+mVjt6viSOQbfwRuoG7JAiVWb6UereuNXcf0nM+pRgbFKH/dAltW3f716s4vWrz/Skq1GZ3Fjjzh3lORirDTBGsngqsjSZpeQRniGms07vIkaW9Cbl7uJTBivn1NdyiCVnLyRgnJ/cBcK9Ic0TPM8fi2AWPBNkeFFK3QNG37EzrTWuY0ygtTNs5jR0FbTsBru6FQz01AOeRSbrOvH7kgN3JsqTOCBj6YckUNuZSLltYi3Y+aKkpTg9eMLHDDD2vL7Bw8f7d4aZ8jUDHz0rxUX7PC1Qebwx2M6jkaQKBgQDtdmFcN//V0kWGOlrV+MEMz+EWdakuuTTuJYHrKk2F58O6sinDI1PkBqb5VuneUi4JC3QOvUtS1boTF0xjynOMbDPcf1EusIbNziGrefNpTfUlaP3Kj7elJmSi8EX4ARes

In [2]:
from animal_gpt.predict.core import Prediction
from dotenv import load_dotenv
load_dotenv("animal.env")
model_path = "dkandpalz/animalGPT1"

predict = Prediction(model_path)

In [7]:
import pandas as pd
import time
import numpy as np
import uuid
from  animal_gpt.utils.bq_class import BQ
from  animal_gpt.utils.config import (
    default_logs_db
)
from animal_gpt.utils.logger import log


@log
def generate_prediction(user_input):
    act_id = str(uuid.uuid4())
    output, probabilities, predicted_prompt = predict.create_prediction(user_input)
    thumbs_value = 'fantastic'
    return output, probabilities, thumbs_value ,predicted_prompt

In [8]:
input = "tell me about animal"

In [9]:
o = generate_prediction(input)
o

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Critical response The film received mixed reviews from critics and audiences and broke several box office records for a Hindi film, including the highest non-holiday opening, biggest single days, highest opening weekend and highest open week in India. At the overseas box-office, it broke the previous record held by Ranbir Kapoor's Brahmāstra: Part One – Shiva () and Jawan ().[][] In the first week it earned a total worldwide gross collection of ₹. crore (US$. million). On its th day"

In [1]:
from transformers import TextStreamer

tokenizer
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=100)