## This notebook explores two fine-tuned models to support Artisse AI prompts

*   The first model is finetuned GPT2
*   The second model is finetuned GPT Neo
*   These models were originally trained on Stability AI prompts
*   This is a draft. The prompts the models output have not been tested against Artisse AI's actual image generation algorithm

## To run the notebook


1.  Select "Runtime" from the top menu of this Google Colab
2.  Select "Run All"
3.  You will be prompted twice: 1st to allow this notebook to run in your Google Drive. Then to allow this notebook to download the models into your Google Drive.
4.  It will take some time to download the models
5.  Scroll to the bottom of the notebook and explore the applet












In [None]:
%%capture
!pip install gradio
!pip install transformers
from google.colab import drive
import re
import torch
import numpy as np
import gradio as gr
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import os
import io
from googleapiclient.http import MediaIoBaseDownload
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, AutoModelForTokenClassification, TokenClassificationPipeline

In [None]:
# Turn off logging for the rest of the notebook session.
import logging

# logging.disable(level) suppresses every log record at `level` and below,
# for all loggers; CRITICAL is the highest standard level, so this silences
# all standard logging output.
logging.disable(logging.CRITICAL)


In [None]:
# Download the fine-tuned model and tokenizer files from shared Google Drive
# folders into the user's own Drive, so later cells can load them by path.
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
import os

# Authenticate and mount Google Drive
drive.mount('/content/drive')

# Authenticate with Google Drive API
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Drive IDs for the models. Each ID is used below as the *parent* in a
# "'<id>' in parents" query, i.e. it identifies a folder whose files are
# downloaded, not a single file.
model_id_v1 = "1sLoRP8tt4H7eHM-d-KS-rySsu0E_CY4U"
tokenizer_id_v1 = "1nS5vQY65WfLl71w3Un1wOl8ccKHIR9Ho"

model_id_v2 = "1LAAkIiBFgVMJP2EQGutEUVez5Oaz_eYU"
tokenizer_id_v2 = "1t1stRthDWMRi46tt4ko9AwF_sQYL8fnj"

# Destination folders, in the same order as the IDs zipped with them below.
folder_path_model = [
    "/content/drive/MyDrive/artisse_models/v1/model",
    "/content/drive/MyDrive/artisse_models/v1/tokenizer",

    "/content/drive/MyDrive/artisse_models/v2/model",
    "/content/drive/MyDrive/artisse_models/v2/tokenizer"
]

# Create the folders if they don't exist
for f in folder_path_model:
    if not os.path.exists(f):
        os.makedirs(f)
        print("Folder created successfully!")
    else:
        print("Folder already exists.")

# Download the model and tokenizer files
for model_folder, file_id in zip(folder_path_model, [model_id_v1, tokenizer_id_v1, model_id_v2, tokenizer_id_v2]):
    # List all (non-trashed) files whose parent is the shared folder
    file_list = drive_service.files().list(q="'{}' in parents and trashed=false".format(file_id), spaces='drive',
                                           fields='files(id, name)').execute().get('files', [])

    # Download each file
    for file in file_list:
        file_path = os.path.join(model_folder, file['name'])
        request = drive_service.files().get_media(fileId=file['id'])
        # The context manager closes the destination file once the download
        # finishes, and also if a chunk request raises part-way through.
        # Previously the handle was never closed.
        with io.FileIO(file_path, mode='wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()
                print(f"Downloaded {int(status.progress() * 100)}% of {file['name']}")


In [None]:
# Load both fine-tuned checkpoints from the Google Drive folders that the
# download cell populated.

# --- v1 (served by the fine-tuned GPT2 tab of the applet) ---
model_path_v1 = '/content/drive/MyDrive/artisse_models/v1/model'
tokenizer_path_v1 = '/content/drive/MyDrive/artisse_models/v1/tokenizer'
model_v1 = AutoModelForCausalLM.from_pretrained(model_path_v1)
tokenizer_v1 = AutoTokenizer.from_pretrained(tokenizer_path_v1)

# --- v2 (served by the fine-tuned GPT Neo tab of the applet) ---
model_path_v2 = '/content/drive/MyDrive/artisse_models/v2/model'
tokenizer_path_v2 = '/content/drive/MyDrive/artisse_models/v2/tokenizer'
model_v2 = AutoModelForCausalLM.from_pretrained(model_path_v2)
tokenizer_v2 = AutoTokenizer.from_pretrained(tokenizer_path_v2)


In [None]:
# Load the part-of-speech tagging model used to filter words (e.g. verbs)
# out of generated prompts.
# Model card:
# https://huggingface.co/QCRI/bert-base-multilingual-cased-pos-english?text=My+name+is+Sarah+and+I+live+in+London
model_pos_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer_pos = AutoTokenizer.from_pretrained(model_pos_name)
model_pos = AutoModelForTokenClassification.from_pretrained(model_pos_name)
# The tagger is bound to the module-level name `pipeline`; prompt_editor()
# calls it by that name, so do not rename it without updating that function.
pipeline = TokenClassificationPipeline(model=model_pos, tokenizer=tokenizer_pos)

In [None]:

def cleanup_sentence(sentence):
    """
    Cleans up a sentence by removing leading/trailing spaces, capitalizing the first letter,
    fixing word tokenization issues, and removing extra spaces before punctuation marks.

    Only the first character is upper-cased; the case of every other character is
    preserved, so proper nouns such as "Rome" stay capitalized.

    Parameters:
    - sentence: A string representing the sentence to be cleaned up.

    Return Value:
    - cleaned_sentence: A string containing the cleaned-up sentence
      (an empty string stays empty).
    """

    # Remove leading/trailing spaces
    sentence = sentence.strip()

    # Capitalize the first letter only. str.capitalize() is deliberately NOT
    # used: it also lower-cases the remainder of the string, which would turn
    # "a café in Rome" into "A café in rome". Slicing keeps this safe for "".
    sentence = sentence[:1].upper() + sentence[1:]

    # Fix word tokenization issues:
    # - " ##" marks a WordPiece continuation, so glue it onto the previous piece
    # - " ' " is an apostrophe that was split into its own token
    sentence = sentence.replace(" ##", "").replace(" ' ", "'")

    # Remove extra spaces before punctuation marks
    sentence = re.sub(r'\s*([.,?!:])', r'\1', sentence)

    return sentence


def reconstruct_sentence(orig_sentence, pos, excluded_tags=('VRB', 'VBG', 'RB')):
    """
    Reconstructs a sentence from tagged tokens, dropping every token whose part-of-speech
    tag is in excluded_tags. The remaining tokens are joined with spaces and cleaned up
    using the cleanup_sentence() function.

    Parameters:
    - orig_sentence: An array of the original sentence with punctuation still connected to words.
      Currently unused; kept so existing callers keep working.
    - pos: A list of dictionaries, one per token, each with at least a 'word' key (the token
      text) and an 'entity' key (its part-of-speech tag).
    - excluded_tags: Tags whose tokens are removed. Defaults to the tags this function has
      always filtered on.

    Return Value:
    - reconstructed_text: A string containing the reconstructed and cleaned-up sentence.
    """

    # NOTE(review): 'VRB' does not look like a Penn Treebank tag (verb tags there
    # are VB, VBD, VBG, VBN, VBP, VBZ), so it probably never matches and only
    # 'VBG' and 'RB' tokens are actually removed. Confirm against the tagger's
    # label set before widening the filter.

    # Read the token text from the entry itself. The previous lookup
    # pos[word_info['index'] - 1] was only correct when each entry's list
    # position equalled its tokenizer index minus one; if the tagger omitted
    # any token it picked the wrong word or raised IndexError.
    filtered_words = [
        word_info['word']
        for word_info in pos
        if word_info['entity'] not in excluded_tags
    ]

    reconstructed_text = cleanup_sentence(' '.join(filtered_words))

    return reconstructed_text


def prompt_editor(original_prompt):
    """
    Runs a prompt through the part-of-speech tagger and rebuilds it without the filtered words.

    Parameters:
    - original_prompt: A string holding the prompt to be edited.

    Return Value:
    - edited_prompt: The string returned by reconstruct_sentence() for the tagged prompt.
    """

    # Tag every token; `pipeline` is the module-level token-classification pipeline.
    tagged_tokens = pipeline(original_prompt)

    # Whitespace-split view of the prompt, passed along as the first argument.
    space_split_words = original_prompt.split(' ')

    edited_prompt = reconstruct_sentence(space_split_words, tagged_tokens)
    return edited_prompt

# prompt_editor("A stylish lady in a sleek business suit, confidently leading a meeting in a modern office space in Asia.")

In [None]:
# Example prompt with deliberate misspellings (not used below; kept for experiments).
prompt1 = "A petite lady in a vibrant floral dress, lafing heartily while enjoyin a cup of cofee in a cuaint café in Rome."
# Alternative test prompts, currently disabled:
# poorly_written_prompt= "A woman in a chef's apron, preparing a meal in a sunlit kitchen."
# poorly_written_prompt = "A petite lady in a vibrant floral dress, lafing heartily while enjoyin a cup of cofee in a cuaint café in Rome."

# poorly_written_prompt= "A woman in a ski outfit, drinking hot cocoa, and holding hands with her mini dog"
# poorly_written_prompt= "A man in a ski outfit, drinking hot cocoa, and with puffs of snow on his hair"
# Prompt actually fed to the model in the following cells.
prompt2 = "a man in a garden, sunshine on his face, eating an almond"
# Tokenize prompt2 with the v1 tokenizer into a PyTorch tensor ("pt") of token ids.
# (The attention mask is built in the next cell.)

encoded_prompt = tokenizer_v1.encode(prompt2,  return_tensors="pt")
# Display the encoded tensor as the cell output.
encoded_prompt

In [None]:
# All-ones mask with the same shape and device as the encoded prompt:
# every prompt token is marked as attended to.
attention_mask = torch.ones(encoded_prompt.shape, dtype=torch.long, device=encoded_prompt.device)


# Generate a single continuation with the v1 model. No sampling arguments are
# passed here, so the model's default decoding settings apply.
output = model_v1.generate(encoded_prompt, max_length=50, attention_mask=attention_mask, num_return_sequences=1)
# Decode the first (only) returned sequence back to text, dropping special tokens.
improved_prompt = tokenizer_v1.decode(output[0], skip_special_tokens=True)
print("Improved prompt:", improved_prompt)

In [None]:
# Sampling with top-k and top-p (nucleus) filtering combined.
# Set the seed to reproduce results. Feel free to change the seed to get different results.
# (The original cell described a seed but never set one, so re-runs were not reproducible.)
torch.manual_seed(0)

# All-ones mask: every token of the encoded prompt is attended to.
attention_mask = torch.ones(encoded_prompt.shape, dtype=torch.long, device=encoded_prompt.device)

# top_k = 50, top_p = 0.98 and num_return_sequences = 3
sample_outputs = model_v1.generate(
    encoded_prompt,
    do_sample=True,
    max_length=40,
    top_k=50,
    attention_mask=attention_mask,
    top_p=0.98,
    num_return_sequences=3
)

# Print each sampled continuation, decoded back to text without special tokens.
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer_v1.decode(sample_output, skip_special_tokens=True)))


In [None]:
# Sampling with top-k and top-p (nucleus) filtering combined.
# No seed is set here on purpose: each call draws a fresh sample.
def generate_prompt(original_prompt, model, tokenizer, max_length=60, top_k=50, top_p=0.95):
    """
    Completes a prompt by sampling one continuation from a causal language model.

    Args:
        original_prompt (str): The text to continue.
        model: Object exposing ``generate(input_ids, **kwargs)`` (e.g. a causal LM).
        tokenizer: Object exposing ``encode(text, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_length (int): Passed to ``generate`` as ``max_length``. Defaults to 60.
        top_k (int): Passed to ``generate`` as ``top_k``. Defaults to 50.
        top_p (float): Passed to ``generate`` as ``top_p``. Defaults to 0.95.

    Returns:
        str: The decoded text of the sampled sequence, with special tokens removed.
    """
    encoded_prompt = tokenizer.encode(original_prompt, return_tensors="pt")

    # All-ones mask: every token of the single, unpadded prompt is attended to.
    attention_mask = torch.ones(encoded_prompt.shape, dtype=torch.long, device=encoded_prompt.device)

    # Only one sequence is ever returned to the caller, so only one is sampled.
    # The previous code generated three and threw two away, tripling the
    # generation cost of every request.
    sample_outputs = model.generate(
        encoded_prompt,
        do_sample=True,
        max_length=max_length,
        top_k=top_k,
        attention_mask=attention_mask,
        top_p=top_p,
        num_return_sequences=1
    )

    return tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

# test
# generate_prompt("a lady")


## Applet
Scroll to the bottom of the notebook. In the applet that appears, type in a prompt, press "Complete Prompt", and read the completed and edited prompt.

In [None]:
def initiate_model_gpt2(original_prompt):
    """
    Completes a prompt with the v1 model/tokenizer pair, then post-edits the result.

    Args:
        original_prompt (str): The prompt text to complete.

    Returns:
        str: The completed prompt after it has been passed through prompt_editor().

    """
    # Generate with the v1 checkpoint, then hand the raw completion to the editor.
    return prompt_editor(generate_prompt(original_prompt, model_v1, tokenizer_v1))

def initiate_model_neo(original_prompt):
    """
    Completes a prompt with the v2 model/tokenizer pair, then post-edits the result.

    Args:
        original_prompt (str): The prompt text to complete.

    Returns:
        str: The completed prompt after it has been passed through prompt_editor().

    """
    # Generate with the v2 checkpoint, then hand the raw completion to the editor.
    return prompt_editor(generate_prompt(original_prompt, model_v2, tokenizer_v2))

# def add_example_to_input_gpt2(example):
#     text_input_gpt2.value += example


# Build the Gradio applet: one tab per fine-tuned model, each with an input box,
# an output box and a button that runs the matching initiate_model_* function.
with gr.Blocks() as demo:
    gr.Markdown("Complete your prompt using a custom trained model (verbs will automatically be removed)")

    # Collapsible panel with example prompts.
    # NOTE: the name `accordion` is rebound to the Markdown component inside the block.
    with gr.Accordion("Examples of prompts") as accordion:
        accordion = gr.Markdown("A focused lady in a red bodysuit, laughing heartily while enjoying a cup of coffee in a quaint café in Rome.<br>A man in a stylish tuxedo, strolling through a tranquil park in Chengdu.")

    # Tab wired to the v1 model (see initiate_model_gpt2).
    with gr.Tab("initiate fine-tuned GPT2"):
        text_input_gpt2 = gr.Textbox(label="Original Prompt", placeholder="Write a prompt for image generation")
        text_output_gpt2 = gr.Textbox(label="Generated Prompt with fine-tuned GPT2")
        text_button2 = gr.Button("Complete Prompt")
    # Tab wired to the v2 model (see initiate_model_neo).
    with gr.Tab("initiate fine-tuned GPT Neo"):
        text_input_neo = gr.Textbox(label="Original Prompt", placeholder="Write a prompt for image generation")
        text_output_neo = gr.Textbox(label="Generated Prompt with fine-tuned GPT Neo")
        text_button3 = gr.Button("Complete Prompt")


    # Clicking a button sends its tab's input text to the handler and writes
    # the returned string into that tab's output box.
    text_button2.click(initiate_model_gpt2, inputs=text_input_gpt2, outputs=text_output_gpt2)
    text_button3.click(initiate_model_neo, inputs=text_input_neo, outputs=text_output_neo)


demo.launch(debug=False) # set debug to True to show errors in gradio


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

