In [None]:
!pip install pandas numpy

In [None]:
from pathlib import Path

DATA_PATH = Path("./fixed_data")

In [None]:
adjectives_file = DATA_PATH / "adjectives.txt"
authors_file = DATA_PATH / "authors.txt"
books_file = DATA_PATH / "books.txt"
business_documents_file = DATA_PATH / "business_documents.txt"
movies_file = DATA_PATH / "movies.txt"
plays_file = DATA_PATH / "plays.txt"
writting_style_file = DATA_PATH / "writting_style.txt"
    
def lower_all(texts):
    return [text.lower() for text in texts]

def strip_all(texts):
    return [text.strip() for text in texts]

def read_lines(file):
    with open(file, "r") as f:
        return strip_all(f.readlines())

In [None]:
from enum import Enum

class ObjectiveType(Enum):
    AUTHOR = 0
    BOOK = 1
    DOC = 2
    MOVIE = 3
    PLAY = 4
    STYLE = 5
    ADJECTIVE = 6

class ObjectiveElement():
    # type = authors, books, bu
    def __init__(self, type: ObjectiveType, value: str):
        self.links = []

        self.type = type
        self.value = value

    def create_prompt(self):
        templ = {
            ObjectiveType.AUTHOR: "Rewrite this text but do it using the writing style of {}",
            ObjectiveType.BOOK: "Rewrite this text but do it using the tone of {}",
            ObjectiveType.DOC: "Rewrite this text as if it was a {} document",
            ObjectiveType.MOVIE: "Rewrite this text but do it using the writing style of the movie {}",
            ObjectiveType.PLAY: "Rewrite this text but do it using the writing style of the play {}",
            ObjectiveType.STYLE: "Rewrite this text as if it was written as a {}",
            ObjectiveType.ADJECTIVE: "Rewrite this text but do it using {} tone",
        }
        return templ[self.type].format(self.value)
    
    def explain_prompt(self):
        templ = {
            ObjectiveType.AUTHOR: "Describe the writing style of {} in 40 words",
            ObjectiveType.BOOK: "Describe the writing style of the book \"{}\" in 40 words",
            ObjectiveType.DOC: "Describe what is the \"{}\" document in 40 words",
            ObjectiveType.MOVIE: "Describe the tone, style, topic of the movie \"{}\" in 40 words",
            ObjectiveType.PLAY: "Describe the tone, style, topic of the play \"{}\" in 40 words",
            ObjectiveType.STYLE: "Describe the writting style \"{}\" in 40 words",
        }
        return templ.get(self.type, "").format(self.value)


In [None]:
adjectives = [ObjectiveElement(ObjectiveType.ADJECTIVE, val) for val in lower_all(read_lines(adjectives_file))]
stypes = [ObjectiveElement(ObjectiveType.STYLE, val) for val in lower_all(read_lines(writting_style_file))]

def parse_author(lines):
    return [line.split("(")[0].strip() for line in lines]

authors = [ObjectiveElement(ObjectiveType.AUTHOR, val) for val in parse_author(read_lines(authors_file))]
docs = [ObjectiveElement(ObjectiveType.DOC, val) for val in lower_all(read_lines(business_documents_file))]

def parse_play(lines):
    return [line.split("by")[0].strip().replace('"', "") for line in lines]

plays = [ObjectiveElement(ObjectiveType.PLAY, val) for val in parse_play(read_lines(plays_file))]

def parse_movie(lines):
    return [line.split("(")[0].strip() for line in lines]

movies = [ObjectiveElement(ObjectiveType.MOVIE, val) for val in parse_movie(read_lines(movies_file))]

def parse_book(lines):
    return [line.split("by")[0].strip().replace('"', "") for line in lines]

books = [ObjectiveElement(ObjectiveType.BOOK, val) for val in parse_book(read_lines(books_file))]

all_elements = adjectives + authors + books + docs + movies + plays + stypes
print(all_elements.__len__())
print(all_elements[-1].create_prompt(), all_elements[-1].explain_prompt())

find text to infer

In [None]:
import pandas as pd
import json

df = pd.read_csv("./data/0401/150_suppl_original_text.csv")

def count_words(text):
    return len(text.split())

df = df[df['original_text'].apply(lambda x: count_words(x) > 50 and count_words(x) < 100 and "[Your Name]" not in x)]
all_texts = df['original_text'].tolist()

def calculate_diversity(text):
    words = text.split()
    unique_words = set(words)
    diversity_score = len(unique_words) / len(words)
    return diversity_score

diversity_scores = [(text, calculate_diversity(text)) for text in all_texts]

# Sort by diversity score
diversity_scores.sort(key=lambda x: x[1], reverse=True)

# Choose the top N texts
N = 6
diverse_texts = diversity_scores[:N]

jsondata = [diverse_texts[i][0] for i in range(N)]
print(json.dumps(jsondata, indent=2))

generate core data file

In [None]:
df_core = pd.DataFrame({
    "type": [a.type.name for a in all_elements],
    "objective": [a.value for a in all_elements],
    "rewrite_prompt": [a.create_prompt() for a in all_elements],
    "explain_prompt": [a.explain_prompt() for a in all_elements],
})

df_core.to_csv("core_objectives.csv", index=False)

# Ask gemma to build dataset

In [None]:
!pip install bitsandbytes accelerate transformers

import os

os.environ["HF_TOKEN"] = "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"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", quantization_config=quantization_config)

In [None]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

def ask_gemma(input_text, max_new_tokens=100):
    input_ids = tokenizer(input_text, padding=True, return_tensors="pt")

    outputs = model.generate(**input_ids, max_new_tokens=max_new_tokens)
    results = [
        tokenizer.decode(outputs[i][len(input_ids[i]):], skip_special_tokens=True)
        for i in range(len(input_text))
    ]
    if isinstance(input_text, list):
        return results
    return results[0]

In [None]:
import tqdm
import numpy as np
import pandas as pd

# load data first
df_core = pd.read_csv("core_objectives.csv", keep_default_na=False)
df_core = df_core.iloc[:1]

jsondata = [
  "Georgette, a loving mother, always puts her family's needs first. From waking up early to prepare breakfast to tucking her children into bed at night, she is the heart of their home. Her warm hugs and encouraging words make every day brighter. Georgette's unwavering love and dedication make her an incredible mother.",
  "Hey Mari! Just wanted to share something funny that happened today. So, I was telling my friends about that crazy party we went to last weekend, and you know how I tend to exaggerate things? Well, I may have added a little embellishment to the story. They were cracking up, but I couldn't help but laugh at myself too. Anyway, hope you're having a great day! Let's catch up soon.",
  "Jerry, a talented musician, nervously stepped onto the stage. As he began to play, his fingers stumbled, and the melody turned into a jumbled mess. The audience fell silent, disappointment filling the air. But Jerry didn't let this failure define him; he practiced harder and returned to the stage stronger than ever.",
  "Ladies and gentlemen, thank you for joining us today. I stand before you to shed light on the fascinating puffin. Found in the North Atlantic, these adorable birds are known for their colorful beaks and exceptional diving skills. Let's celebrate the puffin's resilience and conservation efforts. Together, we can ensure a bright future for these magnificent creatures. Thank you.",
  "Dear Mr. Johnson,\\n\\nI hope this email finds you well. Attached is the invoice for the recent services provided by Gale's Plumbing. The total amount due is $250, which includes the cost of labor and materials. Please review the invoice and kindly make the payment within 14 days. If you have any questions or concerns, feel free to reach out. Thank you for your business!\\n\\nBest regards,\\nEmily Smith\\nGale's Plumbing",
  "Ladies and gentlemen, thank you for being here today. I want to address a common issue we all face: chafing. Whether it's during exercise or everyday activities, chafing can be uncomfortable and irritating. But fear not! With the help of our new product, \"ChafeAway,\" you can bid farewell to chafing forever. Say goodbye to discomfort and hello to smooth, irritation-free skin. Try \"ChafeAway\" today and experience the difference for yourself."
]

ls = []
for i, row in tqdm.tqdm(df_core.iterrows(), total=len(df_core)):
    explain_prompt = row["explain_prompt"]
    # if explain_prompt == float('nan'):
    #     explain_prompt = None
        
    # print(f"explain {explain_prompt}, {type(explain_prompt)}")
    ls_prompt = [] if not explain_prompt else [ explain_prompt ]
    ls_prompt += [
        '{}: """{}"""'.format(row["rewrite_prompt"], text) for text in jsondata
    ]
    # print(ls_prompt)
    batch_outputs = ask_gemma(ls_prompt, max_new_tokens=200)
    desc, ls_rewritten = ("", batch_outputs) if not explain_prompt else (batch_outputs[0], batch_outputs[1:])
    
    d = {
        "type": row["type"],
        "objective": row["objective"],
        "description": desc,
    }
    d.update({
        f"rewritten_{i}": ls_rewritten[i] for i in range(len(ls_rewritten))
    })
    ls.append(d)

df_final = pd.DataFrame(ls)
df_final.to_csv("final_objectives.csv", index=False)

In [None]:
df_final.iloc[0]['rewritten_1']

# Analyze data

In [11]:
import pandas as pd
import re

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

df = pd.read_csv("final_objectives.csv", keep_default_na=False)

df = df.drop_duplicates(subset='objective')

# df[df['type'] == 'AUTHOR'].head()

max_length_view = None


def lower_first_char(text):
    return text[0].lower() + text[1:]


def clean_rewrite(text):
    text = text.strip()

    # remove "Sure ..."
    p = r'^\**((Sure|).*[Hh]ere|[Rr]ewritten).+\:\**'
    text = re.sub(p, '', text)

    ## remove someone's Text/Style
    p = r'^\**.+([Tt]ext|[Ss]tyle|[Vv]ersion|[Rr]ewrite)\:\**'
    text = re.sub(p, '', text)

    text = text.strip()
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]

    if max_length_view:
        text = text[:max_length_view]

    return text


def clean_objective(text):
    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return text


def clean_description(row):
    row_type = row['type']
    text = row['description']
    if not text:
        return row

    _prefix = {
        # 'AUTHOR': 'The author\'s writing style is ',
        # 'BOOK': 'The book\'s writing style is ',
        'DOC': 'The document is ',
        'MOVIE': 'The movie\'s style is ',
        'PLAY': 'The play\'s style is ',
        'STYLE': 'The writing style is ',
        'ADJECTIVE': 'The tone is ',
    }
    prefix = _prefix.get(row['type'], '')

    text = str(text).strip()
    if text.startswith('or less.\n\n'):
        text = text[len('or less.\n\n'):]

    if row_type == 'AUTHOR':
        parts = re.split(r"('s writing style is|'s writing is)", text)
        if len(parts) > 2:
            text = parts[2]
        text = f"The author's writing style is {lower_first_char(text)}"
    elif row_type == 'BOOK':
        text = re.sub(r'^([A-Z].+)+(\'s|s\') ".+"', '', text)
        # print(text)
        text = re.sub(r'^(The writing style of|The book) ".+"', '', text)
        text = text.replace(row['objective'], '')
        parts = re.split(r"(is characterized by|writing style is)", text)
        # print(parts)
        if len(parts) > 2:
            text = text[len(parts[0]):]
        # if text.startswith('is ') or text.startswith('has '):
        text = f"The book {lower_first_char(text)}"
    elif row_type == 'DOC':
        text = re.sub(r'^([Tt]he|[Aa]|[Aa]n) "*' + re.escape(row['objective']) + '"* (document|)', 'This ', text, flags=re.IGNORECASE)
    elif row_type == 'MOVIE':
        text = re.sub(r'^(The movie|The|)\s*"*' + row['objective'] +'"*', 'This movie', text)
        text = re.sub(r'^([A-Z].+)+\s*is ".+"', 'This movie is', text)
        # fix case
        text = text.replace("Star Wars: A New Hope", "This movie")
    elif row_type == 'PLAY':
        text = re.sub(r'^(The play|The|)\s*"*' + row['objective'] +'"*', 'This play', text)
    elif row_type == 'STYLE':
        text = re.sub(r'^(A |)' + row['objective'] +'( writing|)', 'This style is', text, flags=re.IGNORECASE)
    else:
        try:
            text = prefix + lower_first_char(text)
        except:
            print(parts)

    # Remove redundant spaces
    text = ' '.join(text.split())

    if max_length_view:
        text = text[:max_length_view]

    row['description'] = text
    return row

df['objective'] = df['objective'].apply(clean_objective)
df = df.apply(clean_description, axis=1)
for i in range(6):
    df[f"rewritten_{i}"] = df[f"rewritten_{i}"].apply(clean_rewrite)

# df[df['objective'] == "William Shakespeare"][['type', 'objective', 'description']].head(100)

df.to_csv("final_objectives_cleaned.csv", index=False)

# df[df['type'] == 'ADJECTIVE'][['type', 'objective', 'description']].head(400)



In [None]:
# df[df['type'] == 'BOOK'][['type', 'objective', 'description']].head(400)

# clean_description(df[df['objective'] == 'The War of the Worlds'].iloc[0])

# Do embed style

In [3]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

style_model = SentenceTransformer('AnnaWegmann/Style-Embedding')
# embeddings = model.encode(sentences)
# print(embeddings)

In [14]:
import tqdm
import pandas as pd
import numpy as np

df = pd.read_csv("final_objectives_cleaned.csv", keep_default_na=False)

ls = []
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    sentences = [
        row[f'rewritten_{i}'] for i in range(6)
    ]
    # print(sentences)
    embeddings = model.encode(sentences)
    ls.append(embeddings)

array = np.array(ls)

# Save the array to disk
np.save('style_embedding.npy', array)

100%|██████████| 834/834 [01:29<00:00,  9.36it/s]


test style embedding

In [1]:
import numpy as np

style_embedding = np.load('style_embedding.npy')
print(style_embedding.shape)

(834, 6, 768)


In [9]:
text = """O Georgette, a heart of gold,
Whose love for family knows no hold.
With gentle grace, she takes her stand,
A guiding light, hand in hand.

From dawn's first light to night's embrace,
She nurtures her brood with love and grace.
Her breakfast feast, a fragrant brew,
And lullaby, a sweet evening dew.

Her hugs, warm as the sun's embrace,
And words of wisdom, a guiding force.
In every moment, she finds delight,
To be the heart, the guiding light.

Her unwavering love, a gift so rare,
Makes her an exceptional mother beyond compare.
So let us praise her, this gentle soul,
Whose heart of gold will forever unfold."""

embeddings = style_model.encode([
    text
])


from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assuming 'data' is your array of embeddings
cosine_similarities = cosine_similarity(embeddings, style_embedding[:, 0, :])

# Get the top K indices
K = 5
top_k_indices = np.argsort(cosine_similarities[0])[-K:]

print(top_k_indices)
print(cosine_similarities[0][top_k_indices])

import pandas as pd
data_df = pd.read_csv("final_objectives_cleaned.csv", keep_default_na=False)
data_df.iloc[top_k_indices]

[808 711 253 318 198]
[0.977219   0.98530954 0.9908514  0.9960144  0.99999994]


Unnamed: 0,type,objective,description,rewritten_0,rewritten_1,rewritten_2,rewritten_3,rewritten_4,rewritten_5
808,STYLE,sea shanty,This style is is a type of folk song that uses...,(Verse 1)\nThere once was a mother named Georg...,"Heave ho, me hearties, listen to me tale,\nOf ...","(Verse 1)\nA tale of a musician named Jerry,\n...","A tale I have to tell, me hearties, so lend me...","A tale I tell of services bold,\nOf pipes and ...","(Verse 1)\nLadies and gentlemen, gather 'round..."
711,STYLE,satirical poetry,This style is uses humor to criticize and expo...,"Oh, Georgette, the queen of breakfast bliss,\n...","Oh, Mari, my dear, I've a tale to spin,\nOf pa...","Oh, the tale of Jerry, a musician so bold,\nWh...","Ladies and gentlemen, gather 'round,\nTo hear ...","Oh, Mr. Johnson, a penny pincher so sly,\nA vi...","Ladies and gentlemen, gather 'round,\nTo hear ..."
253,AUTHOR,William Blake,The author's writing style is characterized by...,"Oh, Georgette, radiant as the dawn,\nA heart t...","O Mari, my dear, I write to thee with joy,\nA ...","O, Jerry, the musician of grace,\nWith trembli...","O Ladies and Gentlemen, behold the puffin's gr...","O Mr. Johnson, doth the inkwell sing of thee,\...","O Ladies and Gentlemen, behold the plight of m..."
318,AUTHOR,Alexander Pope,The author's writing style is characterized by...,"O Georgette, a woman of boundless grace,\nWhos...","O Mari, my dear, I pen to thee with joy,\nTo s...","O Jerry, a musician of such grace,\nWith tremb...","Fair Ladies and Gentlemen, I extend my gratitu...","O Mr. Johnson, esteemed client,\n\nI pen this ...","Fair Ladies and Gentlemen, I extend my gratitu..."
198,AUTHOR,William Shakespeare,The author's writing style is characterized by...,"O Georgette, a heart of gold,\nWhose love for ...","O Mari, fair maiden, I pen to thee with a tale...","O Jerry, a musician of exquisite talent, doth ...","Fair ladies and gentlefolk, I extend my gratit...","O, esteemed Mr. Johnson, doth this email find ...","Fair ladies and gentlefolk, I extend my gratit..."


In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

a = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
    [[13, 14, 15], [16, 17, 18]],
])

target = [1, 2, 3]

scores = cosine_similarity([target], a.reshape(-1, len(target))).reshape(-1, 2).mean(axis=1)
print(scores)
scores[:-2:-1]

[0.98731592 0.95533513 0.94457103]


array([0.94457103])