In [1]:
import warnings
from tqdm import tqdm
import torch
import pandas as pd
from itertools import product
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from random import randint
from transformers import AutoModelForCausalLM, AutoTokenizer
warnings.filterwarnings("ignore")




# Style transfer

In [2]:
data = pd.read_pickle('data/top_auth_final.pkl')

In [3]:
len(data)

5000

### Prompt engineering

We try 3 different kind of prompts. First the "standard prompt" which was also used very similar in my practical work. This consist of first instructions then more information. (Pre-ins) 
The second type is proposed in [this](https://aclanthology.org/2024.findings-acl.693.pdf) paper called post-ins. Where the information is given first and the instruction is given at the end. (hence post)
Lastly, we try Augmented zero-shot learning proposed in [this](https://aclanthology.org/2022.acl-short.94.pdf) paper, which gives examples how similar tasks to the given task are solved.

In [3]:
class Prompt:
    def __init__(self, start, stop, prompt_type:str="standard", sent:bool=False, emot:bool=False, rating:bool=False, cont:bool=False, style:bool=False, summary:bool=False):
        self.all_prompts = []  
        self.all_og = []
        self.prompt = None
        for elem in range(start, stop):
            self.row = data.iloc[elem]
            self.base = "Use the specifications to generate a movie review with the title:'" + self.row["title"] + "'. Don't directly reference the title itself in the body of the review."
            
            self.sentiment = self.get_sent() if sent else None
            self.emotion = self.get_emot() if emot else None
            self.rating = self.get_rating() if rating else None
            self.content = self.get_cont() if cont else None
            self.style = self.get_style() if style else None
            self.summary = self.get_summary() if summary else None

            if prompt_type == "standard" or prompt_type == "postIn":
                self.generate(prompt_type)
            elif prompt_type == "AZSL": # has its own generate function
                self.generate(elem)
            self.all_og.append(self.row["text"])
            
    def get_sent(self):
        sentiment = self.row["sentiment"]
        
        if sentiment == "NEG":
            return " The review should be of negative sentiment."
        elif sentiment == "NEU":
            return " The review should be of neutral sentiment."
        else:
            return " The review should be of positive sentiment."
            
            
    def get_emot(self):
        emotion = self.row["emotion"]
        
        if emotion != "others": # others is of no use in the prompt
            return " The review should convey the " + emotion + " emotion."
        else:
            # or add nothing?
            return " The review should convey no specific emotion."
    
    def get_rating(self):
        return " The review rates the movie as " + str(self.row["rating"]) + " out of 10."
    
    def get_cont(self):
        content = self.row["NER"].entities
        type_dict = {"PER": "person", "TITLE": "title", "LOC":"location", "ORG":"organization", "OTHER":"other", "EVENT": "event", "TIME":"time", "DATE":"date", "PROD":"product",
                     "GROUP":"group"}
        if len(content) == 0: # not every review will have entities
            return 
        else:
            output = " The review should include the following entities: "
            for idx, entity in enumerate(content):
                if idx == 0:
                    output += "The " + type_dict[entity["type"]] + " " + entity["text"] + ", "
                elif idx != len(content)-1:
                    output += "the " + type_dict[entity["type"]] + " " + entity["text"] + ", "
                else:
                    output += "the " + type_dict[entity["type"]] + " " + entity["text"] + ";"
            return output
    
    def get_style(self):
        return " The review should be " + str(self.row["length"]) + " words long."
    
    def get_summary(self):
        return "The review should be based on the following summary: " + str(self.row["summary"]) + '"'
    
    def generate(self, prompt_type:str):
        if prompt_type == "standard": # generate with all that was given
            prompt = self.base
            attributes = [self.sentiment, self.emotion, self.rating, self.content, self.style, self.summary]
            for attr in attributes:
                if attr:
                    prompt += attr
            self.prompt = prompt
            self.all_prompts.append(self.prompt)
        elif prompt_type == "postIn":
            prompt = ""
            attributes = [self.sentiment, self.emotion, self.rating, self.content, self.style, self.summary]
            for attr in attributes:
                if attr:
                    prompt += attr
            # add instructions later
            self.prompt = prompt + self.base
            self.all_prompts.append(self.prompt)
        else:
            pass
    def print_all(self):
        print(self.all_prompts)
    
    def print_one(self, idx):
        print(self.all_prompts[idx])

In [4]:
class AugmentedZeroShot(Prompt):
    def __init__(self, start, stop):
        super().__init__(start, stop, prompt_type="AZSL", sent=True, summary=True)
        
    def generate(self, cur_elem):
        # start, stop will be between 0-4999, //1000 we get which user the current batch is for
        # we want a different SENT then the current prompt
        sent_text1  = self.sentiment
        rnd_rev = 0
        sent_dict = {"POS":"positive", "NEG":"negative", "NEU":"neutral"}
        while sent_text1 == self.sentiment:
            user = cur_elem//1000
            rnd_rev = randint(1000*user, 1000*(user+1)-1)
            # will not choose same user as sent is same
            self.row = data.iloc[rnd_rev]
            sent_text1 = self.get_sent()
        sum_text1 = self.get_summary()
        org_text1 = data["text"].iloc[rnd_rev]

        prompt = f"""
        Here are some instructions: {sum_text1}, {sent_text1}
        Here is a {sent_dict[data["sentiment"].iloc[rnd_rev]]} movie review that fits these instructions: {{{org_text1}}}

        Here are some instructions: {self.summary, self.sentiment}
        Now write a {sent_dict[data["sentiment"].iloc[cur_elem]]} movie review that fits these instructions: {{
        """
        self.prompt = prompt
        self.all_prompts.append(self.prompt)
        

In [4]:
test_prompt = Prompt(0, 1, sent=True, emot=True, rating=True, cont=True, style=True)
test_prompt.print_all()

["Use the specifications to generate a movie review with the title:'Somebody call PETA !'. Don't directly reference the title itself in the body of the review. The review should be of negative sentiment. The review should convey the disgust emotion. The review rates the movie as 1.0 out of 10. The review should be 283 words long."]


In [6]:
test_prompt = Prompt(4, 5,  emot=True, summary=True)
test_prompt.print_one(0)

Use the specifications to generate a movie review with the title:'One of my favorite John Hughes films of all time !'. The review should convey the disgust emotion.The review should be based on the following summary: I think this is one of John Hughes ' best films . Jean Louisa Kelly plays the oldest daughter , Tia , with a chip on her shoulder . Laurie Metcalf plays the neighbor who sets her sights on the not so quite bachelor Uncle Buck ."


In [7]:
test_prompt = Prompt(4, 5, "postIn", emot=True, summary=True)
test_prompt.print_one(0)

 The review should convey the disgust emotion.The review should be based on the following summary: I think this is one of John Hughes ' best films . Jean Louisa Kelly plays the oldest daughter , Tia , with a chip on her shoulder . Laurie Metcalf plays the neighbor who sets her sights on the not so quite bachelor Uncle Buck ."Use the specifications to generate a movie review with the title:'One of my favorite John Hughes films of all time !'.


In [8]:
AugmentedZeroShot(0,1).print_all()

['\n        Here are some instructions: The review should be based on the following summary: Juliette Binoche plays a single mother who comes to a small village in La Belle France. Dame Judi Dench plays the 70 year old widow full of life and she earns an academy award nomination no matter what she does in the film industry. Alfred Molina plays the French mayor. Carrie Anne Moss plays Dench\'s daughter.",  The review should be of positive sentiment.\n        Here is a positive movie review that fits these instructions: {Okay , the film has a first rate cast including the wonderful Oscar Winner Juliette Binoche who plays a single mother who comes to a small village in La Belle France where the rules are governed by a small-minded mayor . Anyway , Lena Olin plays a battered wife who is taken in by Binoche\'s character . Dame Judi Dench plays a 70 year old widow full of life and she earns an academy award nomination no matter what she does in the film industry . Anyway , British Alfred Mol

### Prompt evaluation
Evaluate different prompt options on a small subset

In [5]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def rouge_score(set1, set2):
    scores = scorer.score(set1, set2)
    return scores['rouge1'].fmeasure

def jaccard_similarity(set1, set2):
    set1 = set(set1.split())
    set2 = set(set2.split())
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def overlap_coefficient(set1, set2):
    set1 = set(set1.split())
    set2 = set(set2.split())
    intersection = len(set1.intersection(set2))
    min_size = min(len(set1), len(set2))
    return intersection / min_size

def cosine_similarity_custom(set1, set2):
    emb1 = model.encode(set1)
    emb2 = model.encode(set2)
    sim = cosine_similarity([emb1], [emb2])
    return sim[0][0]

def calculate_averages(scores):
    averages = {metric: sum(values) / len(values) for metric, values in scores.items()}
    return averages

In [6]:
class Generation:
    """
    Data class for ease of access
    """
    def __init__(self, prompt, original, generated, loss, user_id):
        self.prompt = prompt
        self.original = original
        self.generated = generated
        self.loss = loss
        self.user_id = user_id
def format_prompts(tokenizer, system, gen_prompts):
    prompts = []
    for prompt in gen_prompts:
        if system:
            message_format = [
                {
                    "role": "system",
                    "content": "You are a writer writing movie reviews. Don't include a title in the review.",
                },
                {"role": "user", "content": prompt},
            ]
        else:
            message_format = [
                {"role": "user", "content": "You are a writer writing movie reviews. Don't include a title in the review." + prompt},
            ]
        
        prompts.append(tokenizer.apply_chat_template(message_format, tokenize=False, add_generation_prompt=True))
        
    return prompts

def generate_reviews(model, tokenizer, split, param_dict, nr_reviews, system):
    """
    
    :param model: 
    :param split: 
    :param param_dict: 
    :param nr_reviews: reviews generated per user
    :param system: 
    :return: 
    """
    generations = []
    for idx in tqdm(range(5), desc="Generating Reviews per user"):
        if  param_dict['prompt_type'] == 'standard' or param_dict['prompt_type'] == 'postIn':
            prompt_gen = Prompt(1000*idx, 1000*idx  + nr_reviews, **param_dict)
        elif param_dict['prompt_type'] == 'AZSL':
            prompt_gen = AugmentedZeroShot(1000*idx, 1000*idx  + nr_reviews)
    
        formated_prompts = format_prompts(tokenizer, system, prompt_gen.all_prompts)
        
        for prompt in range(len(formated_prompts)):
            
            inputs = tokenizer(formated_prompts[prompt], return_tensors="pt").to("cuda")
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=500, do_sample=True)
            
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            with torch.no_grad():
                loss_outputs = model(**inputs, labels=inputs["input_ids"])
                loss = loss_outputs.loss.item() 

            extracted_text = generated_text.split(f"{split}")[1]
            
            gen = Generation(prompt_gen.all_prompts[prompt], prompt_gen.all_og[prompt], extracted_text, loss, idx)
            generations.append(gen)
            
    return generations


def evaluate_gens(generations, average:bool = True):
    scores = {'rouge': [], 'jaccard': [], 'overlap': [], 'cosine': []}
    for generation in generations:
        org = generation.original
        gen = generation.generated
        scores['rouge'].append(rouge_score(org, gen))
        scores['jaccard'].append(jaccard_similarity(org, gen))
        scores['overlap'].append(overlap_coefficient(org, gen))
        scores['cosine'].append(cosine_similarity_custom(org, gen))
    
    if average:
        avg = calculate_averages(scores)
        print(avg)
        return avg
        
    else:
        return scores
    

def text_evaluation_pipeline(model_id: str,  split:str, nr_reviews:int, param_dict:dict,  token=None, system: bool = True):
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    
    generations = generate_reviews(model, tokenizer, split, param_dict, nr_reviews, system)
    avg = evaluate_gens(generations)
    print("Done!")
    return generations, avg, param_dict


In [7]:
param_grid = {
    'prompt_type': ['standard', 'postIn'],
    'sent': [True, False],
    'emot': [True, False],
    'rating': [True, False],
    'cont': [True, False],
    'style': [True, False],
    'summary': [True, False]
}

keys = param_grid.keys()
combinations = list(product(*param_grid.values()))
print(len(combinations))
combination_dicts = [dict(zip(keys, comb)) for comb in combinations]
combination_dicts[:1]

128


[{'prompt_type': 'standard',
  'sent': True,
  'emot': True,
  'rating': True,
  'cont': True,
  'style': True,
  'summary': True}]

In [59]:
r, g, _ = text_evaluation_pipeline("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "<|assistant|>\n", 1, dict(zip(keys, combinations[2])))

Generating Reviews per user: 100%|██████████| 5/5 [00:27<00:00,  5.46s/it]

{'rouge': 0.32932070023363186, 'jaccard': 0.12584005694665854, 'overlap': 0.29186712076441035, 'cosine': 0.704013991355896}
Done!





In [60]:
r[0].prompt

'Use the specifications to generate a movie review with the title:\'Somebody call PETA !\'. The review should be of negative sentiment. The review should convey the disgust emotion. The review rates the movie as 1.0 out of 10.The review should be based on the following summary: Liberace is a dog and a small one , but he does not want to be paraded around like a show dog ."'

In [61]:
r[0].original

"I caught glimpses of this show which feature a gay male couple and their dog , Liberace . First of all , they come across as the most stereotyped gay couple that I have seen and yet they are real . I couldn't watch them with their dog as they took this tiny lap dog ( I mean the dog weighed under 10 pounds and was not fully developed ) and pushed into these contests . I thought Showbiz Moms and Dads were ridiculous but this young gay couple are obviously immature and neglectful of Liberace . I understand that some people want to place their animals for show but don't put Liberace on for show like a doll . Liberace is a dog and a small one . I had a rabbit who weighed more than this dog and I wouldn't put it in contests . Liberace isn't even attractive . Look I know people love their animals , I still can't get over the loss of my rabbit but even I have a good sense about what she wanted . I don't think Liberace wants to be paraded around like show dog . He probably just wants to sleep 

In [62]:
r[0].generated

'Insight: If you\'re a writer trying to convey a negative sentiment about a movie, it\'s important to choose a topic that relates to audiences and be descriptive. In this case, this review is about someone\'s favorite pet, Liberace, who does not fit the stereotype of being a show dog. The review will use a disgusted tone and will rate the movie 1.0 out of 10 based on its absurdity and demeaning portrayal of animals.\n\nTITLE: "Somebody call PETA: Liberace, a cat and mouse"\n\nSUMMARY: In this review, we are seeing a movie about Liberace as a cat, which is a completely inappropriate scenario given that cats are highly intelligent and gentle creatures. The movie\'s portrayal of Liberace as a mischievous little black dog, complete with squeaky bones and unpredictable behavior, is nothing short of humiliating. The dog is not depicted as a loyal guardian and a well-behaved pet, but rather as a pet that is meant to be played with and adored by humans. The only question that we can assume fro

In [63]:
r[0].loss

2.8571536540985107

In [None]:
r[0].user_id

In [8]:
def run_evaluation(model_name, prompt, nr_reviews, params,token=None, system=False, output_file=None):
    scores = []
    for idx, comb in tqdm(enumerate(params), total=len(params), desc="Evaluating"):
        print(comb)
        print("Run {} out of {}".format(idx+1, len(params)))
        
        if token:
            _, avg, output_dict = text_evaluation_pipeline(model_name, prompt, nr_reviews, comb, token, system=system)
        else:
            _, avg, output_dict = text_evaluation_pipeline(model_name, prompt, nr_reviews, comb)
        
        scores.append({'scores': avg, 'dict': output_dict})
        print(25*"-")
    
    results = pd.DataFrame(scores)
    if output_file:
        results.to_pickle(output_file)


In [None]:
run_evaluation("stabilityai/stablelm-2-zephyr-1_6b", "<|assistant|>\n", 10, combination_dicts,output_file='data/zephyr_results.pkl')

In [None]:
token=""
run_evaluation("google/gemma-2b-it", "<start_of_turn>model\n", 10 , combination_dicts,token=token, system=False, output_file='data/gemma_results.pkl')

In [None]:
run_evaluation("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "<|assistant|>\n", 10 ,combination_dicts,output_file='data/llama_results.pkl')

In [34]:
r, g, _ = text_evaluation_pipeline("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "<|assistant|>\n", 10, {'prompt_type': 'AZSL'})

Generating Reviews per user: 100%|██████████| 5/5 [03:47<00:00, 45.59s/it]


{'rouge': 0.25972254315031934, 'jaccard': 0.07968139481097804, 'overlap': 0.18540822616127545, 'cosine': 0.309208712130785}
Done!


In [35]:
_, avg, output_dict = text_evaluation_pipeline("stabilityai/stablelm-2-zephyr-1_6b", "<|assistant|>\n", 10, {'prompt_type': 'AZSL'})

Generating Reviews per user: 100%|██████████| 5/5 [04:22<00:00, 52.52s/it]


{'rouge': 0.28158176182139555, 'jaccard': 0.0934156603256676, 'overlap': 0.22259994551241966, 'cosine': 0.36789584673941134}
Done!


In [12]:
token=""
_, avg, output_dict = text_evaluation_pipeline("google/gemma-2b-it", "<start_of_turn>model\n", 10, {'prompt_type': 'AZSL'},token, system=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Reviews per user:  80%|████████  | 4/5 [02:50<00:42, 42.64s/it]

KeyboardInterrupt



In [7]:
def process_results(result_df, name):
    df_scores = pd.json_normalize(result_df['scores'])
    df_combined = pd.concat([result_df['dict'], df_scores], axis=1)
    df_combined.to_pickle(f'data/{name}_results_combined.pkl')
    print(df_scores.describe())
    return df_combined

In [1]:
res_lama = pd.read_pickle('data/llama_results.pkl')
res_gemma =  pd.read_pickle('data/gemma_results.pkl')
res_zephyr =  pd.read_pickle('data/zephyr_results.pkl')

NameError: name 'pd' is not defined

In [12]:
combined_llama = process_results(res_lama, "llama")

            rouge     jaccard     overlap      cosine
count  128.000000  128.000000  128.000000  128.000000
mean     0.288835    0.099309    0.242000    0.514060
std      0.016296    0.009323    0.019666    0.069630
min      0.258362    0.083447    0.204757    0.387405
25%      0.272341    0.090506    0.226027    0.466458
50%      0.289672    0.100032    0.243423    0.528439
75%      0.303725    0.107225    0.257960    0.569139
max      0.317632    0.117593    0.289715    0.615510


In [16]:
combined_gemma = process_results(res_gemma, "gemma")

            rouge     jaccard     overlap      cosine
count  128.000000  128.000000  128.000000  128.000000
mean     0.267880    0.083965    0.201333    0.526148
std      0.025902    0.009448    0.020742    0.076011
min      0.201983    0.062473    0.151082    0.359568
25%      0.252047    0.076648    0.186359    0.466854
50%      0.267331    0.085165    0.203859    0.556817
75%      0.290101    0.091398    0.215795    0.579668
max      0.320379    0.100734    0.249525    0.632725


In [13]:
combined_zephyr = process_results(res_zephyr, "zephyr")

            rouge     jaccard     overlap      cosine
count  128.000000  128.000000  128.000000  128.000000
mean     0.282992    0.093083    0.230876    0.531514
std      0.017026    0.009613    0.024812    0.075213
min      0.250315    0.075015    0.185224    0.387755
25%      0.274842    0.086530    0.213177    0.495578
50%      0.285122    0.093758    0.234215    0.556004
75%      0.296700    0.100660    0.250480    0.583683
max      0.313723    0.109757    0.274205    0.637742


In [35]:
best_params_top10 = {}

In [36]:
results = [combined_llama, combined_gemma, combined_zephyr]
for idx, res in enumerate(["llama", "gemma", "zephyr"]):
    results[idx]['average_score'] = results[idx][["rouge", "jaccard", "overlap", "cosine"]].mean(axis=1)
    top_10_best = results[idx].nlargest(10, 'average_score')
    best_params_top10[res] = top_10_best["dict"].to_list()

In [None]:
run_evaluation("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "<|assistant|>\n", 50, best_params_top10["llama"] ,output_file='data/llama_results_top10.pkl')

In [None]:
token=""
run_evaluation("google/gemma-2b-it", "<start_of_turn>model\n", 50, best_params_top10["gemma"],token=token, system=False, output_file='data/gemma_results_top10.pkl')

In [None]:
run_evaluation("stabilityai/stablelm-2-zephyr-1_6b", "<|assistant|>\n", 50, best_params_top10["zephyr"],output_file='data/zephyr_results_top1ß.pkl')

In [8]:
res_lama = pd.read_pickle('data/llama_results_top10.pkl')
res_gemma =  pd.read_pickle('data/gemma_results_top10.pkl')
res_zephyr =  pd.read_pickle('data/zephyr_results_top10.pkl')
combined_llama = process_results(res_lama, "llama_top10")
combined_gemma = process_results(res_gemma, "gemma_top10")
combined_zephyr = process_results(res_zephyr, "zephyr_top10")

           rouge    jaccard    overlap     cosine
count  10.000000  10.000000  10.000000  10.000000
mean    0.312801   0.111273   0.261894   0.572720
std     0.002740   0.001811   0.004617   0.011259
min     0.308933   0.109356   0.256820   0.553425
25%     0.310928   0.109634   0.257928   0.566089
50%     0.311579   0.110952   0.261433   0.571758
75%     0.315651   0.112594   0.265122   0.575564
max     0.316343   0.114496   0.270093   0.591995
           rouge    jaccard    overlap     cosine
count  10.000000  10.000000  10.000000  10.000000
mean    0.302545   0.097261   0.211452   0.595528
std     0.010731   0.002632   0.008067   0.009509
min     0.285010   0.092914   0.202805   0.580439
25%     0.293550   0.095184   0.205117   0.588632
50%     0.304083   0.097390   0.208121   0.598349
75%     0.310671   0.098896   0.219643   0.602069
max     0.317209   0.101086   0.222638   0.606823
           rouge    jaccard    overlap     cosine
count  10.000000  10.000000  10.000000  10.000000


In [9]:
best_params = {}
results = [combined_llama, combined_gemma, combined_zephyr]
for idx, res in enumerate(["llama", "gemma", "zephyr"]):
    results[idx]['average_score'] = results[idx][["rouge", "jaccard", "overlap", "cosine"]].mean(axis=1)
    top_1_best = results[idx].nlargest(1, 'average_score')
    best_params[res] = top_1_best["dict"].to_list()

In [11]:
best_params

{'llama': [{'prompt_type': 'postIn',
   'sent': False,
   'emot': False,
   'rating': False,
   'cont': True,
   'style': False,
   'summary': True}],
 'gemma': [{'prompt_type': 'standard',
   'sent': True,
   'emot': False,
   'rating': False,
   'cont': True,
   'style': True,
   'summary': True}],
 'zephyr': [{'prompt_type': 'standard',
   'sent': False,
   'emot': False,
   'rating': False,
   'cont': True,
   'style': False,
   'summary': True}]}

### Text generation

For text generation we collect all scores without averaging to gain insight into the performance per user.
Furthermore the loss is collected per generation, to calculate perplexity later on.

In [7]:
def text_generation_pipeline(model_id: str, model_name:str ,split:str, nr_reviews:int, param_dict:dict,  token=None, system: bool = True):
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda", token=token)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=token)
    
    generations = generate_reviews(model, tokenizer, split, param_dict, nr_reviews, system)
    reviews_list = [gen.generated for gen in generations]
    loss_list = [gen.loss for gen in generations]
    id_list = [gen.user_id + 101 for gen in generations]
    reviews = pd.DataFrame({"text": reviews_list})
    reviews["id"] = id_list
    # save as csv
    reviews.to_csv(f"data/reviews_{model_name}.csv", index=False)
    scores = evaluate_gens(generations, False)
    scores["loss"] = loss_list
    results = pd.DataFrame(scores)
   
    results.to_pickle(f"data/{model_name}_5kgen_results.pkl")
    print("Done!")
    

In [16]:
test_avg = text_generation_pipeline("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Testrun", "<|assistant|>\n", 2, best_params["llama"][0])

Generating Reviews per user: 100%|██████████| 5/5 [00:53<00:00, 10.65s/it]

Done!





In [17]:
test_gens = pd.read_csv("data/reviews_Testrun_new.csv")
test_gens

Unnamed: 0,text,id
0,Movie Review: Liberace (Little Dog)\n\nSummari...,101
1,"As a writer of movie reviews, my focus will be...",101
2,The organization IMDb\n\nSacha Guitry\n\nRevie...,102
3,[The opening shot shows a train moving slowly ...,102
4,"Dear movie review enthusiasts,\n\nWe present y...",103
5,"As a hard-bitten writer of movie reviews, I br...",103
6,"Title: ""300""\n\nMajor Entity: The Title\n\nThe...",104
7,Title: The Aviator\n\nOscar (Best Director) No...,104
8,"As a fan of Mister Shaw, I'd like to share my ...",105
9,The Review\n\nBased on the summary mentioned a...,105


In [18]:
test_results = pd.read_pickle("data/Testrun_5kgen_results.pkl")
test_results

Unnamed: 0,rouge,jaccard,overlap,cosine,loss
0,0.335878,0.109929,0.208054,0.540787,3.05055
1,0.35206,0.110714,0.262712,0.637902,2.632026
2,0.267477,0.105263,0.19469,0.665693,3.190504
3,0.22,0.085246,0.236364,0.517712,3.418683
4,0.38069,0.122396,0.283133,0.672919,2.650599
5,0.391785,0.117647,0.215054,0.713778,2.991619
6,0.320074,0.096192,0.251309,0.586557,3.160706
7,0.291777,0.130802,0.389937,0.78477,2.894013
8,0.171271,0.090498,0.30303,0.702443,3.391676
9,0.254072,0.094737,0.264706,0.619912,3.469871


# Baseline

In [8]:
baseline_params = {'prompt_type': 'standard',
  'sent': False,
  'emot': False,
  'rating': False,
  'cont': False,
  'style': False,
  'summary': False}

In [13]:
scores_lama_baseline= text_generation_pipeline("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Llama_baseline", "<|assistant|>\n", 1000, baseline_params)

Generating Reviews per user: 100%|██████████| 5/5 [7:09:22<00:00, 5152.45s/it]  


Done!


In [9]:
token="",
scores_gemma_baseline = text_generation_pipeline("google/gemma-2b-it", "gemma_baseline", "model\n", 1000, baseline_params, token, False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Reviews per user: 100%|██████████| 5/5 [8:58:44<00:00, 6464.80s/it]  


Done!


In [9]:
scores_zephyr_ = text_generation_pipeline("stabilityai/stablelm-2-zephyr-1_6b", "zephyr_baseline", "<|assistant|>\n", 1000, baseline_params)

Generating Reviews per user: 100%|██████████| 5/5 [7:57:14<00:00, 5726.83s/it]  


Done!


Every model is optimized for a different prompt format. Most models used for chat purposes, which also happen to be best at writing creative texts, use roles to define the prompt.
[List of available models](https://huggingface.co/models?pipeline_tag=text-generation&sort=likes)

## TinyLlama
Top 5 users were reproduced
[model here](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0)

In [19]:
scores_lama = text_generation_pipeline("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Llama", "<|assistant|>\n", 1000, best_params["llama"][0])

Generating Reviews per user: 100%|██████████| 5/5 [7:01:43<00:00, 5060.70s/it]  


Done!


## Google Gemma 2b it
[model here](https://huggingface.co/google/gemma-2b-it)

In [11]:
token="",
scores_gemma = text_generation_pipeline("google/gemma-2b-it", "gemma", "model\n", 1000, best_params["gemma"][0], token, False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Reviews per user: 100%|██████████| 5/5 [6:30:37<00:00, 4687.42s/it]  


Done!


## Zephyr-1 6B
[model here](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b)

In [13]:
scores_zephyr = text_generation_pipeline("stabilityai/stablelm-2-zephyr-1_6b", "zephyr", "<|assistant|>\n", 1000, best_params["zephyr"][0])

Generating Reviews per user: 100%|██████████| 5/5 [8:43:18<00:00, 6279.76s/it]  


Done!
