In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import ast
import torch
import re

In [2]:
# Load the GPT-2 model and its tokenizer from the saved directories
gpt2Model = GPT2LMHeadModel.from_pretrained('ModelGPT/model_after_train')
tokenizer = GPT2Tokenizer.from_pretrained('ModelGPT/tokenizer_after_train')

# Load the rating correlation object from disk
loaded_correlation = pd.read_pickle('ModelClassifier/rating_correlation.pkl')

# NOTE: unpickling can execute arbitrary code; only load these .pkl files
# when they come from a trusted source.

# Load the tokenizer used for ingredients
with open('ModelClassifier/tokenizer_ingredients.pkl', 'rb') as file:
    tokenizer_ingredients = pickle.load(file)

# Load the tokenizer used for steps
with open('ModelClassifier/tokenizer_steps.pkl', 'rb') as file:
    tokenizer_steps = pickle.load(file)

# Load the interaction dataset.
# Forward slashes are used on purpose: the previous 'Datasets\RAW_...' literals
# relied on the invalid escape sequence '\R' and only resolved on Windows.
interaction_df = pd.read_csv('Datasets/RAW_interactions.csv')

# Load the recipes dataset
recipes_df = pd.read_csv('Datasets/RAW_recipes.csv')

In [3]:
# Funzione per ottenere una ricetta per un dato rating
def get_recipe_for_rating(df, rating):
    """Return one interaction row whose recipe never received a higher rating.

    Args:
        df: DataFrame with at least the columns 'recipe_id' and 'rating'.
        rating: The rating value to look for.

    Returns:
        The first row of ``df`` (as a Series) that has ``rating`` and belongs
        to a recipe whose maximum rating does not exceed ``rating``, or
        ``None`` when no such row exists.
    """
    # Rows carrying exactly the requested rating
    matching_rows = df[df['rating'] == rating]
    if matching_rows.empty:
        return None

    # Only recipes with at least one matching row can qualify, so the
    # per-recipe maximum is computed once on that subset instead of
    # re-filtering the whole frame for every candidate recipe.
    candidate_rows = df[df['recipe_id'].isin(matching_rows['recipe_id'])]
    max_rating_by_recipe = candidate_rows.groupby('recipe_id')['rating'].max()
    eligible_ids = max_rating_by_recipe.index[max_rating_by_recipe <= rating]

    # Keep the original row order so the first eligible row is returned
    eligible_rows = matching_rows[matching_rows['recipe_id'].isin(eligible_ids)]
    if eligible_rows.empty:
        return None
    return eligible_rows.iloc[0]

In [4]:
# Funzione per ottenere gli ingredienti e i passi delle ricette per un dato rating
def get_recipe_details_for_rating(ratings_df, recipes_df, rating):
    """Look up id, ingredients and steps of one recipe matching ``rating``.

    Args:
        ratings_df: Interactions frame passed on to ``get_recipe_for_rating``.
        recipes_df: Recipes frame with the columns 'id', 'ingredients', 'steps'.
        rating: The rating value to look for.

    Returns:
        A Series with 'id', 'ingredients' and 'steps' for the selected recipe,
        or ``None`` when no interaction qualifies or the recipe id is not
        present in ``recipes_df``.
    """
    chosen_interaction = get_recipe_for_rating(ratings_df, rating)
    if chosen_interaction is None:
        return None

    # Find the recipe row that the chosen interaction refers to
    wanted_id = chosen_interaction['recipe_id']
    matching_recipes = recipes_df[recipes_df['id'] == wanted_id]
    if matching_recipes.empty:
        return None

    return matching_recipes[['id', 'ingredients', 'steps']].iloc[0]

# Collect the details of one recipe for each rating from 0 to 5,
# skipping ratings for which no recipe was found
recipe_details = {}
for rating in range(6):
    details = get_recipe_details_for_rating(interaction_df, recipes_df, rating)
    if details is None:
        continue
    recipe_details[rating] = details

# Turn the result into a DataFrame with one row per collected recipe.
# NOTE(review): reset_index(drop=True) discards the rating keys, so row i
# corresponds to rating i only when every rating 0-5 yielded a recipe;
# confirm this before indexing rows by rating.
details_by_rating = pd.DataFrame(recipe_details)
recipe_details_df = details_by_rating.T.reset_index(drop=True)

print(recipe_details_df)

       id                                        ingredients  \
0  487707  ['zucchini', 'onion', 'red bell pepper', 'cher...   
1  512986  ['heavy cream', 'oranges, zest of', 'cinnamon'...   
2  224025                  ['butter', 'cornstarch', 'honey']   
3  381108  ['potatoes', 'butter', 'flour', 'salt', 'milk'...   
4   44394  ["devil's food cake mix", 'vegetable oil', 'eg...   
5   40893  ['great northern beans', 'yellow onion', 'dice...   

                                               steps  
0  ['preheat oven to 350 degrees f if you are goi...  
1  ['heat the cream , zest , and cinnamon in a sa...  
2  ['melt butter in sauce pan', 'stir in cornstar...  
3  ['in a large saucepan , melt butter and add fl...  
4  ['blend together cake mix , oil and eggs', 'ad...  
5  ['combine beans , onion , chilies , 1 / 2 teas...  


In [5]:
# Funzione per pulire e trasformare la stringa in una lista
def clean_ingredient_steps_string(ingredient_str):
    """Turn the string form of a list into a list of trimmed strings.

    The surrounding square brackets and every single quote are removed, then
    the text is split on commas.

    NOTE(review): splitting on every comma also breaks apart items that
    contain commas themselves (e.g. 'oranges, zest of' becomes 'oranges' and
    'zest of'), and apostrophes inside items are dropped.

    Args:
        ingredient_str: String such as "['a', 'b']".

    Returns:
        The list of comma-separated pieces, or an empty list when the input
        cannot be processed (the error is printed).
    """
    try:
        # Drop the outer brackets and all single quotes
        unquoted = ingredient_str.strip("[]").replace("'", "")
        # Split on commas and trim each piece
        return [piece.strip() for piece in unquoted.strip().split(',')]
    except Exception as e:
        print(f"Errore nella pulizia dell'ingrediente: {e}")
        return []

# Apply the cleaning function to the 'ingredients' and 'steps' columns,
# storing the results in 'ingredients_list' and 'steps_list'
for source_column, target_column in (('ingredients', 'ingredients_list'), ('steps', 'steps_list')):
    recipe_details_df[target_column] = recipe_details_df[source_column].apply(clean_ingredient_steps_string)

In [96]:
# Check the result: type of the first entry in each list column
for checked_column in ('ingredients_list', 'steps_list'):
    print(type(recipe_details_df.loc[0, checked_column]))

# Check the result: the second entry of each column should be a list of strings
for checked_column in ('ingredients_list', 'steps_list'):
    print(recipe_details_df.loc[1, checked_column])

<class 'list'>
<class 'list'>
['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract']
['heat the cream', 'zest', 'and cinnamon in a saucepan to boiling', 'remove from the heat and let steep 2 hours', 'heat the chocolate in a small heavy pan over low heat until melted', 'continue cooking', 'stirring constantly', 'until it is "scorched" and thick', 'about 3 minutes', 'transfer to a large mixing bowl', 'strain the cream into another saucepan and reheat to boiling', 'whisk the egg yolks and 1 / 4 celsius sugar until combined', 'whisk about 1 cup of the hot cream into the yolks', 'then whisk into the remaining cream in the pan', 'cook', 'stirring constantly', 'over medium heat until thick enough to coat the back of a spoon', '3 to 5 minutes', 'do not boil', 'strain the custard into the chocolate and stir until smooth', 'strain again', 'place in a larger bowl filled with ice water and chill the custard well', '"fereeze in an ice cream

In [7]:
# Maximum sequence lengths for ingredients and steps; per the original note
# they were already computed earlier in the notebook
ingredient_maxlen = 43
steps_maxlen = 218

# Tokenize and pad the ingredients
new_ingredients_seq = tokenizer_ingredients.texts_to_sequences(recipe_details_df['ingredients_list'])
new_ingredients_padded = pad_sequences(new_ingredients_seq, maxlen=ingredient_maxlen)

# Tokenize and pad the steps
new_steps_seq = tokenizer_steps.texts_to_sequences(recipe_details_df['steps_list'])
new_steps_padded = pad_sequences(new_steps_seq, maxlen=steps_maxlen)

In [8]:
# Mappa per ingredienti e passi
def get_feature_names(tokenizer, maxlen):
    """Return the words the tokenizer assigns to indices 1..maxlen.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
        maxlen: Number of names to produce.

    Returns:
        A list of ``maxlen`` names; an index without a word is reported as
        'unknown_<index>'.
    """
    # Invert the word -> index mapping
    index_word = {}
    for word, index in tokenizer.word_index.items():
        index_word[index] = word

    names = []
    for i in range(1, maxlen + 1):
        names.append(index_word.get(i, f'unknown_{i}'))
    return names

# Ingredient features: padded sequences labelled with the tokenizer words
new_ingredient_names = get_feature_names(tokenizer_ingredients, ingredient_maxlen)
new_ingredients_df = pd.DataFrame(new_ingredients_padded, columns=new_ingredient_names)

# Step features: padded sequences labelled with the tokenizer words
new_steps_names = get_feature_names(tokenizer_steps, steps_maxlen)
new_steps_df = pd.DataFrame(new_steps_padded, columns=new_steps_names)

# Put the ingredient and step features side by side
new_features_df = pd.concat([new_ingredients_df, new_steps_df], axis=1)

In [9]:
# Estimate the ratings from the correlations: dot product of the features
# with the correlations, divided by the sum of the absolute correlations
rating_estimates_real_recipes = (
    new_features_df.dot(loaded_correlation) / loaded_correlation.abs().sum()
)

# These are the values of the original recipes according to the correlation matrix
print(rating_estimates_real_recipes)

0   -242645.911082
1   -207138.169533
2      -150.751155
3    -77809.523383
4    -76454.855237
5     -7137.231904
dtype: float64


In [10]:
# Funzione per convertire la stringa della lista in una stringa separata da virgole
def convert_ingredients_string(ingredients_str):
    """Convert the string form of a list into a comma-separated string.

    The function is idempotent: text that is not a list literal (for example
    the output of a previous call) is returned unchanged, so re-running the
    cell that applies it does not wipe the column.

    Args:
        ingredients_str: String such as "['a', 'b']".

    Returns:
        The items joined with ', '. Plain text that does not start with '['
        and cannot be parsed is returned as-is. Any other failure prints the
        error and returns ''.
    """
    try:
        # Parse the string into a list of ingredients
        ingredients_list = ast.literal_eval(ingredients_str)
    except (ValueError, SyntaxError) as e:
        # Text that is not a list literal is treated as already converted
        if isinstance(ingredients_str, str) and not ingredients_str.lstrip().startswith('['):
            return ingredients_str
        print(f"Errore nella conversione: {e}")
        return ''
    except Exception as e:
        print(f"Errore nella conversione: {e}")
        return ''

    try:
        # Join the ingredients into one comma-separated string
        return ', '.join(ingredients_list)
    except Exception as e:
        print(f"Errore nella conversione: {e}")
        return ''

# Apply the conversion to the "ingredients" column, replacing it in place
converted_ingredients = recipe_details_df['ingredients'].apply(convert_ingredients_string)
recipe_details_df['ingredients'] = converted_ingredients

# Show the resulting column
print(recipe_details_df['ingredients'])

0    zucchini, onion, red bell pepper, cherry tomat...
1    heavy cream, oranges, zest of, cinnamon, semis...
2                            butter, cornstarch, honey
3    potatoes, butter, flour, salt, milk, worcester...
4    devil's food cake mix, vegetable oil, eggs, re...
5    great northern beans, yellow onion, diced gree...
Name: ingredients, dtype: object


# Generazione rating variabile

In [87]:
# Change this value (0 to 5) to generate and check recipes for a given rating
rating_test = 1

In [88]:
def generate_recipe(prompt, max_new_tokens=100, temperature=0.8, top_k=50, top_p=0.9):
    """Sample one recipe text from the GPT-2 model for the given ingredients.

    Args:
        prompt: Ingredient text inserted into "Ingredients: <prompt>. Steps:".
        max_new_tokens: Maximum number of new tokens to generate.
        temperature: Controls the randomness of the predictions; a lower value
            makes the text more deterministic, a higher one more varied.
        top_k: Restricts the model's choices to the k best candidates.
        top_p: Cumulative probability considered when choosing tokens; a high
            value increases variety by also including less likely tokens.

    Returns:
        The decoded output sequence with special tokens skipped and runs of
        whitespace collapsed to single spaces.
    """
    input_text = f"Ingredients: {prompt}. Steps:"
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Sampling configuration passed to generate()
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=torch.ones_like(prompt_ids),
    )
    sampled = gpt2Model.generate(prompt_ids, **generation_kwargs)

    decoded = tokenizer.decode(sampled[0], skip_special_tokens=True)
    # Remove extra whitespace
    return " ".join(decoded.split()).strip()

# Generate 5 recipes by calling the function 5 times with the ingredients
# of the row selected by rating_test
prompt = recipe_details_df.loc[rating_test, 'ingredients']
generated_recipes = []
for _ in range(5):
    generated_recipes.append(generate_recipe(prompt))

# Print the generated recipes, numbered from 1
for i, recipe in enumerate(generated_recipes, start=1):
    print(f"Generated Recipe {i}:\n{recipe}\n")

Generated Recipe 1:
Ingredients: heavy cream, oranges, zest of, cinnamon, semisweet chocolate, egg yolks, sugar, vanilla extract. Steps: 1. heat the cream over medium-high heat in a saucepan. 2. add the orange zest, the cinnamon, and the chopped chocolate and stir until the zest is absorbed. 3. add the egg yolks, sugar, and the vanilla. 4. whisk until very well combined. 5. bring to a simmer and continue stirring frequently until thickened. 6. serve warm or cold, garnished with cinnamon sticks if desired. 7. can be made up to 8 hours

Generated Recipe 2:
Ingredients: heavy cream, oranges, zest of, cinnamon, semisweet chocolate, egg yolks, sugar, vanilla extract. Steps: 1. heat the cream until it just starts to boil, then lower the temperature and cook for 15 minutes, stirring occasionally. 2. remove from the heat and stir in the orange zest, cinnamon, and semisweet chocolate pieces. 3. cool to room temperature. 4. place a small bowl filled with ice cubes in your freezer. 5. in another 

In [89]:
#funzione per estrarre dalle ricette generate gli ingredienti e gli steps
def extract_ingredients_and_steps_from_recipes(recipes):
    """Split generated recipe texts into ingredient lists and step lists.

    Each text is expected to look like
    "Ingredients: a, b. Steps: 1. first step. 2. second step".

    Args:
        recipes: Iterable of recipe strings.

    Returns:
        A tuple ``(all_ingredients_lists, all_steps_lists)`` with one list per
        input text; a section that cannot be found yields an empty list.
    """
    # A step marker is a number followed by a period ("1.", "12."). The
    # lookarounds keep decimals such as "1.5 cups" intact; the previous
    # pattern r'\d+\.\s*' split inside them and dropped the leading digits.
    # A sentence ending in a number ("... to 350.") is still ambiguous and is
    # treated as a marker, as before.
    step_marker = re.compile(r'(?<![\d.])\d+\.(?!\d)\s*')

    all_ingredients_lists = []
    all_steps_lists = []

    for text in recipes:
        # Locate the ingredients section and the steps section
        ingredients_section = re.search(r'Ingredients: (.+?)\. Steps:', text)
        steps_section = re.search(r'Steps: (.+)', text)

        if ingredients_section:
            ingredients_text = ingredients_section.group(1).strip()
            # Comma-separated ingredients -> list of non-empty strings
            ingredients_list = [ingredient.strip() for ingredient in ingredients_text.split(',') if ingredient.strip()]
            all_ingredients_lists.append(ingredients_list)
        else:
            all_ingredients_lists.append([])

        if steps_section:
            steps_text = steps_section.group(1).strip()
            # Split on the step markers, which also removes the numbering
            steps_raw = step_marker.split(steps_text)
            # Drop empty fragments and trim the rest
            steps_list = [step.strip() for step in steps_raw if step.strip()]
            all_steps_lists.append(steps_list)
        else:
            all_steps_lists.append([])

    return all_ingredients_lists, all_steps_lists

# Extract ingredients and steps from the generated recipes
extraction_result = extract_ingredients_and_steps_from_recipes(generated_recipes)
all_ingredients_lists, all_steps_lists = extraction_result

print("All Ingredients Lists:", all_ingredients_lists)
print("All Steps Lists:", all_steps_lists)

All Ingredients Lists: [['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract'], ['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract'], ['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract'], ['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract'], ['heavy cream', 'oranges', 'zest of', 'cinnamon', 'semisweet chocolate', 'egg yolks', 'sugar', 'vanilla extract']]
All Steps Lists: [['heat the cream over medium-high heat in a saucepan.', 'add the orange zest, the cinnamon, and the chopped chocolate and stir until the zest is absorbed.', 'add the egg yolks, sugar, and the vanilla.', 'whisk until very well combined.', 'bring to a simmer and continue stirring frequently until thickened.', 'serve warm or cold, garnished with cinnamon sticks if desired.', 'can be 

In [90]:
#rimuove i punti finali negli steps per poterli tokenizzare e usare dopo
def remove_trailing_periods_from_steps(nested_steps_list):
    """Trim every step and strip its trailing periods.

    Args:
        nested_steps_list: List of lists of strings (one inner list per recipe).

    Returns:
        A new list of lists with each step stripped of surrounding whitespace
        and then of trailing '.' characters.

    Raises:
        ValueError: If the argument is not a list of lists of strings.
    """
    # Validate the structure: every element a list, every inner element a string
    is_valid = all(isinstance(sublist, list) for sublist in nested_steps_list) and \
        all(isinstance(step, str) for sublist in nested_steps_list for step in sublist)
    if not is_valid:
        raise ValueError("La lista deve essere una lista di liste di stringhe.")

    # Clean each inner list
    cleaned_nested_list = []
    for sublist in nested_steps_list:
        cleaned_steps = []
        for step in sublist:
            cleaned_steps.append(step.strip().rstrip('.'))
        cleaned_nested_list.append(cleaned_steps)

    return cleaned_nested_list
# Remove the trailing periods from the extracted steps
cleaned_all_steps_lists = remove_trailing_periods_from_steps(all_steps_lists)

# Print the cleaned steps of each recipe, numbered from 1
for i, steps in enumerate(cleaned_all_steps_lists, start=1):
    print(f"Recipe {i} Steps: {steps}")

Recipe 1 Steps: ['heat the cream over medium-high heat in a saucepan', 'add the orange zest, the cinnamon, and the chopped chocolate and stir until the zest is absorbed', 'add the egg yolks, sugar, and the vanilla', 'whisk until very well combined', 'bring to a simmer and continue stirring frequently until thickened', 'serve warm or cold, garnished with cinnamon sticks if desired', 'can be made up to 8 hours']
Recipe 2 Steps: ['heat the cream until it just starts to boil, then lower the temperature and cook for 15 minutes, stirring occasionally', 'remove from the heat and stir in the orange zest, cinnamon, and semisweet chocolate pieces', 'cool to room temperature', 'place a small bowl filled with ice cubes in your freezer', 'in another large bowl, whisk together the egg yolks, sugar, and vanilla extract', 'beat vigorously with an electric mixer on high']
Recipe 3 Steps: ['place heavy cream, orange zest and cinnamon in a medium saucepan and bring to a boil', 'reduce heat and simmer unt

In [91]:
# Maximum sequence lengths for ingredients and steps; per the original note
# they were already computed earlier in the notebook
ingredient_maxlen = 43
steps_maxlen = 218

# Tokenize and pad the generated ingredients
new_ingredients_seq = tokenizer_ingredients.texts_to_sequences(all_ingredients_lists)
new_ingredients_padded = pad_sequences(new_ingredients_seq, maxlen=ingredient_maxlen)

# Tokenize and pad the generated steps
new_steps_seq = tokenizer_steps.texts_to_sequences(cleaned_all_steps_lists)
new_steps_padded = pad_sequences(new_steps_seq, maxlen=steps_maxlen)

In [92]:
# Mappa per ingredienti e passi
def get_feature_names(tokenizer, maxlen):
    """Return the words the tokenizer assigns to indices 1..maxlen.

    NOTE(review): this repeats the definition from the earlier
    feature-mapping cell; the redefinition is redundant.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
        maxlen: Number of names to produce.

    Returns:
        A list of ``maxlen`` names; an index without a word is reported as
        'unknown_<index>'.
    """
    # Invert the word -> index mapping
    vocabulary = tokenizer.word_index
    index_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    wanted_indices = range(1, maxlen + 1)
    return [index_word.get(i, f'unknown_{i}') for i in wanted_indices]

# Ingredient features: padded sequences labelled with the tokenizer words
new_ingredient_names = get_feature_names(tokenizer_ingredients, ingredient_maxlen)
new_ingredients_df = pd.DataFrame(new_ingredients_padded, columns=new_ingredient_names)

# Step features: padded sequences labelled with the tokenizer words
new_steps_names = get_feature_names(tokenizer_steps, steps_maxlen)
new_steps_df = pd.DataFrame(new_steps_padded, columns=new_steps_names)

# Put the ingredient and step features side by side
new_features_df = pd.concat([new_ingredients_df, new_steps_df], axis=1)

In [93]:
# Estimate the ratings from the correlations: dot product of the features
# with the correlations, divided by the sum of the absolute correlations
rating_estimates = (
    new_features_df.dot(loaded_correlation) / loaded_correlation.abs().sum()
)

print(rating_estimates)

0       -1.011367
1       -9.101329
2   -21987.154177
3       -1.011367
4   -12051.446443
dtype: float64


In [94]:
# Find the index of the highest estimate
max_index = rating_estimates.idxmax()
print(f"L'indice del valore massimo è: {max_index}")

# Generated recipe with the highest value according to the correlation matrix
best_generated_recipe = generated_recipes[max_index]
print(best_generated_recipe)

L'indice del valore massimo è: 0
Ingredients: heavy cream, oranges, zest of, cinnamon, semisweet chocolate, egg yolks, sugar, vanilla extract. Steps: 1. heat the cream over medium-high heat in a saucepan. 2. add the orange zest, the cinnamon, and the chopped chocolate and stir until the zest is absorbed. 3. add the egg yolks, sugar, and the vanilla. 4. whisk until very well combined. 5. bring to a simmer and continue stirring frequently until thickened. 6. serve warm or cold, garnished with cinnamon sticks if desired. 7. can be made up to 8 hours


In [95]:
# Compare the estimate of the original recipe with the best generated one
for line in (
    f"Valore della ricetta originale:  {rating_estimates_real_recipes[rating_test]}",
    f"Valore della ricetta generata:  {rating_estimates[max_index]}",
):
    print(line)

Valore della ricetta originale:  -207138.169533192
Valore della ricetta generata:  -1.0113673860488306
