![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Importing Libraries

In [None]:
import re
import unicodedata

import pandas as pd
from IPython.display import clear_output
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Loading File

In [None]:
# Load the recipes table from the raw-data CSV into a DataFrame.
recipes = pd.read_csv('../data/raw/recipes_one_clean.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
recipes

In [None]:
# Display only the 'preparations' column, the one the cleaning step below rewrites.
recipes['preparations']

In [None]:
limit = 10

# Preview the raw 'preparations' text of the first `limit` rows
# (or of every row when the DataFrame is shorter than that).
for index in range(min(limit, len(recipes))):
    label = recipes.at[index, 'preparations']
    # One call emits the header, the text and a trailing blank line.
    print(f"Row {index} Label:", label, "", sep="\n")

# Clean Preparations

In [None]:
# Matches the "q.b." abbreviation with an optional stray space before/after
# each dot, e.g. 'q . b .', 'q. b.' or 'q.b .'.
_QB_ABBREVIATION = re.compile(r'\bq ?\. ?b ?\.')

# Literal substitutions applied, in order, at the end of the cleaning: strip the
# parentheses around "colher"/"colheres" and insert the missing "de".
_SPOON_FIXES = (
    ('(colher)', 'colher'),
    ('(colheres)', 'colheres'),
    ('colher cafe', 'colher de cafe'),
    ('colher cha', 'colher de cha'),
    ('colher sobremesa', 'colher de sobremesa'),
    ('colher sopa', 'colher de sopa'),
)


def _clean_recipe_text(text, stemmer):
    """Return the cleaned form of a single recipe string.

    Steps, in order: tokenise, drop standalone '0' tokens, re-join with single
    spaces, collapse spaced-out "q.b." abbreviations, stem the word that
    follows each number, then apply the "colher" substitutions.
    """
    # Drop standalone zeros; every other token is kept verbatim.
    tokens = [
        token
        for token in word_tokenize(text, language='portuguese')
        if token != '0'
    ]

    # Normalise "q.b." AFTER re-joining: joining tokens with spaces can leave
    # the abbreviation spread out (e.g. 'q.b .'), which is what the pattern
    # is meant to repair.
    cleaned = _QB_ABBREVIATION.sub('q.b.', ' '.join(tokens))

    parts = cleaned.split()
    for j in range(len(parts) - 1):
        if not parts[j].isdigit():
            continue
        candidate = parts[j + 1].lower()
        # A word fully wrapped in parentheses is stemmed without them.
        if candidate.startswith('(') and candidate.endswith(')'):
            candidate = candidate[1:-1]
        # NOTE(review): PorterStemmer is NLTK's English stemmer but the text is
        # tokenised as Portuguese -- confirm this is the intended reduction.
        stemmed = stemmer.stem(candidate)
        if stemmed:  # keep the original word if stemming yields nothing
            parts[j + 1] = stemmed
    cleaned = ' '.join(parts)

    # NOTE(review): the tokeniser presumably splits '(' and ')' into their own
    # tokens, in which case '(colher)' never appears verbatim here -- verify
    # against real data.
    for old, new in _SPOON_FIXES:
        cleaned = cleaned.replace(old, new)
    return cleaned


def clean_recipes_column(df, column_name, num_rows=None):
    """
    Clean and preprocess a column of recipe text in a DataFrame, in place.

    Each processed cell is tokenised, stripped of standalone '0' tokens, has
    its "q.b." abbreviations normalised, has the word following any number
    replaced by its Porter stem, and has "colher" expressions tidied up.
    Progress is printed (and the previous output cleared) for every row.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the text column to be cleaned.
        column_name (str): The name of the column to be cleaned.
        num_rows (int, optional): The number of leading rows to process. If None,
            processes all rows. Values larger than the number of rows are capped;
            zero or negative values process nothing.

    Returns:
        None: The function modifies the DataFrame in place by updating the specified column.
    """
    stemmer = PorterStemmer()

    if num_rows is None:
        total = len(df)
    else:
        total = max(0, min(num_rows, len(df)))

    # Address rows by their actual index labels so the function also works on
    # DataFrames whose index is not the default 0..n-1 range.
    for done, row_label in enumerate(df.index[:total], start=1):
        clear_output(wait=True)
        print(f"Cleaning recipe {done}/{total} ({done / total * 100:.2f}%)")

        recipe = df.at[row_label, column_name]
        # Missing values (e.g. NaN) and other non-text cells cannot be
        # tokenised; leave them untouched instead of crashing mid-run.
        if not isinstance(recipe, str):
            continue

        df.at[row_label, column_name] = _clean_recipe_text(recipe, stemmer)

    print("Cleaning complete!")

In [None]:
# Clean only the first 10 rows of 'preparations'; `recipes` is modified in place.
clean_recipes_column(recipes, 'preparations', num_rows = 10)

In [None]:
limit = 10

# Print the first `limit` 'preparations' entries (or all of them when there
# are fewer rows) to inspect the text after the cleaning call above.
for index in range(min(limit, len(recipes))):
    label = recipes.at[index, 'preparations']
    # Header, text and a trailing blank line, separated by newlines.
    print(f"Row {index} Label:", label, "", sep="\n")