In [2]:
# Display the active SparkSession. `spark` is not created anywhere in the visible
# notebook, so it is presumably injected by the kernel (e.g. a PySpark kernel) — confirm.
spark

# Importing the libraries

In [1]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Reading the `Recipes dataset`

In [17]:
# S3 location of the recipes CSV file.
dataset = 's3://fcc-spark-example/dataset/2023/recipes_dataset/epi_r.csv'

# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data.
food = spark.read.csv(dataset, header=True, inferSchema=True)

                                                                                

In [5]:
# Raise the limit on how many fields Spark includes when it renders an execution plan
# as a string, to avoid the warning about that string representation being truncated.
# The warning is purely informational (a debugging aid): truncating the string
# representation does not affect the actual execution or functionality of the Spark application.
# NOTE(review): the recipes frame has 680 columns, so a limit of 100 may still be exceeded — confirm.

spark.conf.set("spark.sql.debug.maxToStringFields", "100")

In [6]:
# Report the frame's dimensions as "<rows> <columns>", then list every
# column together with the type Spark inferred for it.
row_count = food.count()
column_count = len(food.columns)
print(row_count, column_count)
food.printSchema()

[Stage 5:>                                                          (0 + 2) / 2]

20057 680
root
 |-- title: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- calories: string (nullable = true)
 |-- protein: double (nullable = true)
 |-- fat: double (nullable = true)
 |-- sodium: double (nullable = true)
 |-- #cakeweek: double (nullable = true)
 |-- #wasteless: double (nullable = true)
 |-- 22-minute meals: double (nullable = true)
 |-- 3-ingredient recipes: double (nullable = true)
 |-- 30 days of groceries: double (nullable = true)
 |-- advance prep required: double (nullable = true)
 |-- alabama: double (nullable = true)
 |-- alaska: double (nullable = true)
 |-- alcoholic: double (nullable = true)
 |-- almond: double (nullable = true)
 |-- amaretto: double (nullable = true)
 |-- anchovy: double (nullable = true)
 |-- anise: double (nullable = true)
 |-- anniversary: double (nullable = true)
 |-- anthony bourdain: double (nullable = true)
 |-- aperitif: double (nullable = true)
 |-- appetizer: double (nullable = true)
 |-- apple: double (nullabl

                                                                                

# Standardizing `column` names

In [14]:
def sanitize_column_name(col_name):
    '''
    Return ``col_name`` rewritten as a plain, identifier-style column name.

    Each space, dash and slash becomes an underscore and each ``&`` becomes
    ``and``. Afterwards every character that is not a letter
    (``str.isalpha``), a digit (``str.isdigit``) or an underscore is dropped.
    Both checks are Unicode-aware, so accented letters such as ``é`` survive.
    '''
    # Every key is a single character and no replacement text contains a key,
    # so one translate pass equals applying the replacements one after another.
    substitutions = str.maketrans({" ": "_", "-": "_", "/": "_", "&": "and"})
    swapped = col_name.translate(substitutions)

    kept = []
    for symbol in swapped:
        if symbol == "_" or symbol.isalpha() or symbol.isdigit():
            kept.append(symbol)
    return "".join(kept)

# Run every column name through the sanitizer and rebuild ``food`` with the
# cleaned names, in the same column order.
food = food.toDF(*map(sanitize_column_name, food.columns))

In [15]:
# Show the column names as they stand after renaming, to check the sanitizer's effect.
food.columns

['title',
 'rating',
 'calories',
 'protein',
 'fat',
 'sodium',
 'cakeweek',
 'wasteless',
 '22_minute_meals',
 '3_ingredient_recipes',
 '30_days_of_groceries',
 'advance_prep_required',
 'alabama',
 'alaska',
 'alcoholic',
 'almond',
 'amaretto',
 'anchovy',
 'anise',
 'anniversary',
 'anthony_bourdain',
 'aperitif',
 'appetizer',
 'apple',
 'apple_juice',
 'apricot',
 'arizona',
 'artichoke',
 'arugula',
 'asian_pear',
 'asparagus',
 'aspen',
 'atlanta',
 'australia',
 'avocado',
 'back_to_school',
 'backyard_bbq',
 'bacon',
 'bake',
 'banana',
 'barley',
 'basil',
 'bass',
 'bastille_day',
 'bean',
 'beef',
 'beef_rib',
 'beef_shank',
 'beef_tenderloin',
 'beer',
 'beet',
 'bell_pepper',
 'berry',
 'beverly_hills',
 'birthday',
 'biscuit',
 'bitters',
 'blackberry',
 'blender',
 'blue_cheese',
 'blueberry',
 'boil',
 'bok_choy',
 'bon_appétit',
 'bon_apptit',
 'boston',
 'bourbon',
 'braise',
 'bran',
 'brandy',
 'bread',
 'breadcrumbs',
 'breakfast',
 'brie',
 'brine',
 'brisket',

In [18]:
def sanitize_column_name(col_name):
    """
    Normalise a raw column header into a clean column name.

    Each space, dash or slash is swapped for an underscore and each ``&``
    for ``and``; any remaining character that is neither alphanumeric
    (per ``str.isalnum``) nor an underscore is then removed.

    Note: this redefines the same-named function from an earlier cell and
    replaces it once this cell has run.
    """
    swaps = {" ": "_", "-": "_", "/": "_", "&": "and"}

    # Every key is a single character and no replacement text contains a key,
    # so a per-character lookup matches chained ``str.replace`` calls.
    substituted = "".join(swaps.get(symbol, symbol) for symbol in col_name)

    return "".join(
        symbol for symbol in substituted if symbol == "_" or symbol.isalnum()
    )

# Compute the sanitized name for every column (order preserved), then
# rebuild ``food`` under those names.
clean_names = [sanitize_column_name(raw_name) for raw_name in food.columns]
food = food.toDF(*clean_names)

# EDA and Feature Engineering

In [27]:
# Print a separate summary table (count, mean, stddev, min, quartiles, max)
# for each column, one column at a time, without truncating long cell values.
for column in food.columns:
    column_stats = food.select(column).summary()
    column_stats.show(truncate=False)

+-------+---------------------------------------------------------------------------------+
|summary|title                                                                            |
+-------+---------------------------------------------------------------------------------+
|count  |20057                                                                            |
|mean   |null                                                                             |
|stddev |null                                                                             |
|min    |                Blistered-Chile–Pumpkin Seed Salsa "                             |
|25%    |null                                                                             |
|50%    |null                                                                             |
|75%    |null                                                                             |
|max    |Zuppa di Cavolo Nero, Cannellini, e Salsicce: Kale, White Bean, and Sau

                                                                                

+-------+---------------------+
|summary|chicago              |
+-------+---------------------+
|count  |20052                |
|mean   |1.9948134849391582E-4|
|stddev |0.014122729991017437 |
|min    |0.0                  |
|25%    |0.0                  |
|50%    |0.0                  |
|75%    |0.0                  |
|max    |1.0                  |
+-------+---------------------+

+-------+-------------------+
|summary|chicken            |
+-------+-------------------+
|count  |20052              |
|mean   |0.06702573309395571|
|stddev |0.25007279521905046|
|min    |0.0                |
|25%    |0.0                |
|50%    |0.0                |
|75%    |0.0                |
|max    |1.0                |
+-------+-------------------+

+-------+-------------------+
|summary|chickpea           |
+-------+-------------------+
|count  |20052              |
|mean   |0.00807899461400359|
|stddev |0.0895216405518936 |
|min    |0.0                |
|25%    |0.0                |
|50%    |0.0  

### Identifying the binary columns from our data frame

In [28]:
import pandas as pd

# Let pandas display up to 1000 rows so the per-column listing below is not elided.
pd.set_option("display.max_rows", 1000)

# One boolean expression per column: True when the column holds exactly two
# distinct values (``collect_set`` gathers the distinct values, ``size``
# counts them). "Binary" here means two-valued, not necessarily 0/1.
two_valued_checks = [
    (F.size(F.collect_set(col_name)) == 2).alias(col_name)
    for col_name in food.columns
]

# Evaluate every check in a single aggregation and bring the result to pandas.
is_binary = food.agg(*two_valued_checks).toPandas()

# Reshape the wide result into a Series so each column's flag sits on its own line.
is_binary.unstack()

                                                                                

title                     0    False
rating                    0    False
calories                  0    False
protein                   0    False
fat                       0    False
sodium                    0    False
cakeweek                  0    False
wasteless                 0    False
22_minute_meals           0     True
3_ingredient_recipes      0     True
30_days_of_groceries      0     True
advance_prep_required     0     True
alabama                   0     True
alaska                    0     True
alcoholic                 0     True
almond                    0     True
amaretto                  0     True
anchovy                   0     True
anise                     0     True
anniversary               0     True
anthony_bourdain          0     True
aperitif                  0     True
appetizer                 0     True
apple                     0     True
apple_juice               0     True
apricot                   0     True
arizona                   0     True
a

In [42]:
# Count the columns flagged as binary (flag is True).
column_flags = is_binary.unstack()
column_flags[column_flags.eq(True)].count()

672

In [43]:
# Count the columns not flagged as binary (flag is False).
column_flags = is_binary.unstack()
column_flags[column_flags.eq(False)].count()

8

### Resolving data mishandling incidents and creating our initial set of features