In [35]:
import pandas as pd

In [37]:
data_for_training = pd.read_csv("learn_copy.csv", sep = ",")

data_for_training

Unnamed: 0,ambient_humidity,baking_duration,cooling_period,cream_fat_content,egg_temperature,egg_yolk_count,lemon_zest_ph,oven_temperature,preheating_time,salt_ratio,sugar_content,vanilla_extract,origin_Porto,quality_class_OK
0,-0.515888,-0.281398,-0.403099,0.997275,-0.335390,0.399042,-0.115283,0.000664,-0.178057,-0.375237,-0.585693,0.401632,0,0
1,0.518888,1.295482,0.152593,-0.208031,-0.564364,-0.397801,0.489596,0.981653,0.830914,-0.430995,-1.347853,-0.963388,0,0
2,-1.636895,0.528720,-0.889330,0.444843,-0.299766,-0.397801,1.094474,-1.219236,-0.818035,0.946507,-0.718688,1.511479,1,1
3,0.173963,-0.281398,1.125056,-0.208031,1.157440,-1.194643,0.489596,1.588385,-0.818035,-0.182937,0.778581,-0.780461,0,1
4,-0.429657,-0.762823,0.152593,0.846612,-0.484943,-1.194643,-1.325039,-0.255603,0.131824,0.278420,0.883526,0.605929,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5188,0.001500,-1.314938,0.222055,-1.212454,0.274794,0.399042,0.489596,-0.017669,-1.114635,-0.878807,0.520407,0.401632,0,1
5189,0.087731,-0.133691,0.569363,-1.011569,0.874577,-1.194643,-0.115283,1.695455,0.294352,-0.194543,1.558166,0.927714,0,0
5190,0.777582,-1.314938,0.291517,-0.358695,0.694838,0.399042,2.304231,-0.243707,-1.793398,-1.318549,-1.510596,0.605929,0,1
5191,0.863813,-0.133691,0.569363,1.399044,1.515835,-1.194643,-0.720161,1.992872,-0.504778,0.294960,1.903130,0.401632,0,0


Features

Create the new features

In [None]:
# --- Feature Creation for Pastel de Nata dataset ---

# Baking efficiency: time relative to oven temperature
df['baking_efficiency'] = df['baking_duration'] / df['oven_temperature']

# Sweet–salt balance: sugar content relative to salt ratio
df['sweet_salt_balance'] = df['sugar_content'] / df['salt_ratio']

# Egg quality factor: influence of egg temperature and count
df['egg_quality_factor'] = df['egg_yolk_count'] * df['egg_temperature']

# Flavor complexity: interaction of lemon zest pH and vanilla extract concentration
df['flavor_complexity'] = df['lemon_zest_ph'] * df['vanilla_extract']

# Cooling efficiency: ratio between cooling period and baking duration
df['cooling_efficiency'] = df['cooling_period'] / df['baking_duration']

# Fat–sugar harmony: relation between cream fat and sugar levels
df['fat_sugar_harmony'] = df['cream_fat_content'] / df['sugar_content']

# Oven performance: baking temp adjusted by preheating time
df['oven_performance'] = df['oven_temperature'] / df['preheating_time']

# --- Extended Feature Engineering for Pastel de Nata dataset ---

# 1. Temperature-based features
df['temp_diff'] = df['oven_temperature'] - df['egg_temperature']          # Temperature shock
df['avg_temperature'] = (df['oven_temperature'] + df['egg_temperature']) / 2  # Mean working temperature
df['temp_time_balance'] = df['oven_temperature'] / (df['baking_duration'] + 1)  # Oven intensity per min

# 2. Ingredient ratios
df['sugar_fat_ratio'] = df['sugar_content'] / (df['cream_fat_content'] + 1e-3)  # Sweetness vs fat
df['lemon_vanilla_balance'] = df['lemon_zest_ph'] + df['vanilla_extract']       # Combined flavor index
df['egg_fat_blend'] = df['egg_yolk_count'] + df['cream_fat_content']            # Richness factor

# 3. Time–temperature interactions
df['heat_exposure'] = df['baking_duration'] * df['oven_temperature']            # Overall heat energy
df['total_process_time'] = df['preheating_time'] + df['baking_duration'] + df['cooling_period']
df['post_bake_ratio'] = df['cooling_period'] / (df['baking_duration'] + 1e-3)   # Cooling time vs baking

# 4. Quality-related heuristics
df['sugar_salt_temp'] = (df['sugar_content'] / (df['salt_ratio'] + 1e-3)) + df['oven_temperature']
df['egg_flavor_mix'] = df['egg_temperature'] + df['vanilla_extract'] + df['lemon_zest_ph']
df['richness_index'] = (df['egg_yolk_count'] * df['cream_fat_content']) / (df['salt_ratio'] + 1)

# 5. Combined interactions (higher-order)
df['heat_fat_interaction'] = df['oven_temperature'] * df['cream_fat_content']
df['sweet_fat_temp_combo'] = (df['sugar_content'] + df['cream_fat_content']) * df['oven_temperature']
df['time_flavor_factor'] = (df['baking_duration'] + df['cooling_period']) * (df['vanilla_extract'] + df['lemon_zest_ph'])





Feature Selection

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Correlation
corr = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Random forest importance
X = df.drop('quality_class', axis=1)
y = df['quality_class']

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)


NameError: name 'df' is not defined

Train

In [45]:
from sklearn.model_selection import train_test_split

X = data_for_training.drop('quality_class_OK', axis=1)
y = data_for_training['quality_class_OK']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)