In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Read the filtered fruit/vegetable table (path is relative to the working directory)
fruits_vegetables_df = pd.read_csv('data/filtered_fruits_vegetables.csv')

# Predictors are the per-100g nutrition columns; the target is the transformed energy column.
# NOTE(review): energy_100g is a predictor while the target is itself an energy-derived
# column -- confirm this is intended and not target leakage.
features = [
    'energy_100g',
    'fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
]
X, y = fruits_vegetables_df[features], fruits_vegetables_df['transformed_reconstructed_energy']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() hands back the fitted estimator, so construction and training are chained
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows only
y_test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. error on the target's own scale

# Report the three metrics, one per line
print(
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

# Attach a prediction for every row (training and test rows alike) as a new column
fruits_vegetables_df = fruits_vegetables_df.assign(predicted_price_proxy=model.predict(X))

# Preview actual vs. predicted values for the first five products
print(fruits_vegetables_df.head()[['product', 'transformed_reconstructed_energy', 'predicted_price_proxy']])

Mean Squared Error (MSE): 0.003894499314081935
Root Mean Squared Error (RMSE): 0.06240592370986856
R-squared (R²): 0.9956722315135723
            product  transformed_reconstructed_energy  predicted_price_proxy
0              Chia                          0.694293               0.541046
1         Spearmint                          0.605544               0.640491
2  Artichoke Hearts                         -1.491192              -1.440884
3               Fig                         -0.229172              -0.305448
4            Almond                          0.432530               0.275610


In [2]:

# Same pipeline as the previous cell, run against the ETL working file.
# Relies on pd, np and the scikit-learn names imported in an earlier cell, and rebinds
# the variables that cell created (fruits_vegetables_df, model, mse, ...).
fruits_vegetables_df = pd.read_csv('part 1 ETL Workflow/working.csv')

# Predictors are the per-100g nutrition columns; the target is the transformed energy column.
# NOTE(review): energy_100g is a predictor while the target is itself an energy-derived
# column -- confirm this is intended and not target leakage.
features = [
    'energy_100g',
    'fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
]
X, y = fruits_vegetables_df[features], fruits_vegetables_df['transformed_reconstructed_energy']

# 80/20 train/test partition with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and train the ordinary least-squares model in one step (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows only
y_test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. error on the target's own scale

# Emit the three metrics, one per line
print(
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

# Store a prediction for every row (training and test rows alike) in a new column
fruits_vegetables_df = fruits_vegetables_df.assign(predicted_price_proxy=model.predict(X))

# Show actual vs. predicted values for the first five products
print(fruits_vegetables_df.head()[['product', 'transformed_reconstructed_energy', 'predicted_price_proxy']])

Mean Squared Error (MSE): 0.006394478541278931
Root Mean Squared Error (RMSE): 0.07996548343678622
R-squared (R²): 0.9929496427295043
     product  transformed_reconstructed_energy  predicted_price_proxy
0  Artichoke                         -1.033042              -1.098399
1  Asparagus                         -1.207215              -1.214924
2      Beets                         -1.254806              -1.237975
3   Broccoli                         -1.373130              -1.312663
4    Carrots                         -1.309162              -1.275185
