In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.impute import SimpleImputer

# Enable inline plotting
%matplotlib inline

# Load the dataset
df = pd.read_csv("fastfood.csv")

# Display the first few rows of the dataset
df.head()

# Check information about the dataset
df.info()

# Describe the dataset
df.describe()

# Check for missing values
df.isna().sum()

# Remove duplicates
df = df.drop_duplicates()

# Clean up any newline characters in string columns
df['restaurant'] = df['restaurant'].replace('\n', '', regex=True)
df['item'] = df['item'].replace('\n', '', regex=True)
df['total_fat'] = df['total_fat'].replace('\n', '', regex=True)
df['cholesterol'] = df['cholesterol'].replace('\n', '', regex=True)

# Handling missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
df[['fiber', 'protein']] = imputer.fit_transform(df[['fiber', 'protein']])

# For other vitamin columns, you might want to use a different strategy
vitamin_imputer = SimpleImputer(strategy='mean')
df[['vit_a', 'vit_c', 'calcium']] = vitamin_imputer.fit_transform(df[['vit_a', 'vit_c', 'calcium']])

# Remove outliers using IQR for 'calories' and 'protein'
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the conventional 1.5 * IQR rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences, with their original index labels.
        Rows where ``column`` is NaN are dropped, because NaN never compares
        as being inside the fences.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_fence = q1 - factor * spread
    upper_fence = q3 + factor * spread
    # Series.between is inclusive on both ends by default (>= lower, <= upper).
    return df[df[column].between(lower_fence, upper_fence)]

# Drop rows whose target or protein value falls outside the 1.5 * IQR fences
df = remove_outliers(df, 'calories')
df = remove_outliers(df, 'protein')

# Feature and target selection
num_columns = ['cal_fat', 'total_fat', 'sat_fat', 'trans_fat', 'cholesterol', 'sodium', 'total_carb', 'fiber', 'sugar', 'protein', 'vit_a', 'vit_c', 'calcium']
target_column = 'calories'
X = df[num_columns]
y = df[target_column]

# Split the dataset into training and testing sets BEFORE scaling.
# Fitting the scaler on the full matrix would let the held-out rows influence
# the mean/variance used to transform the training data (data leakage).
# With the same random_state and row count the split itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: learn the statistics from the training rows only,
# then apply the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept for any later cell that uses it
X_scaled = scaler.transform(X)

# Train the XGBoost model (baseline hyperparameters)
model_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model_xgb.fit(X_train, y_train)

# Predictions and evaluation on the held-out test set
y_pred_xgb = model_xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Mean Squared Error:", mse_xgb)
print("XGBoost R-squared:", r2_xgb)

# Hyperparameter tuning using GridSearchCV:
# 3 x 3 x 3 = 27 candidates, each evaluated with 3-fold cross-validation on
# the training set; the scorer is negated MSE, so higher is better.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(xgb.XGBRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# best_estimator_ is refit on the whole training set with the winning parameters
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

best_y_pred = best_model.predict(X_test)

best_mse = mean_squared_error(y_test, best_y_pred)
best_r2 = r2_score(y_test, best_y_pred)

print("Best Model Mean Squared Error:", best_mse)
print("Best Model R-squared:", best_r2)

# Final Prediction Example
# Prepare inputs for prediction



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515 entries, 0 to 514
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   restaurant   515 non-null    object 
 1   item         515 non-null    object 
 2   calories     515 non-null    int64  
 3   cal_fat      515 non-null    int64  
 4   total_fat    515 non-null    int64  
 5   sat_fat      515 non-null    float64
 6   trans_fat    515 non-null    float64
 7   cholesterol  515 non-null    int64  
 8   sodium       515 non-null    int64  
 9   total_carb   515 non-null    int64  
 10  fiber        503 non-null    float64
 11  sugar        515 non-null    int64  
 12  protein      514 non-null    float64
 13  vit_a        301 non-null    float64
 14  vit_c        305 non-null    float64
 15  calcium      305 non-null    float64
 16  salad        515 non-null    object 
dtypes: float64(7), int64(7), object(3)
memory usage: 68.5+ KB
XGBoost Mean Squared Error: 3609.0

In [5]:
# Per-column count of missing values still present in `df`
df.isna().sum()

restaurant     0
item           0
calories       0
cal_fat        0
total_fat      0
sat_fat        0
trans_fat      0
cholesterol    0
sodium         0
total_carb     0
fiber          0
sugar          0
protein        0
vit_a          0
vit_c          0
calcium        0
salad          0
dtype: int64

In [4]:
# Another prediction example.
# The values are listed in the same order as `num_columns`. Wrapping them in a
# DataFrame with those column names matches the feature names the scaler was
# fitted with, which avoids scikit-learn's "X does not have valid feature
# names" warning and makes the value-to-feature mapping explicit.
another_input = pd.DataFrame(
    [[50, 5, 1.5, 0, 65, 630, 35, 2, 3, 24, 4, 6, 15]],
    columns=num_columns,
)
another_input_scaled = scaler.transform(another_input)  # Scale the input

# Make prediction with the tuned model
another_final_prediction = best_model.predict(another_input_scaled)
print("Another Final Prediction Output:", another_final_prediction[0])

Another Final Prediction Output: 315.12082




In [None]:
import numpy as np
import joblib

# Example input, with values in the same order as `num_columns`.
# A DataFrame with the training column names keeps the feature names
# consistent with what the scaler was fitted on.
example_input = pd.DataFrame(
    [[60, 7, 2, 0, 95, 1110, 44, 3, 11, 37, 4, 20, 20]],
    columns=num_columns,
)

# Scale the input with the scaler fitted earlier in this notebook.
# The models were trained on features transformed by that StandardScaler, so
# the same fitted object must be reused here. Rebinding `scaler` to a
# different, separately pickled scaler would feed the model features on a
# different scale and would also break any later cell that relies on `scaler`.
example_input_scaled = scaler.transform(example_input)

# Make prediction using the tuned model from the grid search
# (no object named `model` is defined in this notebook).
y_pred = best_model.predict(example_input_scaled)

print("Predicted output:", y_pred[0])
