In [77]:
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.stats import shapiro

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

import numpy as np

In [65]:
# Load the fast-food nutrition table from the working directory.
data = pd.read_csv('./fastfood.csv')
# Keep only rows whose total_fat is below 125 (presumably outlier removal — TODO confirm the threshold).
# .copy() makes the filtered frame independent of the loaded one, so the column
# assignments performed on `data` in later cells do not raise SettingWithCopyWarning.
data = data[data['total_fat'] < 125].copy()

In [66]:
# Force the two columns to numeric; any value that cannot be parsed becomes NaN.
for column in ('protein', 'cal_fat'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Mean-impute the listed columns into a separate frame; `data` itself keeps its NaNs.
fill_values = {
    name: data[name].mean()
    for name in ('fiber', 'protein', 'calcium', 'cal_fat')
}
clean_data = data.fillna(fill_values)

# Per-column count of remaining missing values in the imputed frame.
print(clean_data.isna().sum())

restaurant     0
item           0
calories       0
cal_fat        0
total_fat      0
sat_fat        0
trans_fat      0
cholesterol    0
sodium         0
total_carb     0
fiber          0
sugar          0
protein        0
calcium        0
dtype: int64


In [67]:
# Remove the two non-numeric identifier columns so only nutrition values remain.
# NOTE(review): this continues from `data`, not the mean-imputed `clean_data`,
# so any NaNs left in `data` are still present downstream — confirm that is intended.
data = data.drop(columns=['restaurant', 'item'])

In [68]:
# Show the dtype of every remaining column, then the pairwise Pearson correlation matrix.
print(data.dtypes)
data.corr(method='pearson')

calories         int64
cal_fat        float64
total_fat        int64
sat_fat        float64
trans_fat      float64
cholesterol      int64
sodium           int64
total_carb       int64
fiber          float64
sugar            int64
protein        float64
calcium        float64
dtype: object


Unnamed: 0,calories,cal_fat,total_fat,sat_fat,trans_fat,cholesterol,sodium,total_carb,fiber,sugar,protein,calcium
calories,1.0,0.892599,0.890436,0.737299,0.514554,0.781681,0.804922,0.736879,0.319167,0.4742,0.811771,0.392425
cal_fat,0.892599,1.0,0.994596,0.850312,0.624711,0.800751,0.651293,0.43458,0.045422,0.282007,0.673128,0.190728
total_fat,0.890436,0.994596,1.0,0.844278,0.624484,0.796281,0.650101,0.437524,0.053443,0.287212,0.671752,0.195442
sat_fat,0.737299,0.850312,0.844278,1.0,0.79643,0.767767,0.487795,0.291214,-0.034586,0.248399,0.583454,0.314497
trans_fat,0.514554,0.624711,0.624484,0.79643,1.0,0.613262,0.25514,0.120297,-0.123712,0.126449,0.426314,0.119261
cholesterol,0.781681,0.800751,0.796281,0.767767,0.613262,1.0,0.635705,0.288301,-0.054583,0.378594,0.891284,0.195426
sodium,0.804922,0.651293,0.650101,0.487795,0.25514,0.635705,1.0,0.679939,0.328594,0.45445,0.760327,0.323056
total_carb,0.736879,0.43458,0.437524,0.291214,0.120297,0.288301,0.679939,1.0,0.655822,0.553394,0.507886,0.522814
fiber,0.319167,0.045422,0.053443,-0.034586,-0.123712,-0.054583,0.328594,0.655822,1.0,0.235825,0.184344,0.546025
sugar,0.4742,0.282007,0.287212,0.248399,0.126449,0.378594,0.45445,0.553394,0.235825,1.0,0.455295,0.309529


In [69]:
# Response variable and the predictors used for the regression below.
target = 'protein'
features = ['calories', 'cholesterol', 'sodium']
data_columns = [target] + features

# Shapiro-Wilk test on each predictor; p < 0.05 is reported as "non normal".
for feature in features:
    _stat, p = shapiro(data[feature])
    # Resolve the verdict first: a bare `a + b if cond else c` binds as
    # `(a + b) if cond else c`, which would drop the feature name in the "normal" case.
    verdict = 'non normal' if p < 0.05 else 'normal'
    print(f'{feature} {verdict}')

# Restrict the frame to the target plus the selected predictors.
data = data[data_columns]
print(data)

calories non normal
cholesterol non normal
sodium non normal
     protein  calories  cholesterol  sodium
0       37.0       380           95    1110
1       46.0       840          130    1580
2       70.0      1130          220    1920
3       55.0       750          155    1940
4       46.0       920          120    1980
..       ...       ...          ...     ...
510     23.0       780           50    1850
511     23.0       580           60    1270
512     26.0       780           60    1340
513     32.0       720           70    1260
514     28.0       720           55    1340

[513 rows x 4 columns]


In [70]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=123)

In [71]:
# Assemble the right-hand side "f1 + f2 + ..." from the feature list, then fit
# an ordinary least squares model of the target on those features using the training rows.
rhs = ' + '.join(features)
lm1 = smf.ols(f'{target} ~ {rhs}', data=train).fit()

In [75]:
# Plain-text regression report (fit statistics and coefficient table) for the fitted model.
print(lm1.summary())

                            OLS Regression Results                            
Dep. Variable:                protein   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.851
Method:                 Least Squares   F-statistic:                     778.0
Date:                Tue, 08 Jul 2025   Prob (F-statistic):          1.17e-167
Time:                        14:58:20   Log-Likelihood:                -1326.2
No. Observations:                 409   AIC:                             2660.
Df Residuals:                     405   BIC:                             2677.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       2.5108      0.694      3.618      

In [74]:
# Spot-check the fitted model on the row at position 1 of `data`.
# `data` is the frame the train/test split was drawn from, so this row may have been seen in training.
sample_row = data.iloc[1]
lm1.predict(sample_row)

1    43.324037
dtype: float64

In [78]:
# Design matrix: the training predictors with a constant column added.
X = sm.add_constant(train[features])

# Variance inflation factor for each column of the design matrix, keyed by column name.
vif = pd.Series(
    {
        name: variance_inflation_factor(X.values, position)
        for position, name in enumerate(X.columns)
    }
)

# Note: what is printed is the square root of each VIF, not the raw VIF.
print(np.sqrt(vif))

const          2.255865
calories       2.069238
cholesterol    1.623138
sodium         1.673344
dtype: float64
