In [9]:
# Import packages

import pandas as pd
import seaborn as sns

# Load dataset
# The untouched download is kept under its own name so this step can be re-run
# without depending on an already-cleaned frame.
penguins_raw = sns.load_dataset("penguins", cache=False)

# Examine first 5 rows of the raw dataset.
# display() is needed here: a bare expression is only rendered by the notebook
# when it is the last statement of a cell.
display(penguins_raw.head())

# Data cleaning: subset the columns of interest, rename, and drop rows with
# missing values. Subsetting keeps the focus on the variables used in the model.
# Each step returns a new frame (no inplace=True), which avoids mutating a
# slice of the raw data and the SettingWithCopyWarning that can come with it.
penguins = (
    penguins_raw[["body_mass_g", "bill_length_mm", "sex", "species"]]
    # Rename by explicit mapping rather than by position, so a change in the
    # column selection above cannot silently mislabel a column.
    .rename(columns={"sex": "gender"})
    # Drop rows with missing values
    .dropna()
    # Renumber rows 0..n-1 after the drop, discarding the old index
    .reset_index(drop=True)
)

# Examine first 5 rows of the cleaned data
display(penguins.head())


# Hold out part of the data so the model can be tested on rows it was not fit on.
from sklearn.model_selection import train_test_split

# Predictors (x) and response (y). The response is selected with a list so it
# stays a one-column DataFrame rather than a Series.
predictor_columns = ["bill_length_mm", "gender", "species"]
response_columns = ["body_mass_g"]
penguins_x = penguins[predictor_columns]
penguins_y = penguins[response_columns]

# 30% of the rows go to the holdout (test) sets; the fixed random_state makes
# the split repeatable from run to run.
holdout_split = train_test_split(
    penguins_x,
    penguins_y,
    test_size=0.3,
    random_state=42,
)
x_train, x_test, y_train, y_test = holdout_split



# ------------------------------------------------------------------
# Model construction
# bill_length_mm is a continuous predictor; gender and species are
# categorical, so they are wrapped in C() in the formula below.
# ------------------------------------------------------------------
from statsmodels.formula.api import ols

# ols() is given a single data frame, so the training predictors and the
# training response are placed side by side in one frame.
ols_data = pd.concat([x_train, y_train], axis="columns")

# OLS formula written out as a string: response ~ predictors
ols_formula = "body_mass_g ~ bill_length_mm + C(gender) + C(species)"

# Build the OLS object from the formula, then fit it to the training data
OLS = ols(ols_formula, data=ols_data)
model = OLS.fit()

# ------------------------------------------------------------------
# Model evaluation and interpretation
# ------------------------------------------------------------------

# The summary table lists one beta coefficient per X variable, which can now
# be interpreted. The gender term appears as C(gender)[T.Male], i.e. the
# variable was encoded as Male = 1, Female = 0.

# Get model results (left as the last expression so the notebook renders it)
model.summary()




0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.85
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,322.6
Date:,"Tue, 13 Aug 2024",Prob (F-statistic):,1.31e-92
Time:,22:35:56,Log-Likelihood:,-1671.7
No. Observations:,233,AIC:,3353.0
Df Residuals:,228,BIC:,3371.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2032.2111,354.087,5.739,0.000,1334.510,2729.913
C(gender)[T.Male],528.9508,55.105,9.599,0.000,420.371,637.531
C(species)[T.Chinstrap],-285.3865,106.339,-2.684,0.008,-494.920,-75.853
C(species)[T.Gentoo],1081.6246,94.953,11.391,0.000,894.526,1268.723
bill_length_mm,35.5505,9.493,3.745,0.000,16.845,54.256

0,1,2,3
Omnibus:,0.339,Durbin-Watson:,1.948
Prob(Omnibus):,0.844,Jarque-Bera (JB):,0.436
Skew:,0.084,Prob(JB):,0.804
Kurtosis:,2.871,Cond. No.,798.0
