# In [8]:
# Import packages

import pandas as pd
import seaborn as sns

# Load the penguins example dataset through seaborn.
# cache=False is passed so the local seaborn data cache is not used.
penguins = sns.load_dataset("penguins", cache=False)

# Examine first 5 rows of dataset
penguins.head()

# Data cleaning, done as one non-inplace chain so that no in-place
# operation ever runs on a column subset of the original frame (the pattern
# that raises SettingWithCopyWarning in pandas without copy-on-write):
#   1. subset: keep only the variables relevant to the model
#   2. rename: "sex" -> "gender", by name rather than by position, so a
#      change in the column order above cannot mislabel a column
#   3. dropna: discard rows with a missing value in any kept column
#   4. reset_index: renumber rows 0..n-1 and discard the old index
penguins = (
    penguins[["body_mass_g", "bill_length_mm", "sex", "species"]]
    .rename(columns={"sex": "gender"})
    .dropna()
    .reset_index(drop=True)
)

# Examine first 5 rows of the cleaned data
penguins.head()


# Create a holdout sample so the fitted model can later be evaluated on
# rows that were not used for fitting.
# Import train-test-split function from scikit-learn
from sklearn.model_selection import train_test_split

# Separate the predictors (x) from the response (y).
# Double brackets keep both selections as DataFrames.
penguins_x = penguins[["bill_length_mm", "gender", "species"]]
penguins_y = penguins[["body_mass_g"]]

# Split into training and holdout (testing) sets: 30% of the rows are held
# out, and the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    penguins_x,
    penguins_y,
    test_size=0.3,
    random_state=42,
)

# Model construction: regress body mass on bill length (continuous) plus
# gender and species, which are wrapped in C() to be treated as categorical.
# Import ols() function from statsmodels package
from statsmodels.formula.api import ols

# OLS formula as a string: response on the left of "~", predictors on the right
ols_formula = "body_mass_g ~ bill_length_mm + C(gender) + C(species)"

# Put the training predictors and the training response side by side in one
# DataFrame so it can be passed as the `data` argument of ols()
ols_data = pd.concat([x_train, y_train], axis="columns")

# Create the OLS object from the formula, then fit the model
OLS = ols(ols_formula, data=ols_data)
model = OLS.fit()

# Get model results
model.summary()


