In [15]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np


# Auto MPG data file hosted on the UCI archive (whitespace-delimited, no header row).
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# The separator is a regex, so it is written as a raw string: "\s" inside a
# plain literal is an invalid escape sequence and triggers a SyntaxWarning on
# recent Python versions.
# "?" marks missing values in the file and is read as NaN.
# NOTE(review): comment="\t" is carried over unchanged from the original call;
# how it interacts with the whitespace separator is not obvious — confirm it
# is intended.
autompg_raw = pd.read_csv(
    url,
    sep=r"\s+",
    header=None,
    comment="\t",
    quotechar="\"",
    na_values=["?"],
)

# Assign headers to the dataframe
autompg_raw.columns = ["mpg", "cyl", "disp", "hp", "wt", "acc", "year", "origin", "name"]

# Coerce horsepower to numeric *before* filtering, so that any value that
# fails to parse becomes NaN and is dropped together with the missing ones.
autompg = autompg_raw.assign(hp=pd.to_numeric(autompg_raw["hp"], errors="coerce"))

# Row filters, applied in one pass:
#   - drop rows with missing 'hp'
#   - drop the 'plymouth reliant'
#   - drop 3 and 5 cylinder cars
rows_to_keep = (
    autompg["hp"].notna()
    & (autompg["name"] != "plymouth reliant")
    & ~autompg["cyl"].isin([3, 5])
)
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
autompg = autompg.loc[rows_to_keep].copy()

# Create row names based on the engine, year, and name
autompg.index = (
    autompg["cyl"].astype(str)
    + " cylinder "
    + autompg["year"].astype(str)
    + " "
    + autompg["name"]
)

# Drop the 'name' column (it now lives in the index)
autompg = autompg.drop(columns="name")

# Create a dummy variable for foreign vs. domestic cars. Domestic = 1.
autompg["domestic"] = (autompg["origin"] == 1).astype(int)

# One-hot encode 'cyl' into cyl_4 / cyl_6 / cyl_8, then store each indicator
# as a category variable.
autompg = pd.get_dummies(autompg, columns=["cyl"], prefix="cyl")
autompg = autompg.astype({"cyl_4": "category", "cyl_6": "category", "cyl_8": "category"})


In [12]:
# Add a squared copy of each continuous predictor (<name>_sq) to autompg so the
# quadratic terms can be referenced by name in the formula.
autompg[["disp_sq", "hp_sq", "wt_sq", "acc_sq"]] = autompg[["disp", "hp", "wt", "acc"]].pow(2)

# Full model for log(mpg): every predictor, the four squared terms, and all
# pairwise interactions between the predictors. The padding inside the string
# pieces is kept so the formula text is exactly what it was before.
# NOTE(review): the summary printed for this fit reports Df Model = 38, fewer
# than the 49 non-intercept terms requested here. That suggests linearly
# dependent columns (cyl_4, cyl_6 and cyl_8 are mutually exclusive indicators
# built from one variable) — confirm this is intended.
formula = (
    "np.log(mpg) ~ cyl_4 +cyl_6 + cyl_8 + disp + hp + wt + acc + year + domestic + "
    "           disp_sq + hp_sq + wt_sq + acc_sq + "
    "           (cyl_4 + cyl_6 + cyl_8 +disp + hp + wt + acc + year + domestic)**2"
)

# Fit by ordinary least squares on the prepared frame.
autompg_big_mod = smf.ols(formula=formula, data=autompg).fit()

In [13]:
# Show the index range, per-column non-null counts and dtypes of autompg.
autompg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383 entries, 8 cylinder 70 chevrolet chevelle malibu to 4 cylinder 82 chevy s-10
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   mpg       383 non-null    float64 
 1   disp      383 non-null    float64 
 2   hp        383 non-null    float64 
 3   wt        383 non-null    float64 
 4   acc       383 non-null    float64 
 5   year      383 non-null    int64   
 6   origin    383 non-null    int64   
 7   domestic  383 non-null    int32   
 8   cyl_4     383 non-null    category
 9   cyl_6     383 non-null    category
 10  cyl_8     383 non-null    category
 11  disp_sq   383 non-null    float64 
 12  hp_sq     383 non-null    float64 
 13  wt_sq     383 non-null    float64 
 14  acc_sq    383 non-null    float64 
dtypes: category(3), float64(9), int32(1), int64(2)
memory usage: 47.0+ KB


In [14]:
# Print the regression summary table of the fitted full model.
print(autompg_big_mod.summary())

                            OLS Regression Results                            
Dep. Variable:            np.log(mpg)   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                     106.6
Date:                Mon, 30 Oct 2023   Prob (F-statistic):          7.76e-167
Time:                        15:20:25   Log-Likelihood:                 355.70
No. Observations:                 383   AIC:                            -633.4
Df Residuals:                     344   BIC:                            -479.4
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 3.23

Vemos que ahora sí tenemos las interacciones de todas las variables categóricas, esto es, cyl_4, cyl_6 y cyl_8.