### Feature engineer

#### **Pregunta:** Siguiendo únicamente el paso a paso descrito en el notebook y utilizando como variable dependiente "charges" y el resto de las variables presentes en el dataframe (exceptuando "region") como variables independientes, ¿Entre qué intervalo de valores se encuentra el intercepto para dicho modelo?

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

In [2]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0,yes,southwest,16884.924
1,18.0,male,33.77,1,no,southeast,1725.5523
2,28.0,male,33.0,3,no,southeast,4449.462
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.88,0,no,northwest,3866.8552


* Quitar la columna region

In [4]:
col_elim = df.pop("region")
col_elim

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1355    southeast
1356    northwest
1357    northwest
1358    northwest
1359    southwest
Name: region, Length: 1360, dtype: object

In [5]:
df2 = df.dropna()
df2

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19.0,female,27.900,0,yes,16884.92400
1,18.0,male,33.770,1,no,1725.55230
2,28.0,male,33.000,3,no,4449.46200
3,33.0,male,22.705,0,no,21984.47061
4,32.0,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1355,25.0,male,33.660,4,no,4504.66240
1356,64.0,male,24.700,1,no,30166.61817
1357,28.0,female,25.935,1,no,4133.64165
1358,20.0,female,22.420,0,yes,14711.74380


* Genere variables dummies (binarias) para el género de la persona y si es fumador o no, pensando en añadirlas posteriormente como variables independientes para explicar "charges".

In [6]:
genero_dummy = pd.get_dummies(df2['sex'], prefix='sex', drop_first=True)
fumador_dummy = pd.get_dummies(df2['smoker'], prefix='smoker', drop_first=True)

df = pd.concat([df2, genero_dummy, fumador_dummy], axis=1)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,sex_male,smoker_yes
0,19.0,female,27.900,0,yes,16884.92400,False,True
1,18.0,male,33.770,1,no,1725.55230,True,False
2,28.0,male,33.000,3,no,4449.46200,True,False
3,33.0,male,22.705,0,no,21984.47061,True,False
4,32.0,male,28.880,0,no,3866.85520,True,False
...,...,...,...,...,...,...,...,...
1355,25.0,male,33.660,4,no,4504.66240,True,False
1356,64.0,male,24.700,1,no,30166.61817,True,False
1357,28.0,female,25.935,1,no,4133.64165,False,False
1358,20.0,female,22.420,0,yes,14711.74380,False,True


In [7]:
# Mapear valores de la columna 'sex'
df['sex'] = df['sex'].map({'female': 1, 'male': 2})

# Mapear valores de la columna 'smoker'
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})

In [8]:
x= df[["age", "sex", "bmi", "children", "smoker"]]
y= df["charges"]
x

Unnamed: 0,age,sex,bmi,children,smoker
0,19.0,1,27.900,0,1
1,18.0,2,33.770,1,0
2,28.0,2,33.000,3,0
3,33.0,2,22.705,0,0
4,32.0,2,28.880,0,0
...,...,...,...,...,...
1355,25.0,2,33.660,4,0
1356,64.0,2,24.700,1,0
1357,28.0,1,25.935,1,0
1358,20.0,1,22.420,0,1


In [10]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Create a linear regression object
model = LinearRegression()

# Fit the model to the data
model.fit(x, y)

# Get the coefficients of the linear regression equation
coefficients = model.coef_

# Get the intercept of the linear regression equation
intercept = model.intercept_

# Print the linear regression equation
print("Linear Regression Equation:")
print("y = {:.2f} + {:.2f} * x".format(intercept, coefficients[0]))

Linear Regression Equation:
y = -12069.70 + 258.86 * x


In [21]:
from sklearn.linear_model import LinearRegression

# Create a multiple linear regression object
model = LinearRegression()

# Fit the model to the data
model.fit(x, y)

# Get the coefficients of the multiple linear regression equation
coefficients = model.coef_

# Get the intercept of the multiple linear regression equation
intercept = model.intercept_

# Print the multiple linear regression equation
print("Multiple Linear Regression Equation:")
for i, feature in enumerate(["age", "sex", "bmi", "children", "smoker"]):
    print("{:s}: {:.2f}".format(feature, coefficients[i]))
print("Intercept: {:.2f}".format(intercept))

Multiple Linear Regression Equation:
age: 258.86
sex: -109.36
bmi: 325.89
children: 469.95
smoker: 23822.82
Intercept: -12069.70


In [22]:
import statsmodels.api as sm

# Add a constant term to the X features
X_with_const = sm.add_constant(x)

# Create a multiple linear regression model
model = sm.OLS(y, X_with_const)

# Fit the model to the data
results = model.fit()

# Get the confidence intervals of the intercept
conf_interval = results.conf_int(alpha=0.05)

# Extract the confidence intervals of the intercept
lower_bound = conf_interval.loc['const', 0]
upper_bound = conf_interval.loc['const', 1]

# Print the confidence intervals of the intercept
print("Confidence Intervals of the Intercept:")
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)

Confidence Intervals of the Intercept:
Lower Bound: -14134.359685221658
Upper Bound: -10005.040211927862
