In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

<h1>Cargar los datos</h1>

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Columnas del conjunto de datos
column_names = ["MPG", "Cylinders", "Displacement", "Horsepower", "Weight", "Acceleration", "Model Year", "Origin", "Car Name"]

# Lee el conjunto de datos en un DataFrame
dataframe = pd.read_csv(url, delim_whitespace=True, names=column_names)
column_data_types = dataframe.dtypes

print(column_data_types)

MPG             float64
Cylinders         int64
Displacement    float64
Horsepower       object
Weight          float64
Acceleration    float64
Model Year        int64
Origin            int64
Car Name         object
dtype: object


In [3]:
import numpy as np

dataframe["Horsepower"] = dataframe["Horsepower"].replace('?', np.nan)
dataframe=dataframe.dropna()
dataframe["Horsepower"] = dataframe["Horsepower"].astype(float)

In [4]:
dataframe.drop("Car Name", axis=1, inplace=True)

In [5]:
x = dataframe.drop("MPG", axis=1)
y = dataframe["MPG"]

In [6]:
scaler = StandardScaler()
x= scaler.fit_transform(x)

<h1>Validación cruzada con regresión lineal</h1>

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [8]:
modelo_regresion = LinearRegression()

In [9]:
scores = cross_val_score(modelo_regresion, x, y, cv=10, scoring='neg_mean_squared_error')


In [10]:
mse_scores = -scores

# Imprimir los resultados de la validación cruzada
print("Error cuadrático medio en cada división:", mse_scores)
print("Error cuadrático medio promedio:", mse_scores.mean())

Error cuadrático medio en cada división: [ 9.58828251  8.87472359 11.57301071  8.30423008  4.6629437   6.74765794
 18.41064555 12.59276339 30.92977951 14.89550975]
Error cuadrático medio promedio: 12.657954671776423


In [11]:
scores = cross_val_score(modelo_regresion, x, y, cv=10, scoring='r2')

In [12]:
print("Coeficiente de determinación (R²) en cada división:", scores)
print("Coeficiente de determinación (R²) promedio:", scores.mean())

Coeficiente de determinación (R²) en cada división: [0.6467103  0.77241016 0.59004729 0.7865709  0.80982887 0.83918792
 0.57751578 0.75474699 0.10028113 0.54874121]
Coeficiente de determinación (R²) promedio: 0.6426040546895855


<h1>Validación cruzada con bosque aleatorio</h1>

In [13]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
modelo_bosque = RandomForestRegressor(n_estimators=100, random_state=42)

In [15]:
scores = cross_val_score(modelo_bosque, x, y, cv=5, scoring='neg_mean_squared_error')


In [16]:
mse_scores = -scores

In [17]:
print("Error cuadrático medio en cada división:", mse_scores)
print("Error cuadrático medio promedio:", mse_scores.mean())

Error cuadrático medio en cada división: [ 3.16142182  5.78300403  4.35145479 10.84264767 21.74271132]
Error cuadrático medio promedio: 9.176247926030507


In [18]:
scores = cross_val_score(modelo_bosque, x, y, cv=10, scoring='r2')

In [19]:
print("Coeficiente de determinación (R²) en cada división:", scores)
print("Coeficiente de determinación (R²) promedio:", scores.mean())

Coeficiente de determinación (R²) en cada división: [0.88878799 0.90244952 0.79666913 0.86932073 0.82250406 0.92314889
 0.76532174 0.79728907 0.40300131 0.53087672]
Coeficiente de determinación (R²) promedio: 0.7699369173736083


<h1>Validación cruzada con kneightbours</h1>

In [20]:
from sklearn.neighbors import KNeighborsRegressor

In [21]:
k = 5
knn = KNeighborsRegressor(n_neighbors=k)

In [22]:
scores = cross_val_score(knn, x, y, cv=5, scoring='neg_mean_squared_error')


In [23]:
mse_scores = -scores

In [24]:
print("Coeficiente de determinación (R²) en cada división:", scores)
print("Coeficiente de determinación (R²) promedio:", scores.mean())

Coeficiente de determinación (R²) en cada división: [ -5.0843038   -5.62184304  -6.70945641 -13.02497949 -28.5262    ]
Coeficiente de determinación (R²) promedio: -11.793356546575785


<h1>Hold out con gradient bootsting</h1>

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [28]:
n_estimators = 100  
learning_rate = 0.1  
gb_regressor = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)


In [29]:
gb_regressor.fit(X_train, y_train)

In [30]:
y_pred = gb_regressor.predict(X_test)

In [33]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Error cuadrático medio:", mse)
print("Coeficiente de determinación (R2):", r2)

Error cuadrático medio: 6.102402116531098
Coeficiente de determinación (R2): 0.8804401962286983


<h1>Hold out con Kneightbours</h1>

In [34]:
k = 5  
knn_regressor = KNeighborsRegressor(n_neighbors=k)

In [36]:
knn_regressor.fit(X_train, y_train)

In [38]:
y_pred = knn_regressor.predict(X_test)

In [39]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Error cuadrático medio:", mse)
print("Coeficiente de determinación (R2):", r2)

Error cuadrático medio: 7.054698734177215
Coeficiente de determinación (R2): 0.8617825603398058
