In [4]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

In [5]:
print("--- 1. Carregando os Dados ---")
# Using the previously uploaded file
path = 'carros_limpo (1).csv'
df = pd.read_csv(path)

# Drop any potential NaN values in 'price' or target columns to avoid errors
df = df.dropna(subset=['price'])
df = df._get_numeric_data()
df.fillna(df.mean(), inplace=True) # Fill other NaNs just in case
print("Formato dos dados:", df.shape)

--- 1. Carregando os Dados ---
Formato dos dados: (201, 20)


In [6]:
print("\n--- 2. Preparando os Dados ---")
y_data = df['price']
x_data = df.drop('price', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)
print("Amostras de teste:", x_test.shape[0])
print("Amostras de treino:", x_train.shape[0])


--- 2. Preparando os Dados ---
Amostras de teste: 21
Amostras de treino: 180


In [7]:
print("\n--- 3. Regressão Linear Simples ('horsepower') ---")
lre = LinearRegression()
lre.fit(x_train[['horsepower']], y_train)
print(f"Score no Treino (R^2): {lre.score(x_train[['horsepower']], y_train):.4f}")
print(f"Score no Teste (R^2): {lre.score(x_test[['horsepower']], y_test):.4f}")

Rcross = cross_val_score(lre, x_data[['horsepower']], y_data, cv=4)
print(f"Validação Cruzada (R^2 médio): {Rcross.mean():.4f}")


--- 3. Regressão Linear Simples ('horsepower') ---
Score no Treino (R^2): 0.6620
Score no Teste (R^2): 0.3635
Validação Cruzada (R^2 médio): 0.5221


In [8]:
print("\n--- 4. Regressão Linear Múltipla ---")
cols = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']
lr = LinearRegression()
lr.fit(x_train[cols], y_train)
print(f"Score Teste Múltipla (R^2): {lr.score(x_test[cols], y_test):.4f}")


--- 4. Regressão Linear Múltipla ---
Score Teste Múltipla (R^2): 0.6070


In [9]:
print("\n--- 5. Regressão Polinomial (Grau 5) ---")
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_data, y_data, test_size=0.45, random_state=0)
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train2[['horsepower']])
x_test_pr = pr.fit_transform(x_test2[['horsepower']])
poly = LinearRegression()
poly.fit(x_train_pr, y_train2)
print(f"Score Treino Polinomial: {poly.score(x_train_pr, y_train2):.4f}")
print(f"Score Teste Polinomial (Overfitting evidente!): {poly.score(x_test_pr, y_test2):.4f}")



--- 5. Regressão Polinomial (Grau 5) ---
Score Treino Polinomial: 0.5569
Score Teste Polinomial (Overfitting evidente!): -29.8156


In [10]:
print("\n--- 6. Ridge Regression ---")
pr2 = PolynomialFeatures(degree=2)
features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']
x_train_pr2 = pr2.fit_transform(x_train[features])
x_test_pr2 = pr2.fit_transform(x_test[features])

RigeModel = Ridge(alpha=1)
RigeModel.fit(x_train_pr2, y_train)
print(f"Ridge Reg. Score Teste (Alpha=1): {RigeModel.score(x_test_pr2, y_test):.4f}")



--- 6. Ridge Regression ---
Ridge Reg. Score Teste (Alpha=1): 0.7048


In [11]:
print("\n--- 7. Grid Search para Ridge Regression ---")
parameters1 = [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000]}]
RR = Ridge()
Grid1 = GridSearchCV(RR, parameters1, cv=4)
Grid1.fit(x_data[cols], y_data)
BestRR = Grid1.best_estimator_
print(f"Melhor Estimador (Alpha encontrado): {BestRR.alpha}")
print(f"Score do GridSearch no Teste: {BestRR.score(x_test[cols], y_test):.4f}")


--- 7. Grid Search para Ridge Regression ---
Melhor Estimador (Alpha encontrado): 10000
Score do GridSearch no Teste: 0.6373
