<a href="https://colab.research.google.com/github/dlcodnjs2/2023/blob/main/linear_models_for_regression_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

targetUrl = "https://raw.githubusercontent.com/dlcodnjs2/2023/main/auto-mpg.csv"

df = pd.read_csv(targetUrl, sep=',')

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [3]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['horsepower'] = label_encoder.fit_transform(df['horsepower'])
df['car name'] = label_encoder.fit_transform(df['car name'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 28.1 KB


In [4]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

df_imputed = imputer.fit_transform(df)

df_imputed2 = pd.DataFrame(df_imputed)

df_imputed2.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']

df_imputed2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    float64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    float64
 7   origin        398 non-null    float64
 8   car name      398 non-null    float64
dtypes: float64(9)
memory usage: 28.1 KB


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(['mpg', 'car name'],axis=1)
y = df['mpg']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=156)

In [6]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(X_train.iloc[:,0:])
X_train.iloc[:,0:]= std.transform(X_train.iloc[:,0:])

X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
343,-0.865303,-1.083138,0.129181,-1.411107,0.516755,1.414454,1.841446
112,-0.865303,-0.683483,0.977769,-0.767954,1.082616,-0.803985,-0.711781
6,1.464521,2.402228,-0.176311,1.600706,-2.277183,-1.635899,-0.711781
315,-0.865303,-0.413948,1.147486,0.035119,1.648477,1.137149,-0.711781
325,-0.865303,-0.980901,-0.040537,-1.028691,2.214338,1.137149,0.564833


In [7]:
X_test.iloc[:,0:] = std.transform(X_test.iloc[:,0:])
X_test.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
28,1.464521,1.008082,-0.379972,2.038746,1.082616,-1.635899,-0.711781
273,-0.865303,-0.711366,1.385091,-0.657864,-0.190571,0.58254,1.841446
284,0.299609,0.273831,-1.500108,0.448824,0.410656,0.859844,-0.711781
77,-0.865303,-0.692777,0.672277,-0.535028,0.905785,-1.081289,0.564833
173,-0.865303,-0.711366,1.385091,-0.495627,0.552122,-0.249375,1.841446


In [8]:
df_cat_std = pd.concat([X_train,y_train], axis=1)

df_cat_std2 = pd.concat([X_test,y_test], axis=1)

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

degrees = [1, 2, 3, 4, 5]

y_train_pred = np.zeros((len(X_train), len(degrees)))
y_test_pred = np.zeros((len(X_test), len(degrees)))

for degree in degrees:
  model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
  neg_mse_scores = cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
  rmse_scores  = np.sqrt(-1 * neg_mse_scores)
  avg_rmse = np.mean(rmse_scores)

print(' 5 folds 의 개별 Negative MSE scores: ', np.round(neg_mse_scores, 2))
print(' 5 folds 의 개별 RMSE scores : ', np.round(rmse_scores, 2))
print(' 5 folds 의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 folds 의 개별 Negative MSE scores:  [-785.8  -959.63 -442.89 -301.02 -190.06]
 5 folds 의 개별 RMSE scores :  [28.03 30.98 21.04 17.35 13.79]
 5 folds 의 평균 RMSE : 22.238 


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

model = Ridge()

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_train)

grid_params = {'alpha' : [1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 5.0, 10.0]}

gs = GridSearchCV(model, grid_params, scoring='neg_mean_squared_error',cv=5)
gs.fit(X_poly, y_train)

print("Best Parameter: ", gs.best_params_)
print("Best MSE Score: ", gs.best_score_)

Best Parameter:  {'alpha': 10.0}
Best MSE Score:  -9.966364485050107
