[자동차 연비 예측]
- 데이터셋 : auto-mpg.csv
- 학습방법 : 지도학습 > 회귀
- 알고리즘 : LinearRegression

1) 데이터 준비 및 feature/target 분석

In [8]:
import pandas as pd

In [9]:
data=pd.read_csv('../data/auto_mpg.csv')

In [10]:
data.corr(numeric_only=True)

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
mpg,1.0,-0.775396,-0.804203,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.932824,-0.543684,-0.370164,-0.609409
weight,-0.831741,0.896017,0.932824,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.417457,1.0,0.288137,0.205873
model year,0.579267,-0.348746,-0.370164,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.581024,0.205873,0.180662,1.0


In [11]:
data[data['horsepower']=='?']

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [12]:
idx=data[data['horsepower']=='?'].index
data=data.drop(idx).copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 30.6+ KB


In [13]:
data['horsepower']=data['horsepower'].astype('int')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(1)
memory usage: 29.1+ KB


In [15]:
data.corr(numeric_only=True)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541,0.565209
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,-0.568932
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,-0.614535
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,-0.585005
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,0.212746
model year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,0.181528
origin,0.565209,-0.568932,-0.614535,-0.455171,-0.585005,0.212746,0.181528,1.0


2) feature/target 분리

In [17]:
feature=data.iloc[:,1:7]
target=data['mpg']

print(f'feature: {feature.shape},{feature.ndim}d')
print(f'target: {target.shape},{target.ndim}d')

feature: (392, 6),2d
target: (392,),1d


3) 데이터셋 준비

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test=train_test_split(feature,target,random_state=8)

In [20]:
print(f'x_train:{x_train.shape},{x_train.ndim}D')
print(f'y_train:{y_train.shape},{y_train.ndim}D')

print(f'x_test:{x_test.shape},{x_test.ndim}D')
print(f'y_test:{y_test.shape},{y_test.ndim}D')

x_train:(294, 6),2D
y_train:(294,),1D
x_test:(98, 6),2D
y_test:(98,),1D


4) 훈련/학습 진행

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
model=LinearRegression()
model.fit(feature,target)

In [23]:
print(f'model.coef_: {len(model.coef_)}개, {model.coef_}')
print(f'model.intercept_: {model.intercept_}')

model.coef_: 6개, [-3.29859089e-01  7.67843024e-03 -3.91355574e-04 -6.79461791e-03
  8.52732469e-02  7.53367180e-01]
model.intercept_: -14.535250480506097


5) 모델 성능 평가

In [24]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [25]:
model.score(feature,target)

0.8092552890383932

In [26]:
pre_data=model.predict(feature)

mse=mean_squared_error(target,pre_data)
rmse=mean_squared_error(target,pre_data,squared=False)
mae=mean_absolute_error(target,pre_data)
r2=r2_score(target,pre_data)

In [27]:
print(f'mse: {mse}')
print(f'rmse: {rmse}')
print(f'mae: {mae}')
print(f'r2: {r2}')

mse: 11.590170981415227
rmse: 3.4044340177796406
mae: 2.618264046728958
r2: 0.8092552890383932
