문제) 'auto-mpg.csv' 파일에 대하여 EDA를 수행한 후 선형 회귀 모델을 구현한다.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers

In [2]:
# IPython display setting - raise the limit on the number of columns shown in output
pd.set_option( 'display.max_columns', 10 )

# 단순 선형 회귀 모델

## 1. 데이터 준비

In [3]:
# Load the CSV file into a DataFrame. header = None tells pandas that no row of
# the file holds column names, so the columns start out with integer labels.
raw_df = pd.read_csv( 'auto-mpg.csv', header = None, encoding = 'utf-8' )

In [4]:
# Assign the column names (nine columns, in file order)
raw_df.columns = [ 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
               'acceleration', 'model', 'origin', 'name' ]

In [5]:
raw_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [6]:
df = raw_df.copy()

In [7]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


### 'name'변수 제거

In [8]:
df.pop( 'name' )

0      chevrolet chevelle malibu
1              buick skylark 320
2             plymouth satellite
3                  amc rebel sst
4                    ford torino
                 ...            
393              ford mustang gl
394                    vw pickup
395                dodge rampage
396                  ford ranger
397                   chevy s-10
Name: name, Length: 398, dtype: object

In [9]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


## 2. 데이터 탐색

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model         398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [11]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


#### 엔진 출력인 'horsepower' 변수가 문자열( object ) 자료형이라 describe() 결과에 포함되어 있지 않으므로 숫자형으로 자료형 변경

In [12]:
df[ 'horsepower' ].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [13]:
# '?' is the placeholder used for a missing horsepower reading. Turn it into
# NaN and drop the affected rows.
# The result of replace() is assigned back to the column on purpose:
# df[ 'horsepower' ].replace( ..., inplace = True ) is a chained in-place call
# that, under pandas copy-on-write, only modifies a temporary object and
# leaves df unchanged (so the '?' rows would never be dropped).
df[ 'horsepower' ] = df[ 'horsepower' ].replace( '?', np.nan )
df.dropna( subset = [ 'horsepower' ], axis = 0, inplace = True )
# Show the remaining distinct values to confirm that '?' is gone.
df[ 'horsepower' ].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [14]:
# Convert the horsepower column to floating point so it can be used numerically.
df[ 'horsepower' ] = df[ 'horsepower' ].astype( 'float' )

In [15]:
# Remove the 'origin' column from df (in place) and keep the removed column in `origin`.
origin = df.pop( 'origin' )

In [16]:
# One indicator column per origin code: 1.0 where the row has that code, 0.0 otherwise.
# Multiplying the boolean mask by 1.0 turns it into a float column.
for code, country in ( ( 1, 'USA' ), ( 2, 'Europe' ), ( 3, 'Japan' ) ):
    df[ country ] = ( origin == code ) * 1.0

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model         392 non-null    int64  
 7   USA           392 non-null    float64
 8   Europe        392 non-null    float64
 9   Japan         392 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 33.7 KB


In [18]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,USA,Europe,Japan
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,0.625,0.173469,0.201531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.484742,0.379136,0.401656
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,0.0,0.0,0.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,1.0,1.0,1.0


### 훈련용 데이터 셋과 테스트용 데이터 셋 분리

In [19]:
# 80 / 20 split. The training rows are drawn at random with a fixed seed so the
# split is reproducible; the test set is every row whose index label was not drawn.
train_fraction = 0.8
train_df = df.sample( frac = train_fraction, random_state = 0 )
test_df = df.drop( index = train_df.index )

In [20]:
print( 'train data 개수 : {:5d}'.format( len( train_df ) ) )
print( 'test data 개수 : {:5d}'.format( len( test_df ) ) )

train data 개수 :   314
test data 개수 :    78


In [21]:
# 학습용 데이터
train_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,USA,Europe,Japan
146,28.0,4,90.0,75.0,2125.0,14.5,74,1.0,0.0,0.0
282,22.3,4,140.0,88.0,2890.0,17.3,79,1.0,0.0,0.0
69,12.0,8,350.0,160.0,4456.0,13.5,72,1.0,0.0,0.0
378,38.0,4,105.0,63.0,2125.0,14.7,82,1.0,0.0,0.0
331,33.8,4,97.0,67.0,2145.0,18.0,80,0.0,0.0,1.0


In [22]:
# 테스트용 데이터
test_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,USA,Europe,Japan
9,15.0,8,390.0,190.0,3850.0,8.5,70,1.0,0.0,0.0
25,10.0,8,360.0,215.0,4615.0,14.0,70,1.0,0.0,0.0
28,9.0,8,304.0,193.0,4732.0,18.5,70,1.0,0.0,0.0
31,25.0,4,113.0,95.0,2228.0,14.0,71,0.0,0.0,1.0
33,19.0,6,232.0,100.0,2634.0,13.0,71,1.0,0.0,0.0


## 3. 속성 선택

- 단순 선형 회귀 모델 변수로 사용할 후보 변수를 선택한다.
- 예측 목표 변수인 종속 변수( y )가 될 'mpg' 변수와 독립 변수( X )로 사용할 후보로 3개 변수(  'cylinders', 'horsepower', 'weight' )를 포함

In [23]:
# From here on `df` is rebound to a four-column selection of the training set:
# the target 'mpg' plus the three candidate predictors.
df = train_df[ [ 'mpg', 'cylinders', 'horsepower', 'weight' ] ]
df.head()

Unnamed: 0,mpg,cylinders,horsepower,weight
146,28.0,4,75.0,2125.0
282,22.3,4,88.0,2890.0
69,12.0,8,160.0,4456.0
378,38.0,4,63.0,2125.0
331,33.8,4,67.0,2145.0


In [24]:
df.describe()

Unnamed: 0,mpg,cylinders,horsepower,weight
count,314.0,314.0,314.0,314.0
mean,23.31051,5.477707,104.869427,2990.251592
std,7.728652,1.699788,38.096214,843.898596
min,10.0,3.0,46.0,1649.0
25%,17.0,4.0,76.25,2256.5
50%,22.0,4.0,94.5,2822.5
75%,28.95,8.0,128.0,3608.0
max,46.6,8.0,225.0,5140.0


### 종속 변수 y인 'mpg'( 연비 )와 다른 변수간의 선형관계 파악을 위한 시각화

In [25]:
df.corr()

Unnamed: 0,mpg,cylinders,horsepower,weight
mpg,1.0,-0.770246,-0.764545,-0.819802
cylinders,-0.770246,1.0,0.844247,0.893869
horsepower,-0.764545,0.844247,1.0,0.857417
weight,-0.819802,0.893869,0.857417,1.0


In [None]:
sns.pairplot( df )

<seaborn.axisgrid.PairGrid at 0x1fdbcc32dc8>

### 산점도를 통해 'mpg' 변수와 선형관계를 보이는 'horsepower' 변수와 'weight' 변수를 독립변수 X로 선택하는 것을 고려한다.

### 정규화( 표준화 : 각 변수에서 평균을 빼고 표준편차로 나눈다 )

In [None]:
# Summary statistics of the candidate predictors: the 'mpg' target is removed,
# then the table is transposed so each variable is a row and each statistic
# ( 'mean', 'std', ... ) is a column.
df_stats = df.describe().drop( columns = 'mpg' ).transpose()
df_stats

### y( Label ) 분리

In [None]:
# Split off the labels. pop() removes 'mpg' from train_df / test_df in place,
# so after this cell those frames no longer contain the column
# ( running the cell a second time would raise KeyError ).
y_train = train_df.pop( 'mpg' )
y_test = test_df.pop( 'mpg' )

In [None]:
print( len( y_train ), len( y_test ) )

In [None]:
def normalization( x ):
    """Standardize the columns of ``x`` that are described by ``df_stats``.

    Parameters
    ----------
    x : DataFrame
        Must contain every column named in the index of ``df_stats``.

    Returns
    -------
    DataFrame
        Only the columns named in ``df_stats``, each transformed as
        ``( value - mean ) / std`` using the 'mean' and 'std' columns of
        ``df_stats``.
    """
    # Subtracting a Series from a DataFrame aligns on column labels. Any column
    # of x without an entry in df_stats would come out as all-NaN, so restrict
    # x to the described columns before scaling.
    features = x[ list( df_stats.index ) ]
    return ( features - df_stats[ 'mean' ] ) / df_stats[ 'std' ]

# Both sets are scaled with the same statistics held in df_stats.
normed_train_df = normalization( train_df )
normed_test_df = normalization( test_df )

## 4. 모델 학습

### 훈련 / 테스트 데이터 NumPy  배열로 변환

In [None]:
# Simple regression: 'weight' is the only independent variable X.
# The double-bracket selection keeps the result two-dimensional ( n_samples, 1 ).
feature_columns = [ 'weight' ]
X_train = np.asarray( normed_train_df[ feature_columns ] )
X_test = np.asarray( normed_test_df[ feature_columns ] )

In [None]:
print( len( X_train ), len( X_test ) )

In [None]:
print( len( y_train ), len( y_test ) )

### Scikit-learn 사용

In [None]:
# Fit ordinary least squares on the training data; fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit( X_train, y_train )
model

In [None]:
# Coefficient of determination ( R-squared ) computed on the test data
r_square = model.score( X_test, y_test )
print( '결정계수( R-제곱 ) : {}'.format( r_square ) )

In [None]:
print( '회귀식의 기울기( W ) : {}'.format( model.coef_ ) )
print( '회귀식의 절편( b ) : {}'.format( model.intercept_ ) )

In [None]:
y_predict = model.predict( X_test )

In [None]:
# Compare the distribution of the true test labels with that of the predictions.
# sns.distplot is deprecated; with hist = False it only drew the KDE curve,
# so sns.kdeplot is the direct replacement.
plt.figure( figsize = ( 10, 5 ) )
ax1 = sns.kdeplot( y_test, label = 'y_test' )
ax2 = sns.kdeplot( y_predict, label = 'y_predict', ax = ax1 )
# The labels only become visible once a legend is requested.
ax1.legend()

plt.show()

### Keras 사용

In [None]:
# Make sure features and labels are all NumPy arrays before handing them to Keras.
X_train, X_test, y_train, y_test = (
    np.array( data ) for data in ( X_train, X_test, y_train, y_test )
)

In [None]:
# A single Dense unit with a linear activation computes y = W * x + b,
# i.e. a linear regression with one input feature.
model = Sequential()
model.add( Dense( 1, input_dim = 1, activation = 'linear' ) )
# 'lr' is a deprecated alias that current Keras releases reject;
# 'learning_rate' is the supported argument name.
sgd = optimizers.SGD( learning_rate = 0.0001 )
# Metric order matters later: evaluate() reports the loss followed by the
# metrics in the order given here ( mse, then mae ).
model.compile( optimizer = sgd ,loss = 'mse', metrics = [ 'mse', 'mae' ] )
# validation_split = 0.2 sets aside 20% of the training data for per-epoch validation.
history = model.fit( X_train, y_train, batch_size = 1, epochs = 300, validation_split = 0.2 )

In [None]:
# Per-epoch training and validation loss recorded by fit().
history_dict = history.history
loss, val_loss = history_dict[ 'loss' ], history_dict[ 'val_loss' ]

# 1-based epoch numbers for the x axis
epochs = range( 1, 1 + len( loss ) )

In [None]:
# Training loss ( blue ) against validation loss ( red ) per epoch.
plt.plot( epochs, loss, 'b', label = 'Training loss' )
# This curve is the validation loss; it used to carry the 'Training loss' label,
# which made the two legend entries indistinguishable.
plt.plot( epochs, val_loss, 'r', label = 'Validation loss' )
plt.title( 'Training and validation loss' )
plt.xlabel( 'Epochs' )
plt.ylabel( 'Loss' )
plt.legend()

plt.show()

In [None]:
# Training history as a table: one row per epoch, one column per recorded
# quantity, plus an 'epoch' column appended at the end.
hist = pd.DataFrame( history.history ).assign( epoch = history.epoch )
hist.tail()

In [None]:
# Two stacked panels: MAE on top, MSE below, each showing the training curve
# against the validation curve over the epochs.
plt.figure( figsize = ( 8, 12 ) )

# ( subplot position, history column, y-axis label, upper y limit )
panels = (
    ( 1, 'mae', 'Mean Abs Error[MPG]', 5 ),
    ( 2, 'mse', 'Mean Square Error [$MPG^2$]', 20 ),
)
for position, metric, y_label, y_max in panels:
    plt.subplot( 2, 1, position )
    plt.xlabel( 'Epoch' )
    plt.ylabel( y_label )
    plt.plot( hist[ 'epoch' ], hist[ metric ], label = 'Train Error' )
    plt.plot( hist[ 'epoch' ], hist[ 'val_' + metric ], label = 'Val Error' )
    plt.ylim( [ 0, y_max ] )
    plt.legend()

plt.show()

## 5. 평가

In [None]:
# evaluate() returns the loss followed by the metrics in the order they were
# passed to compile(), which is [ 'mse', 'mae' ]. Unpacking in that order keeps
# `mae` bound to the mean absolute error ( it previously received the MSE ).
loss, mse, mae = model.evaluate( X_test, y_test, verbose = 2 )

print( "테스트 세트의 평균 절대 오차: {:5.2f} MPG".format( mae ) )

## 6. 예측

In [None]:
# Predict on the test inputs; flatten() collapses the prediction array to one dimension.
test_predictions = model.predict( X_test ).flatten()

# True values on the x axis against predictions on the y axis.
plt.scatter( y_test, test_predictions)
plt.xlabel( 'True Values [MPG]' )
plt.ylabel( 'Predictions [MPG]' )
plt.axis( 'equal' )
plt.axis( 'square' )
# Start both axes at 0 while keeping the current upper limits.
plt.xlim( [ 0, plt.xlim()[ 1 ] ] )
plt.ylim( [ 0, plt.ylim()[ 1 ] ] )
# Reference line y = x: a point on it is a prediction equal to the true value.
# Binding the result to _ keeps the notebook from echoing it.
_ = plt.plot( [ - 100, 100 ], [ -100, 100 ] )

In [None]:
# Prediction error ( prediction minus true value ) shown as a 25-bin histogram.
error = test_predictions - y_test
plt.hist( error, bins = 25 )
plt.xlabel( "Prediction Error [MPG]" )
# Binding the result to _ keeps the notebook from echoing it.
_ = plt.ylabel( "Count" )

# 다중 선형 회귀 모델

## 4. 모델 학습

### 훈련 / 테스트 데이터 NumPy  배열로 변환

In [None]:
# Multiple regression: 'horsepower' and 'weight' are the independent variables X.
feature_columns = [ 'horsepower', 'weight' ]
X_train = np.asarray( normed_train_df[ feature_columns ] )
X_test = np.asarray( normed_test_df[ feature_columns ] )

In [None]:
print( len( X_train ), len( X_test ) )

In [None]:
print( len( y_train ), len( y_test ) )

### Scikit-learn 사용

In [None]:
# Fit ordinary least squares on the two-feature training data; fit() returns
# the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit( X_train, y_train )
model

In [None]:
# Coefficient of determination ( R-squared ) computed on the test data
r_square = model.score( X_test, y_test )
print( '결정계수( R-제곱 ) : {}'.format( r_square ) )

In [None]:
print( '회귀식의 기울기( W ) : {}'.format( model.coef_ ) )
print( '회귀식의 절편( b ) : {}'.format( model.intercept_ ) )

In [None]:
y_predict = model.predict( X_test )

In [None]:
# Compare the distribution of the true test labels with that of the predictions.
# sns.distplot is deprecated; with hist = False it only drew the KDE curve,
# so sns.kdeplot is the direct replacement.
plt.figure( figsize = ( 10, 5 ) )
ax1 = sns.kdeplot( y_test, label = 'y_test' )
ax2 = sns.kdeplot( y_predict, label = 'y_predict', ax = ax1 )
# The labels only become visible once a legend is requested.
ax1.legend()

plt.show()

### Keras 사용

In [None]:
# Make sure both feature matrices are NumPy arrays before handing them to Keras.
X_train, X_test = np.array( X_train ), np.array( X_test )

In [None]:
# A single Dense unit with a linear activation computes y = W . x + b,
# i.e. a linear regression with two input features.
model = Sequential()
model.add( Dense( 1, input_dim = 2, activation = 'linear' ) )
# 'lr' is a deprecated alias that current Keras releases reject;
# 'learning_rate' is the supported argument name.
sgd = optimizers.SGD( learning_rate = 0.0001 )
# Metric order matters later: evaluate() reports the loss followed by the
# metrics in the order given here ( mse, then mae ).
model.compile( optimizer = sgd ,loss = 'mse', metrics = [ 'mse', 'mae' ] )
# validation_split = 0.2 sets aside 20% of the training data for per-epoch validation.
history = model.fit( X_train, y_train, batch_size = 1, epochs = 300, validation_split = 0.2 )

In [None]:
# Per-epoch training and validation loss recorded by fit().
history_dict = history.history
loss, val_loss = history_dict[ 'loss' ], history_dict[ 'val_loss' ]

# 1-based epoch numbers for the x axis
epochs = range( 1, 1 + len( loss ) )

In [None]:
# Training loss ( blue ) against validation loss ( red ) per epoch.
plt.plot( epochs, loss, 'b', label = 'Training loss' )
# This curve is the validation loss; it used to carry the 'Training loss' label,
# which made the two legend entries indistinguishable.
plt.plot( epochs, val_loss, 'r', label = 'Validation loss' )
plt.title( 'Training and validation loss' )
plt.xlabel( 'Epochs' )
plt.ylabel( 'Loss' )
plt.legend()

plt.show()

In [None]:
# Training history as a table: one row per epoch, one column per recorded
# quantity, plus an 'epoch' column appended at the end.
hist = pd.DataFrame( history.history ).assign( epoch = history.epoch )
hist.tail()

In [None]:
# Two stacked panels: MAE on top, MSE below, each showing the training curve
# against the validation curve over the epochs.
plt.figure( figsize = ( 8, 12 ) )

# ( subplot position, history column, y-axis label, upper y limit )
panels = (
    ( 1, 'mae', 'Mean Abs Error[MPG]', 5 ),
    ( 2, 'mse', 'Mean Square Error [$MPG^2$]', 20 ),
)
for position, metric, y_label, y_max in panels:
    plt.subplot( 2, 1, position )
    plt.xlabel( 'Epoch' )
    plt.ylabel( y_label )
    plt.plot( hist[ 'epoch' ], hist[ metric ], label = 'Train Error' )
    plt.plot( hist[ 'epoch' ], hist[ 'val_' + metric ], label = 'Val Error' )
    plt.ylim( [ 0, y_max ] )
    plt.legend()

plt.show()

## 5. 평가

In [None]:
# evaluate() returns the loss followed by the metrics in the order they were
# passed to compile(), which is [ 'mse', 'mae' ]. Unpacking in that order keeps
# `mae` bound to the mean absolute error ( it previously received the MSE ).
loss, mse, mae = model.evaluate( X_test, y_test, verbose = 2 )

print( "테스트 세트의 평균 절대 오차: {:5.2f} MPG".format( mae ) )

## 6. 예측

In [None]:
# Predict on the test inputs; flatten() collapses the prediction array to one dimension.
test_predictions = model.predict( X_test ).flatten()

# True values on the x axis against predictions on the y axis.
plt.scatter( y_test, test_predictions)
plt.xlabel( 'True Values [MPG]' )
plt.ylabel( 'Predictions [MPG]' )
plt.axis( 'equal' )
plt.axis( 'square' )
# Start both axes at 0 while keeping the current upper limits.
plt.xlim( [ 0, plt.xlim()[ 1 ] ] )
plt.ylim( [ 0, plt.ylim()[ 1 ] ] )
# Reference line y = x: a point on it is a prediction equal to the true value.
# Binding the result to _ keeps the notebook from echoing it.
_ = plt.plot( [ - 100, 100 ], [ -100, 100 ] )

In [None]:
# Prediction error ( prediction minus true value ) shown as a 25-bin histogram.
error = test_predictions - y_test
plt.hist( error, bins = 25 )
plt.xlabel( "Prediction Error [MPG]" )
# Binding the result to _ keeps the notebook from echoing it.
_ = plt.ylabel( "Count" )