# auto_mpg 데이터 전처리 ----------------------------------------------------

- 1. 단위 변환 => 컬럼 추가
- 2. 자료형 검사 및 변환
- 3. origin 컬럼에 대한 라벨 => (1-미국, 2-유럽, 3-일본) => 컬럼 추가

In [579]:
# 모듈 로딩 ---------------------------------------------------------
import pandas as pd
import numpy as np

# 파일 관련 변수들 ---------------------------------------
DIR='../Data/'
FILE=DIR+'auto_mpg.csv'

FILE

'../Data/auto_mpg.csv'

## (1) 데이터 로딩 ------------------------------------------------------

In [580]:
# CSV FILE => DataFrame 로딩
mpgDF=pd.read_csv(FILE)

# (2) Data 정보 확인
# 요약 정보 확인
mpgDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


## (2) 데이터 확인 ----------------------------------

In [581]:
# 일부 데이터 확인
mpgDF.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [582]:
mpgDF

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


## (3) 데이터 전처리 ------------------------------------------------------


#### (3-1) 데이터 정제 => 결측치, 중복 처리

In [583]:
# isnull() => sum()
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [584]:
# 전체 column에 대한 unique() 고유값
for col in mpgDF.columns:
    print(f'{col}--------------------------\n{mpgDF[col].unique()}\n')

mpg--------------------------
[18.  15.  16.  17.  14.  24.  22.  21.  27.  26.  25.  10.  11.   9.
 28.  19.  12.  13.  23.  30.  31.  35.  20.  29.  32.  33.  17.5 15.5
 14.5 22.5 24.5 18.5 29.5 26.5 16.5 31.5 36.  25.5 33.5 20.5 30.5 21.5
 43.1 36.1 32.8 39.4 19.9 19.4 20.2 19.2 25.1 20.6 20.8 18.6 18.1 17.7
 27.5 27.2 30.9 21.1 23.2 23.8 23.9 20.3 21.6 16.2 19.8 22.3 17.6 18.2
 16.9 31.9 34.1 35.7 27.4 25.4 34.2 34.5 31.8 37.3 28.4 28.8 26.8 41.5
 38.1 32.1 37.2 26.4 24.3 19.1 34.3 29.8 31.3 37.  32.2 46.6 27.9 40.8
 44.3 43.4 36.4 44.6 40.9 33.8 32.7 23.7 23.6 32.4 26.6 25.8 23.5 39.1
 39.  35.1 32.3 37.7 34.7 34.4 29.9 33.7 32.9 31.6 28.1 30.7 24.2 22.4
 34.  38.  44. ]

cylinders--------------------------
[8 4 6 3 5]

displacement--------------------------
[307.  350.  318.  304.  302.  429.  454.  440.  455.  390.  383.  340.
 400.  113.  198.  199.  200.   97.  110.  107.  104.  121.  360.  140.
  98.  232.  225.  250.  351.  258.  122.  116.   79.   88.   71.   72.
  91.   97

In [585]:
# horsepower 컬럼에서 타입 체크
mpgDF['horsepower'].dtype

dtype('O')

In [586]:
# horsepower 컬럼에서 다른 데이터와 다른 문자 '?' 처리
mpgDF['horsepower'].replace('?', np.nan, inplace=True)    # '' <- 이것은 공백아님
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [587]:
# horsepower 컬럼에서 Missing Value 처리
# Missing Value 대체값 => 컬럼 평균 => mean()
#mpgDF['horsepower'].fillna(mpgDF['horsepower'].mean())

# Missing Value 대체값 => 컬럼 평균 => mode()
# mpgDF['horsepower'].fillna(mpgDF['horsepower'].mode())

# print(type(mpgDF['horsepower'].mode()) , mpgDF['horsepower'].mode()[0])       # 최빈값 여러개 나올 수 있어서 0번 원소 값을 뺀다?

mpgDF['horsepower'].fillna(method='ffill', inplace=True)
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

#### (3-1-2) 중복 체크 및 처리

In [588]:
# 체크 : duplicated()                  => True/False    : 행(row)
# 처리 : drop_duplicated()             => 파라미터 세팅
mpgDF.duplicated().sum()

0

In [589]:
# NaN 체크
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [590]:
country=[]
for i in mpgDF.index:
    if mpgDF.iloc[i]['origin']==1:
        country.append('미국')
    elif mpgDF.iloc[i]['origin']==2:
        country.append('유럽')
    if mpgDF.iloc[i]['origin']==3:
        country.append('일본') 

mpgDF['제조국']=country

In [591]:
# mpgDF.replace({'origin':{1:'미국',2:'유럽',3:'일본'}})

In [592]:
# NaN 체크
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
제조국             0
dtype: int64

In [593]:
# (3) 데이터 전처리
# 데이터 --- 데이터 타입 맞도록 설정
mpgDF.horsepower

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: object

In [594]:
# 타입 변경 => astype( 타입 )
# (1) 데이터의 고유값 확인
mpgDF.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [595]:
# (2) 문제가 되는 데이터를 NaN으로 변경하기 => replace(원래값, 새로운 값)
# NaN 상수 사용을 위해서 Numpy 모듈 포함
import numpy as np

mpgDF.horsepower.replace('?', np.nan, inplace=True)
mpgDF.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [596]:
# NaN 체크
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
제조국             0
dtype: int64

In [597]:
# NaN => 0으로 채우기
mpgDF.fillna(0, inplace=True)
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
제조국             0
dtype: int64

In [598]:
# horsepower 컬럼을 object => int로 타입 변경 : astype()
mpgDF.horsepower=mpgDF.horsepower.astype(int)

In [599]:
# 요약 정보 확인
mpgDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int32  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
 9   제조국           398 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(2)
memory usage: 29.7+ KB


#### (3-2) 데이터표준화 => 단위/ 자료형

- Km/L = mpg * 0.425

#### (3-2-1) mpg ==> kml 단위 변환값 컬럼 추가 - Km/L = mpg * 0.425

In [600]:
# DF의 제일 마지막 추가
mpgDF['Kml']=mpgDF['mpg']*0.425

# DF의 원하는 위치에 추가 => insert(정수위치번호(인덱스?), 컬럼명, 데이터)
mpgDF.insert(1, 'kml', mpgDF['mpg']*0.425)
mpgDF.columns

Index(['mpg', 'kml', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name', '제조국', 'Kml'],
      dtype='object')

###### (3-2-2) origin 컬럼에 대한  origin_label 컬럼 추가 및 자료형 설정

- 1: USA, 2 : EU, 3: JPA

In [601]:
mpgDF['origin'].unique()

array([1, 3, 2], dtype=int64)

In [602]:
# origin 컬럼 라벨 데이터 생성
originLabel=mpgDF['origin'].replace({1:'USA',2:'EU',3:'JPA'})

# DF에 컬럼 추가       # 이름을 알고 값을 빼는 거 연구해보기 
mpgDF.insert(9, 'origin_label', originLabel)
mpgDF.columns

Index(['mpg', 'kml', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'origin_label', 'car name',
       '제조국', 'Kml'],
      dtype='object')

In [603]:
mpgDF.dtypes

mpg             float64
kml             float64
cylinders         int64
displacement    float64
horsepower        int32
weight            int64
acceleration    float64
model year        int64
origin            int64
origin_label     object
car name         object
제조국              object
Kml             float64
dtype: object

In [1]:
# origin, origin_label int64/ object 기본자료형 => 범주형 category
mpgDF[['origin','origin_label']]=mpgDF[['origin','origin_label']].astype('category')
mpgDF.dtypes

NameError: name 'mpgDF' is not defined

In [605]:
# origin, origin_label int64/ object 기본자료형 => 범주형 category (다시 object로 바꿔줌)
mpgDF[['origin','origin_label']]=mpgDF[['origin','origin_label']].astype('str')

In [606]:
mpgDF.dtypes

mpg             float64
kml             float64
cylinders         int64
displacement    float64
horsepower        int32
weight            int64
acceleration    float64
model year        int64
origin           object
origin_label     object
car name         object
제조국              object
Kml             float64
dtype: object

In [607]:
mpgDF['origin']

0      1
1      1
2      1
3      1
4      1
      ..
393    1
394    2
395    1
396    1
397    1
Name: origin, Length: 398, dtype: object

###### (3-2-4) horsepower 컬럼에 대한 

In [608]:
# horsepower 컬럼에 대한 데이터 값 확인
mpgDF['horsepower'].unique()

array([130, 165, 150, 140, 198, 220, 215, 225, 190, 170, 160,  95,  97,
        85,  88,  46,  87,  90, 113, 200, 210, 193, 100, 105, 175, 153,
       180, 110,  72,  86,  70,  76,  65,  69,  60,  80,  54, 208, 155,
       112,  92, 145, 137, 158, 167,  94, 107, 230,  49,  75,  91, 122,
        67,  83,  78,  52,  61,  93, 148, 129,  96,  71,  98, 115,  53,
        81,  79, 120, 152, 102, 108,  68,  58, 149,  89,  63,  48,  66,
       139, 103, 125, 133, 138, 135, 142,  77,  62, 132,  84,  64,  74,
       116,  82])

In [609]:
# horsepower 컬럼 타입 변경 object => int
mpgDF['horsepower'].astype('int')

mpgDF['horsepower']=pd.to_numeric(mpgDF['horsepower'],
                                  errors='coerce',             # 'coerce'인 경우 잘못된 구문 분석이 NaN으로 설정
                                  downcast='integer')          # to_numeric 으로 수치 타입으로 바꿀 수 있음
                                                                # signed,  정수 부호 있고 ???
                                                                # unsigned , 정수 부호 없다 ???

In [610]:
mpgDF.dtypes

mpg             float64
kml             float64
cylinders         int64
displacement    float64
horsepower        int16
weight            int64
acceleration    float64
model year        int64
origin           object
origin_label     object
car name         object
제조국              object
Kml             float64
dtype: object

In [611]:
mpgDF.horsepower

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: int16

In [612]:
# horsepower 컬럼 타입 변경 int 연속형 ==> 범주형 category 변환
# 구간 => 라벨 horsepower_low, horsepower_normal, horsepower_high
mpgDF['horsepower_label']=pd.cut(x=mpgDF['horsepower'],
                           bins=3,
                           labels=['horsepower_low', 'horsepower_normal', 'horsepower_high'],
                           include_lowest=True)

In [613]:
mpgDF['horsepower_label'].value_counts()

horsepower_low       263
horsepower_normal    103
horsepower_high       32
Name: horsepower_label, dtype: int64

In [614]:
mpgDF['horsepower_label']=mpgDF['horsepower_label'].astype('category')
mpgDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   mpg               398 non-null    float64 
 1   kml               398 non-null    float64 
 2   cylinders         398 non-null    int64   
 3   displacement      398 non-null    float64 
 4   horsepower        398 non-null    int16   
 5   weight            398 non-null    int64   
 6   acceleration      398 non-null    float64 
 7   model year        398 non-null    int64   
 8   origin            398 non-null    object  
 9   origin_label      398 non-null    object  
 10  car name          398 non-null    object  
 11  제조국               398 non-null    object  
 12  Kml               398 non-null    float64 
 13  horsepower_label  398 non-null    category
dtypes: category(1), float64(5), int16(1), int64(3), object(4)
memory usage: 38.7+ KB


In [615]:
mpgDF['horsepower_label'].tail(3)

395    horsepower_low
396    horsepower_low
397    horsepower_low
Name: horsepower_label, dtype: category
Categories (3, object): ['horsepower_low' < 'horsepower_normal' < 'horsepower_high']

In [616]:
# mpgDF.insert(5,'horsepower_label', mpgDF['horsepower_label'])
# mpgDF['horsepower_label']

In [617]:
# horsepower 컬럼의 One-Hot-Encoding 변환 => pandas.get_dummies()
oneHot=pd.get_dummies(mpgDF['horsepower_label'])

In [618]:
pd.concat([mpgDF,oneHot], axis=1)

Unnamed: 0,mpg,kml,cylinders,displacement,horsepower,weight,acceleration,model year,origin,origin_label,car name,제조국,Kml,horsepower_label,horsepower_low,horsepower_normal,horsepower_high
0,18.0,7.650,8,307.0,130,3504,12.0,70,1,USA,chevrolet chevelle malibu,미국,7.650,horsepower_normal,0,1,0
1,15.0,6.375,8,350.0,165,3693,11.5,70,1,USA,buick skylark 320,미국,6.375,horsepower_normal,0,1,0
2,18.0,7.650,8,318.0,150,3436,11.0,70,1,USA,plymouth satellite,미국,7.650,horsepower_normal,0,1,0
3,16.0,6.800,8,304.0,150,3433,12.0,70,1,USA,amc rebel sst,미국,6.800,horsepower_normal,0,1,0
4,17.0,7.225,8,302.0,140,3449,10.5,70,1,USA,ford torino,미국,7.225,horsepower_normal,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,11.475,4,140.0,86,2790,15.6,82,1,USA,ford mustang gl,미국,11.475,horsepower_low,1,0,0
394,44.0,18.700,4,97.0,52,2130,24.6,82,2,EU,vw pickup,유럽,18.700,horsepower_low,1,0,0
395,32.0,13.600,4,135.0,84,2295,11.6,82,1,USA,dodge rampage,미국,13.600,horsepower_low,1,0,0
396,28.0,11.900,4,120.0,79,2625,18.6,82,1,USA,ford ranger,미국,11.900,horsepower_low,1,0,0


###### (3-3) 데이터정규화

In [619]:
# mpg -> Km/L 로 단위 변환
mpg=[]
for i in mpgDF.index:
    mpg.append(mpgDF.iloc[i]['mpg'])

L=[]
for j in mpg:
    km=j*0.425
    L.append(km)
    
mpgDF['Km/L']=L

In [620]:
mpgDF

Unnamed: 0,mpg,kml,cylinders,displacement,horsepower,weight,acceleration,model year,origin,origin_label,car name,제조국,Kml,horsepower_label,Km/L
0,18.0,7.650,8,307.0,130,3504,12.0,70,1,USA,chevrolet chevelle malibu,미국,7.650,horsepower_normal,7.650
1,15.0,6.375,8,350.0,165,3693,11.5,70,1,USA,buick skylark 320,미국,6.375,horsepower_normal,6.375
2,18.0,7.650,8,318.0,150,3436,11.0,70,1,USA,plymouth satellite,미국,7.650,horsepower_normal,7.650
3,16.0,6.800,8,304.0,150,3433,12.0,70,1,USA,amc rebel sst,미국,6.800,horsepower_normal,6.800
4,17.0,7.225,8,302.0,140,3449,10.5,70,1,USA,ford torino,미국,7.225,horsepower_normal,7.225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,11.475,4,140.0,86,2790,15.6,82,1,USA,ford mustang gl,미국,11.475,horsepower_low,11.475
394,44.0,18.700,4,97.0,52,2130,24.6,82,2,EU,vw pickup,유럽,18.700,horsepower_low,18.700
395,32.0,13.600,4,135.0,84,2295,11.6,82,1,USA,dodge rampage,미국,13.600,horsepower_low,13.600
396,28.0,11.900,4,120.0,79,2625,18.6,82,1,USA,ford ranger,미국,11.900,horsepower_low,11.900


In [621]:
# mpgDF['Km/L']=mpgDF['mpg']*0.425

In [622]:
# mpgDF.drop_duplicates(subset=['mpg','cylinders,'displacement','horsepower','weight','acceleration']

#### (3-3) 데이터정규화

- 수치데이터 컬럼마다 값의 범위가 다름
- 값의 범위 0 ~ 1 / -1 ~ 1 로 값을 설정

###### (3-3-1) 방법 : 컬럼의 최대값의 절대값으로 전체 데이터를 나누기

In [623]:
mpgDF['weight'].max(), mpgDF['weight'].min(), abs( mpgDF['weight'].max() )

(5140, 1613, 5140)

In [624]:
mpgDF['weight_nor_1'] = mpgDF['weight'] / abs(mpgDF['weight'].max())
mpgDF['weight_nor_1'].min(), mpgDF['weight_nor_1'].max()

(0.31381322957198443, 1.0)

###### (3-3-2) 방법 : 컬럼의 데이터 - 최소값/ 최대값-최소값

- 컬럼의 최대값 => max(), 최소값 => min()

In [625]:
numerator = mpgDF['weight'] - mpgDF['weight'].min()
denominator = mpgDF['weight'].max() - mpgDF['weight'].min()

mpgDF['weight_nor_2'] = numerator/denominator

mpgDF['weight_nor_2'].min(), mpgDF['weight_nor_2'].max()

(0.0, 1.0)

###### (3-3-3) 방법 : 컬럼의 데이터 - 평균 / 표준편차 ==> Z-score

In [626]:
numerator=mpgDF['weight'] - mpgDF['weight'].mean()
denominator=mpgDF['weight'].std()

mpgDF['weight_nor_3']=numerator/denominator

mpgDF['weight_nor_3'].min(), mpgDF['weight_nor_3'].max()

(-1.6029259118708488, 2.561960738109357)

In [627]:
mpgDF[['weight', 'weight_nor_1', 'weight_nor_2', 'weight_nor_3']]

Unnamed: 0,weight,weight_nor_1,weight_nor_2,weight_nor_3
0,3504,0.681712,0.536150,0.630077
1,3693,0.718482,0.589736,0.853259
2,3436,0.668482,0.516870,0.549778
3,3433,0.667899,0.516019,0.546236
4,3449,0.671012,0.520556,0.565130
...,...,...,...,...
393,2790,0.542802,0.333711,-0.213056
394,2130,0.414397,0.146583,-0.992422
395,2295,0.446498,0.193365,-0.797581
396,2625,0.510700,0.286929,-0.407897


In [628]:
mpgDF.dtypes

mpg                  float64
kml                  float64
cylinders              int64
displacement         float64
horsepower             int16
weight                 int64
acceleration         float64
model year             int64
origin                object
origin_label          object
car name              object
제조국                   object
Kml                  float64
horsepower_label    category
Km/L                 float64
weight_nor_1         float64
weight_nor_2         float64
weight_nor_3         float64
dtype: object

In [629]:
numerator = mpgDF['weight'] - mpgDF['weight'].min()
denominator = mpgDF['weight'].max() - mpgDF['weight'].min()

mpgDF['weight_nor_2'] = numerator/denominator
mpgDF['weight_nor_2'].min(), mpgDF['weight_nor_2'].max()

(0.0, 1.0)

In [634]:
def normal(df,col):
    numerator=df[col] / abs(df[col].min())
    denominator=df[col].max() - df[col].min()
    columns = numerator/denominator
    return columns.min(), columns.max()

for i in ['mpg','kml','cylinders','displacement','horsepower','weight','acceleration','model year','','']:
    print(normal(mpgDF,i))

mpg
(0.026595744680851064, 0.13770685579196218)
horsepower
(0.005434782608695652, 0.02717391304347826)


In [651]:
def normal(df,col):
    numerator=df[col] / abs(df[col].min())
    denominator=df[col].max() - df[col].min()
    columns = numerator/denominator
    return f'{df}[{col}]={columns}'

for i in ['mpg','cylinders','horsepower','kml']:
    print(normal(mpgDF,i))
    
    
mpgDF

      mpg     kml  cylinders  displacement  horsepower  weight  acceleration  \
0    18.0   7.650          8         307.0         130    3504          12.0   
1    15.0   6.375          8         350.0         165    3693          11.5   
2    18.0   7.650          8         318.0         150    3436          11.0   
3    16.0   6.800          8         304.0         150    3433          12.0   
4    17.0   7.225          8         302.0         140    3449          10.5   
..    ...     ...        ...           ...         ...     ...           ...   
393  27.0  11.475          4         140.0          86    2790          15.6   
394  44.0  18.700          4          97.0          52    2130          24.6   
395  32.0  13.600          4         135.0          84    2295          11.6   
396  28.0  11.900          4         120.0          79    2625          18.6   
397  31.0  13.175          4         119.0          82    2720          19.4   

     model year origin origin_label    

Unnamed: 0,mpg,kml,cylinders,displacement,horsepower,weight,acceleration,model year,origin,origin_label,car name,제조국,Kml,horsepower_label,Km/L,weight_nor_1,weight_nor_2,weight_nor_3
0,18.0,7.650,8,307.0,130,3504,12.0,70,1,USA,chevrolet chevelle malibu,미국,7.650,horsepower_normal,7.650,0.681712,0.536150,0.630077
1,15.0,6.375,8,350.0,165,3693,11.5,70,1,USA,buick skylark 320,미국,6.375,horsepower_normal,6.375,0.718482,0.589736,0.853259
2,18.0,7.650,8,318.0,150,3436,11.0,70,1,USA,plymouth satellite,미국,7.650,horsepower_normal,7.650,0.668482,0.516870,0.549778
3,16.0,6.800,8,304.0,150,3433,12.0,70,1,USA,amc rebel sst,미국,6.800,horsepower_normal,6.800,0.667899,0.516019,0.546236
4,17.0,7.225,8,302.0,140,3449,10.5,70,1,USA,ford torino,미국,7.225,horsepower_normal,7.225,0.671012,0.520556,0.565130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,11.475,4,140.0,86,2790,15.6,82,1,USA,ford mustang gl,미국,11.475,horsepower_low,11.475,0.542802,0.333711,-0.213056
394,44.0,18.700,4,97.0,52,2130,24.6,82,2,EU,vw pickup,유럽,18.700,horsepower_low,18.700,0.414397,0.146583,-0.992422
395,32.0,13.600,4,135.0,84,2295,11.6,82,1,USA,dodge rampage,미국,13.600,horsepower_low,13.600,0.446498,0.193365,-0.797581
396,28.0,11.900,4,120.0,79,2625,18.6,82,1,USA,ford ranger,미국,11.900,horsepower_low,11.900,0.510700,0.286929,-0.407897


In [659]:
nums=['a','b','c','d']
idx=0
for num in nums:
    print(num, idx)
    idx+=1
    
for idx, num in enumerate(nums):       # enumerate 나열한다? 알아서 데이터랑 인덱스 가져옴 이 경우만 ???
    print(num, idx)
print('-------------------------')
for idx, num in enumerate(nums):
    print(num, idx)

a 0
b 1
c 2
d 3
a 0
b 1
c 2
d 3
-------------------------
a 0
b 1
c 2
d 3


## (4) 데이터 추출 ------------------------------------------------------