In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import nan

In [4]:
# Load the 'titanic' example dataset through seaborn and preview the first three rows.
titanic = sns.load_dataset('titanic')
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [5]:
# Summarise each column's non-null count and dtype; a count below the row total means missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [6]:
# Per-column shortfall: non-null count minus the total number of rows.
# A negative entry means the column is missing that many values.
count = titanic.count() - len(titanic)
mask = count.lt(0)

# Show only the columns that actually have missing data.
count.loc[mask]

age           -177
embarked        -2
deck          -688
embark_town     -2
dtype: int64

In [7]:
# Frequency of each deck value; dropna=False keeps NaN as its own entry in the result.
titanic['deck'].value_counts(dropna = False)

deck
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: count, dtype: int64

In [8]:
# isna() marks each missing cell as True (1); the column sums are the missing counts.
print(titanic.isna().sum(), end = '\n\n')
# notna() marks each present cell as True (1); the column sums are the non-missing counts.
print(titanic.notna().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64


In [9]:
# Show the column summary again as the reference point for the missing-data handling that follows.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [10]:
titanic2 = titanic.copy()

# Threshold: 80% of the number of rows.
count = 0.8 * len(titanic2)
print('정상치의 80%:', count)
# Removing missing data (rows/columns) or replacing it.
# dropna(thresh=...) drops the rows/columns whose number of valid values does not reach the threshold.
# The result is only displayed; titanic2 itself is left unchanged.
titanic2.dropna(thresh = count, axis = 1)

정상치의 80%: 712.8000000000001


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [11]:
titanic2 = titanic.copy()

# DF.dropna()                          : inspects every column of the frame
# DF.dropna(subset=['col1', 'col2'])   : inspects only the listed columns
# how='all' : drop only when all of the inspected values are missing
# how='any' : drop when at least one of the inspected values is missing
# The result is only displayed; titanic2 itself is left unchanged.
titanic2.dropna(axis = 0, how = 'any', subset = ['age', 'embarked'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [12]:
# Mean of the ages that are present.
mean_age = titanic['age'].mean()
print('평균 나이:', mean_age)

# Build a fresh copy of the frame whose missing ages are replaced by that mean,
# then show the filled column as an array.
titanic2 = titanic.assign(age = titanic['age'].fillna(mean_age))
titanic2['age'].to_numpy()

평균 나이: 29.69911764705882


array([22.        , 38.        , 26.        , 35.        , 35.        ,
       29.69911765, 54.        ,  2.        , 27.        , 14.        ,
        4.        , 58.        , 20.        , 39.        , 14.        ,
       55.        ,  2.        , 29.69911765, 31.        , 29.69911765,
       35.        , 34.        , 15.        , 28.        ,  8.        ,
       38.        , 29.69911765, 19.        , 29.69911765, 29.69911765,
       40.        , 29.69911765, 29.69911765, 66.        , 28.        ,
       42.        , 29.69911765, 21.        , 18.        , 14.        ,
       40.        , 27.        , 29.69911765,  3.        , 19.        ,
       29.69911765, 29.69911765, 29.69911765, 29.69911765, 18.        ,
        7.        , 21.        , 49.        , 29.        , 65.        ,
       29.69911765, 21.        , 28.5       ,  5.        , 11.        ,
       22.        , 38.        , 45.        ,  4.        , 29.69911765,
       29.69911765, 29.        , 19.        , 17.        , 26.  

In [13]:
# Most frequent embarkation town: the label with the highest value_counts() entry.
most_freq = titanic['embark_town'].value_counts().idxmax()
print('최대인원이 탑승한 도시명:', most_freq)

# Fresh copy of the frame with the missing towns replaced by that most frequent value.
titanic2 = titanic.assign(embark_town = titanic['embark_town'].fillna(most_freq))
titanic2.info()

최대인원이 탑승한 도시명: Southampton
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [14]:
titanic2 = titanic.copy()

# 1. Drop the columns that are missing too much data: a column is kept only
#    when its number of non-null values reaches 80% of the row count.
count = len(titanic2.index) * 0.8
titanic2 = titanic2.dropna(axis = 'columns', thresh = count)

# 2. Replace missing values in the age column with the column mean.
mean_age = titanic2['age'].mean()
titanic2['age'] = titanic2['age'].fillna(mean_age)

# 3. Replace missing values in embark_town with the most frequent value.
most_freq = titanic2['embark_town'].value_counts().idxmax()
titanic2['embark_town'] = titanic2['embark_town'].fillna(most_freq)

# 4. Replace missing values in embarked with the neighbouring (previous) value.
#    Series.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
#    NOTE(review): embarked and embark_town appear to describe the same fact; filling them
#    with two different strategies may leave the filled rows inconsistent - confirm intent.
titanic2['embarked'] = titanic2['embarked'].ffill()

# Every remaining column should now report the full row count.
titanic2.count()

survived       891
pclass         891
sex            891
age            891
sibsp          891
parch          891
fare           891
embarked       891
class          891
who            891
adult_male     891
embark_town    891
alive          891
alone          891
dtype: int64

In [51]:
# Read the auto-mpg CSV (path is relative to the working directory) and summarise its columns.
car = pd.read_csv('csv/auto-mpg.csv')
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [60]:
def is_not_numbers(item_list):
    """Return the distinct items of ``item_list`` that ``int()`` cannot convert.

    Args:
        item_list: Iterable of values to probe (e.g. the values of a column).

    Returns:
        list: The rejected items, each listed once, in first-seen order.
    """
    not_numbers = []
    for item in item_list:
        try:
            int(item)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures mark an item as "not a number";
            # KeyboardInterrupt, SystemExit and the like propagate to the caller.
            if item not in not_numbers:
                not_numbers.append(item)
    return not_numbers

def is_not_numbers2(item_list):
    """Count how often each item that ``int()`` cannot convert occurs in ``item_list``.

    Args:
        item_list: Iterable of hashable values to probe.

    Returns:
        dict: Maps each rejected item to its number of occurrences,
        in first-seen order.
    """
    not_numbers = {}
    for item in item_list:
        try:
            int(item)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures are counted; KeyboardInterrupt,
            # SystemExit and the like propagate to the caller.
            not_numbers[item] = not_numbers.get(item, 0) + 1
    return not_numbers

# Show the distinct horsepower entries that int() rejects, then how often each one occurs.
print(is_not_numbers( car['horsepower'].values ))
print(is_not_numbers2( car['horsepower'].values ))

['?', '-', 'x']
{'?': 6, '-': 3, 'x': 4}


In [23]:
# Fresh copy of car in which the '?' placeholder in horsepower becomes NaN,
# so pandas counts those entries as missing.
car2 = car.assign(horsepower = car['horsepower'].replace('?', nan))
car2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [24]:
# Drop every row that contains at least one missing value.
car2 = car2.dropna(how = 'any', axis = 0)
car2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 30.6+ KB


In [25]:
# With the unparseable rows gone, convert horsepower to integers.
car2 = car2.astype({'horsepower': int})
car2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(1)
memory usage: 29.1+ KB


In [29]:
# p182: handling duplicated data
# Small example frame, written row by row, that contains repeated rows.
df = pd.DataFrame(
    [
        ('a', 1, 1),
        ('a', 1, 1),
        ('b', 1, 2),
        ('a', 2, 2),
        ('b', 2, 2),
        ('b', 1, 2),
    ],
    columns = ['c1', 'c2', 'c3'],
)
df

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2
5,b,1,2


In [32]:
# Does a row match an earlier row in every column?
print(df.duplicated())

# With the subset option,
# df.duplicated(subset = ['field1','field2']) only compares the listed columns.

print(df.duplicated(subset = ['c2','c3']))

0    False
1     True
2    False
3    False
4    False
5     True
dtype: bool
0    False
1     True
2    False
3    False
4     True
5     True
dtype: bool


In [34]:
# Keep the first occurrence of each fully identical row; the result is only displayed.
df2 = df.copy()
df2.drop_duplicates(keep = 'first')

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [37]:
# Keep the first occurrence of each (c2, c3) combination; the result is only displayed.
df2 = df.copy()
df2.drop_duplicates(subset = ['c2', 'c3'], keep = 'first')

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2


In [38]:
# Display the car frame as loaded.
car

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [41]:
# Conversion factor from miles per gallon to kilometres per litre:
# kilometres in one mile divided by litres in one US gallon.
mpg_to_kpl = 1.60934 / 3.78541
print('20mpg ->?kpl', mpg_to_kpl * 20)

20mpg ->?kpl 8.50285702209272


In [52]:
# Add a km-per-litre column right after mpg (position 1) on a copy of car.
car2 = car.copy()
kpl = car2['mpg'].mul(mpg_to_kpl)
car2.insert(loc = 1, column = 'kpl', value = kpl)

In [53]:
# Display car2 to check the newly inserted kpl column.
car2

Unnamed: 0,mpg,kpl,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,7.652571,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,6.377143,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,7.652571,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,6.802286,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,7.227428,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...,...
393,27.0,11.478857,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,18.706285,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,13.604571,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,11.904000,4,120.0,79,2625,18.6,82,1,ford ranger


In [65]:
# Rebind car to the second CSV file (auto-mpg2.csv) and summarise its columns.
car = pd.read_csv('csv/auto-mpg2.csv')
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [66]:
# List the distinct horsepower entries of the current car frame that int() rejects.
is_not_numbers(car['horsepower'].values)

['?', '-', 'x']

In [62]:
# car DF의 horsepower 컬럼에서 정수가 될 수 없는 문자들을 nan 데이터로 치환하고, 
# dropna() 메소드를 이용하여 nan이 포함된 모든 행들을 제거한다. 
# 그리고, horsepower 컬럼을 정수(int) 형으로 변환한다.

In [74]:
# Collect the horsepower entries that int() rejects.
not_numbers = is_not_numbers(car['horsepower'].values)

# Swap each rejected token for NaN, one token at a time, then store the column back.
horsepower = car['horsepower']
for token in not_numbers:
    horsepower = horsepower.replace(token, nan)
car['horsepower'] = horsepower

print(car['horsepower'].to_numpy())


['130' '165' '150' '150' '140' '198' '220' '215' '225' '190' '170' '160'
 '150' '225' '95' '95' '97' '85' '88' '46' '87' '90' '95' '113' '90' '215'
 '200' '210' '193' '88' '90' '95' nan '100' nan '100' '88' '100' '165'
 '175' '153' '150' '180' '170' '175' '110' '72' '100' '88' nan '90' '70'
 '76' nan '69' '60' '70' '95' '80' '54' '90' nan '165' '175' '150' nan
 '150' '208' '155' '160' '190' '97' '150' '130' '140' '150' '112' '76'
 '87' '69' '86' '92' '97' '80' '88' '175' '150' '145' '137' nan '198'
 '150' '158' '150' '215' '225' '175' '105' '100' '100' '88' '95' '46'
 '150' '167' '170' '180' '100' '88' '72' '94' nan '85' '107' '90' '145'
 '230' '49' '75' '91' '112' '150' '110' '122' '180' '95' nan '100' '100'
 '67' '80' '65' '75' '100' '110' '105' '140' '150' '150' '140' '150' '83'
 '67' '78' '52' '61' '75' '75' '75' '97' '93' '67' '95' '105' '72' '72'
 '170' '145' '150' '148' '110' '105' '110' '95' '110' '110' '129' '75'
 '83' '100' '78' '96' '71' '97' '97' '70' '90' '95' '88' '98' '1

In [70]:
# Inspect the first entry of the not_numbers list.
not_numbers[0]

'?'

In [None]:
# Unexecuted scratch cell: replaces the '?' placeholder in car2's horsepower column with NaN.
car2['horsepower'] = car2['horsepower'].replace('?',nan)


In [80]:
# Answer
# 1. Replace every horsepower entry that int() rejects with NaN.
#    The column is scanned once; the earlier stand-alone call whose result was discarded is gone.
for char in is_not_numbers(car['horsepower'].values):
    car['horsepower'] = car['horsepower'].replace(char, nan)

# 2. Drop the rows whose horsepower is now missing. The explicit copy makes the
#    column assignment below write to an independent frame rather than a filtered view.
car = car.dropna(subset = ['horsepower'], axis = 'index').copy()
car.info()

# 3. Convert the cleaned column to integers.
car['horsepower'] = car['horsepower'].astype('int')
car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 385 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           385 non-null    float64
 1   cylinders     385 non-null    int64  
 2   displacement  385 non-null    float64
 3   horsepower    385 non-null    object 
 4   weight        385 non-null    int64  
 5   acceleration  385 non-null    float64
 6   model year    385 non-null    int64  
 7   origin        385 non-null    int64  
 8   car name      385 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 30.1+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 385 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           385 non-null    float64
 1   cylinders     385 non-null    int64  
 2   displacement  385 non-null    float64
 3   horsepower    385 non-null    int32  
 4   wei

In [81]:
# Reload auto-mpg.csv into car and list the distinct horsepower values.
car = pd.read_csv('csv/auto-mpg.csv')
car['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [88]:
# Alternative approach: keep only the rows whose horsepower consists solely of digits.
# car['horsepower'] is a Series; the .str accessor applies string methods element-wise.
mask = car['horsepower'].str.isdigit()
# Take an explicit copy of the filtered rows so the column assignment below
# does not write to a slice of the original frame (SettingWithCopyWarning).
car = car[mask].copy()
car['horsepower'] = car['horsepower'].astype(int)
car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(1)
memory usage: 29.1+ KB


In [89]:
# List the distinct codes stored in the origin column.
car['origin'].unique()

array([1, 3, 2], dtype=int64)

In [94]:
# Replace the numeric origin codes with labels, then convert the resulting
# object column to the category dtype.
# (The stray `car.values` statement was removed: its result was discarded and it
# only materialised the whole frame as an array.)
car['origin'] = (
    car['origin']
    .replace({1:'usa',2:'eu',3:'jpn'})
    .astype('category')
)
car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           392 non-null    float64 
 1   cylinders     392 non-null    int64   
 2   displacement  392 non-null    float64 
 3   horsepower    392 non-null    int32   
 4   weight        392 non-null    int64   
 5   acceleration  392 non-null    float64 
 6   model year    392 non-null    int64   
 7   origin        392 non-null    category
 8   car name      392 non-null    object  
dtypes: category(1), float64(3), int32(1), int64(3), object(1)
memory usage: 26.5+ KB


In [97]:
# Turn two-digit model years (e.g. 70) into four-digit years (1970).
# Reducing modulo 100 first makes the cell idempotent: running it again, or after the
# column has been converted to category, yields the same 19xx values instead of
# adding 1900 a second time.
# NOTE(review): assumes every model year belongs to the 1900s - confirm against the data.
car['model year'] = car['model year'].astype('int64') % 100 + 1900

In [106]:
# Store model year as a categorical column and display it.
car = car.astype({'model year': 'category'})
car['model year']

0      1970
1      1970
2      1970
3      1970
4      1970
       ... 
393    1982
394    1982
395    1982
396    1982
397    1982
Name: model year, Length: 392, dtype: category
Categories (13, int64): [1970, 1971, 1972, 1973, ..., 1979, 1980, 1981, 1982]

In [109]:
# Work on an independent copy of car and show its horsepower values as an array.
car2 = car.copy(deep = True)
car2['horsepower'].to_numpy()

array([130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 160, 150,
       225,  95,  95,  97,  85,  88,  46,  87,  90,  95, 113,  90, 215,
       200, 210, 193,  88,  90,  95, 100, 105, 100,  88, 100, 165, 175,
       153, 150, 180, 170, 175, 110,  72, 100,  88,  86,  90,  70,  76,
        65,  69,  60,  70,  95,  80,  54,  90,  86, 165, 175, 150, 153,
       150, 208, 155, 160, 190,  97, 150, 130, 140, 150, 112,  76,  87,
        69,  86,  92,  97,  80,  88, 175, 150, 145, 137, 150, 198, 150,
       158, 150, 215, 225, 175, 105, 100, 100,  88,  95,  46, 150, 167,
       170, 180, 100,  88,  72,  94,  90,  85, 107,  90, 145, 230,  49,
        75,  91, 112, 150, 110, 122, 180,  95, 100, 100,  67,  80,  65,
        75, 100, 110, 105, 140, 150, 150, 140, 150,  83,  67,  78,  52,
        61,  75,  75,  75,  97,  93,  67,  95, 105,  72,  72, 170, 145,
       150, 148, 110, 105, 110,  95, 110, 110, 129,  75,  83, 100,  78,
        96,  71,  97,  97,  70,  90,  95,  88,  98, 115,  53,  8

In [117]:
# Range of the horsepower column.
hp_min = car2['horsepower'].min()
hp_max = car2['horsepower'].max()

print(f'최대값은 {hp_max}, 최소값은 {hp_min}')
# Width of one bin when the range is split into three equal parts,
# followed by the four resulting bin edges.
increment = (hp_max - hp_min) / 3
print(hp_min, hp_min + increment, hp_min + 2 * increment, hp_max)

최대값은 230, 최소값은 46
46 107.33333333333334 168.66666666666669 230


In [118]:
# The same three equal-width bins computed with NumPy:
# np.histogram returns the count per bin and the bin edges.
# numpy is imported as np in the import cell at the top of the notebook.
np.histogram(car['horsepower'],bins=3)

(array([257, 103,  32], dtype=int64),
 array([ 46.        , 107.33333333, 168.66666667, 230.        ]))

In [140]:
# Split horsepower into three equal-width bins. The edges are computed from car2,
# the same frame that is cut below, so the edges and the binned values always agree.
count, bin_dividers = np.histogram(car2['horsepower'],bins=3)
print('구간별   수량:', count)
print('구간별 경계값:', bin_dividers)

bin_labels = ['저출력','보통출력','고출력']
# include_lowest=True makes the first interval include its left edge,
# so the minimum horsepower is labelled as well.
hp_bin = pd.cut(x = car2['horsepower'],bins = bin_dividers,
                      labels = bin_labels, include_lowest = True)

# Attach the labels right after the horsepower column. DataFrame.insert raises when
# the column already exists, so overwrite it in that case; the cell can then be re-run.
if 'hp_bin' in car2.columns:
    car2['hp_bin'] = hp_bin
else:
    car2.insert(4,'hp_bin',hp_bin)
car2

구간별   수량: [257 103  32]
구간별 경계값: [ 46.         107.33333333 168.66666667 230.        ]


Unnamed: 0,mpg,cylinders,displacement,horsepower,hp_bin,weight,acceleration,model year,origin,car name,출력
0,18.0,8,307.0,130,보통출력,3504,12.0,1970,usa,chevrolet chevelle malibu,보통출력
1,15.0,8,350.0,165,보통출력,3693,11.5,1970,usa,buick skylark 320,보통출력
2,18.0,8,318.0,150,보통출력,3436,11.0,1970,usa,plymouth satellite,보통출력
3,16.0,8,304.0,150,보통출력,3433,12.0,1970,usa,amc rebel sst,보통출력
4,17.0,8,302.0,140,보통출력,3449,10.5,1970,usa,ford torino,보통출력
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,저출력,2790,15.6,1982,usa,ford mustang gl,저출력
394,44.0,4,97.0,52,저출력,2130,24.6,1982,eu,vw pickup,저출력
395,32.0,4,135.0,84,저출력,2295,11.6,1982,usa,dodge rampage,저출력
396,28.0,4,120.0,79,저출력,2625,18.6,1982,usa,ford ranger,저출력


In [141]:
# Normalisation (min-max): rescale horsepower so the smallest value maps to 0 and the largest to 1.
hp_max = car2['horsepower'].max()
hp_min = car2['horsepower'].min()

sr = car2['horsepower'].sub(hp_min).div(hp_max - hp_min)
# Print the new minimum and maximum, one per line.
print(sr.min(), sr.max(), sep = '\n')

0.0
1.0
