In [1]:
import pandas as pd
import numpy as np

### 데이터 클렌징 · 편집

In [2]:
# 샘플 데이터프레임 생성_1
dict1 = {'item' : ['radio', 'computer','telephone'], 'price' : ['10000원', '30000원', '20000']}
df1 = pd.DataFrame(dict1)
df1

Unnamed: 0,item,price
0,radio,10000원
1,computer,30000원
2,telephone,20000


In [3]:
# 데이터 클렌징
df1.price = df1.price.str.replace('원','')  # 문자열 내 원을 공백으로 치환
df1.price = pd.to_numeric(df1.price)  # 숫자형으로 변환
print(df1.dtypes)
df1

item     object
price     int64
dtype: object


Unnamed: 0,item,price
0,radio,10000
1,computer,30000
2,telephone,20000


In [4]:
df1.price2 = df1.price.replace({10000 : '쌈', 20000 : '보통', 30000 : '비쌈'})  ## 처음 선언할 때는 메서드처럼 쓰면 안됨
df1

  df1.price2 = df1.price.replace({10000 : '쌈', 20000 : '보통', 30000 : '비쌈'})  ## 처음 선언할 때는 메서드처럼 쓰면 안됨


Unnamed: 0,item,price
0,radio,10000
1,computer,30000
2,telephone,20000


In [5]:
df1['price2'] = df1.price.replace({10000 : '쌈', 20000 : '보통', 30000 : '비쌈'})  # 문자열 전체 매칭시
df1

Unnamed: 0,item,price,price2
0,radio,10000,쌈
1,computer,30000,비쌈
2,telephone,20000,보통


In [6]:
def condition(x):  # 함수 및 apply 활용 가능
    if x<20000:
        return '2만미만'
    else:
        return '2만이상'
df1.price.apply(condition)

0    2만미만
1    2만이상
2    2만이상
Name: price, dtype: object

In [7]:
df1['price'].apply(lambda x: x**2)

0    100000000
1    900000000
2    400000000
Name: price, dtype: int64

In [8]:
df1.price.transform([lambda x: np.log(x), np.sqrt])  # 복수열로도 사용 가능

Unnamed: 0,<lambda>,sqrt
0,9.21034,100.0
1,10.308953,173.205081
2,9.903488,141.421356


In [9]:
df1.eval('price = price * 50')  # eval 메서드 활용

Unnamed: 0,item,price,price2
0,radio,500000,쌈
1,computer,1500000,비쌈
2,telephone,1000000,보통


In [10]:
np.where(df1.price >= 2e4, '2만이상', '2만미만')

array(['2만미만', '2만이상', '2만이상'], dtype='<U4')

### 변수 타입 변경

In [11]:
# 샘플 데이터프레임 생성_2
dict2 = {'ipo' : ['witch factory', 'cubox','trueN'], 
         'date' : [20230526, 20230510, 20230509],
         'deposit' : ['32,000,000,000', '22,500,000,000', '30,000,000,000']}
df2 = pd.DataFrame(dict2)
print(df2.dtypes)
df2

ipo        object
date        int64
deposit    object
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,32000000000
1,cubox,20230510,22500000000
2,trueN,20230509,30000000000


In [12]:
# 천단위 소수점 삭제 후 str to float
df2.deposit = df2.deposit.str.replace(',','').astype(float)
print(df2.dtypes)
df2

ipo         object
date         int64
deposit    float64
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,32000000000.0
1,cubox,20230510,22500000000.0
2,trueN,20230509,30000000000.0


In [13]:
# yyyymmdd(str) to datetime
from datetime import datetime
df2.date = df2.date.apply(lambda x: datetime.strptime(str(x), '%Y%m%d'))
print(df2.dtypes)
df2

ipo                object
date       datetime64[ns]
deposit           float64
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,2023-05-26,32000000000.0
1,cubox,2023-05-10,22500000000.0
2,trueN,2023-05-09,30000000000.0


In [14]:
# datetime to yyyymmdd(str) 
df2.date = df2.date.apply(lambda x: datetime.strftime(x, '%Y%m%d'))
print(df2.dtypes)
df2

ipo         object
date        object
deposit    float64
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,32000000000.0
1,cubox,20230510,22500000000.0
2,trueN,20230509,30000000000.0


In [15]:
# everything to category : 모양은 종전 형식 유지
df2.ipo = df2.ipo.astype('category')
print(df2.dtypes)
df2

ipo        category
date         object
deposit     float64
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,32000000000.0
1,cubox,20230510,22500000000.0
2,trueN,20230509,30000000000.0


In [16]:
# 문자로 변경
df2.deposit = df2.deposit / 1e8
df2.deposit = df2.deposit.astype(str)
print(df2.dtypes)
df2

ipo        category
date         object
deposit      object
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,320.0
1,cubox,20230510,225.0
2,trueN,20230509,300.0


In [17]:
# 한번에 변경
df2 = df2.astype({'ipo' : str,
                 'date' : 'int32',
                 'deposit' : float})
print(df2.dtypes)
df2

ipo         object
date         int32
deposit    float64
dtype: object


Unnamed: 0,ipo,date,deposit
0,witch factory,20230526,320.0
1,cubox,20230510,225.0
2,trueN,20230509,300.0


### 컬럼 추가, 변경, 정리, 삭제

In [18]:
# 샘플 데이터프레임 생성_3
dict3 = {'Name' : ['John Harbor', 'San Miguel','Sir Otto'], 'Korean' : [60, 80, 70], 'Math' : [90,80,80], 'Society' : [70, 90, 95]}
df3 = pd.DataFrame(dict3)
df3

Unnamed: 0,Name,Korean,Math,Society
0,John Harbor,60,90,70
1,San Miguel,80,80,90
2,Sir Otto,70,80,95


In [19]:
# 맨 마지막에 컬럼 추가
df3['Science'] = [85, 75, 80]
df3

Unnamed: 0,Name,Korean,Math,Society,Science
0,John Harbor,60,90,70,85
1,San Miguel,80,80,90,75
2,Sir Otto,70,80,95,80


In [20]:
# 원하는 위치에 컬럼 추가
df3.insert(3, 'English', [80, 90, 85])
df3

Unnamed: 0,Name,Korean,Math,English,Society,Science
0,John Harbor,60,90,80,70,85
1,San Miguel,80,80,90,90,75
2,Sir Otto,70,80,85,95,80


In [21]:
# 하나의 컬럼을 둘로 쪼개기
df3[['First name', 'Last name']] = df3.Name.str.split(' ', expand=True)
df3

Unnamed: 0,Name,Korean,Math,English,Society,Science,First name,Last name
0,John Harbor,60,90,80,70,85,John,Harbor
1,San Miguel,80,80,90,90,75,San,Miguel
2,Sir Otto,70,80,85,95,80,Sir,Otto


In [22]:
# 컬럼 위치 자유자재로 바꾸기
col1 = df3.columns[-2:].to_list()
col2 = df3.columns[:-2].to_list()
col = col1+ col2
df3 = df3[col]
df3

Unnamed: 0,First name,Last name,Name,Korean,Math,English,Society,Science
0,John,Harbor,John Harbor,60,90,80,70,85
1,San,Miguel,San Miguel,80,80,90,90,75
2,Sir,Otto,Sir Otto,70,80,85,95,80


In [23]:
# 컬럼 순서를 거꾸로
df3[reversed(df3.columns)]

Unnamed: 0,Science,Society,English,Math,Korean,Name,Last name,First name
0,85,70,80,90,60,John Harbor,Harbor,John
1,75,90,90,80,80,San Miguel,Miguel,San
2,80,95,85,80,70,Sir Otto,Otto,Sir


In [24]:
# List Comprehension : 서로 맞바꾸기
df3[['First name' if c == 'Last name' else 'Last name' if c=='First name' else c for c in df3.columns]]

Unnamed: 0,Last name,First name,Name,Korean,Math,English,Society,Science
0,Harbor,John,John Harbor,60,90,80,70,85
1,Miguel,San,San Miguel,80,80,90,90,75
2,Otto,Sir,Sir Otto,70,80,85,95,80


In [25]:
# 특정 컬럼을 앞으로,,
df3[['Society'] + [i for i in df3.columns.to_list() if i != 'Society']]

Unnamed: 0,Society,First name,Last name,Name,Korean,Math,English,Science
0,70,John,Harbor,John Harbor,60,90,80,85
1,90,San,Miguel,San Miguel,80,80,90,75
2,95,Sir,Otto,Sir Otto,70,80,85,80


In [26]:
# 컬럼 삭제
df3.drop(['First name'], axis = 1, inplace=True)
df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(['First name'], axis = 1, inplace=True)


Unnamed: 0,Last name,Name,Korean,Math,English,Society,Science
0,Harbor,John Harbor,60,90,80,70,85
1,Miguel,San Miguel,80,80,90,90,75
2,Otto,Sir Otto,70,80,85,95,80


In [27]:
del df3['Last name']
df3

Unnamed: 0,Name,Korean,Math,English,Society,Science
0,John Harbor,60,90,80,70,85
1,San Miguel,80,80,90,90,75
2,Sir Otto,70,80,85,95,80


In [28]:
# 특정 컬럼명 변경
df3.rename(columns = {'Korean' : '국어',
                     'Math' : '수학'})

Unnamed: 0,Name,국어,수학,English,Society,Science
0,John Harbor,60,90,80,70,85
1,San Miguel,80,80,90,90,75
2,Sir Otto,70,80,85,95,80


In [29]:
# 컬럼명 일괄 변경, 갯수 맞아야 함
df3.columns = ['이름', '국어', '수학', '영어', '사회', '과학']
df3

Unnamed: 0,이름,국어,수학,영어,사회,과학
0,John Harbor,60,90,80,70,85
1,San Miguel,80,80,90,90,75
2,Sir Otto,70,80,85,95,80


### 결측값 확인, 삭제, 보간

In [30]:
# 샘플 데이터프레임 생성_4
col = ['spring', 'summer', 'fall', 'winter']
row = ['apple', 'peach', 'melon', 'strawberry', 'chestnut']
data = [[6, pd.NA, 8, 7],
                [4, 9, 5, np.nan],
                [5, 5, 5, 5],
                [None, 3, 6, 10],
                [None, None, None, None]]
df4 = pd.DataFrame(data, row, col)
df4

Unnamed: 0,spring,summer,fall,winter
apple,6.0,,8.0,7.0
peach,4.0,9.0,5.0,
melon,5.0,5.0,5.0,5.0
strawberry,,3.0,6.0,10.0
chestnut,,,,


In [31]:
# 결측값 여부 확인
df4.isna()

Unnamed: 0,spring,summer,fall,winter
apple,False,True,False,False
peach,False,False,False,True
melon,False,False,False,False
strawberry,True,False,False,False
chestnut,True,True,True,True


In [32]:
# 결측값 제거
df4.dropna(subset = ['summer', 'fall'],  # 결측값 여부 체크할 컬럼, 생략시 전체
          how = 'any',  # any: 결측값 하나라도 있는 경우, all: 모두 결측값인 경우, 기본값 any
          axis = 0,  # 0: 행 기준, 1: 열 기준, 기본값 0
          )

Unnamed: 0,spring,summer,fall,winter
peach,4.0,9,5.0,
melon,5.0,5,5.0,5.0
strawberry,,3,6.0,10.0


In [33]:
# 결측값 보간
df4.fillna(method = 'ffill',  # ffill: na 앞의 값으로 채움, bfill: na 뒤의 값으로 채움
          axis = 0,  # 0: 행 기준, 1: 열 기준, 기본값 0
          limit = 1,  # 결측값 변경할 횟수제한
          downcast = 'infer'  # float64를 int64로 변경
          )

Unnamed: 0,spring,summer,fall,winter
apple,6.0,,8,7
peach,4.0,9.0,5,7
melon,5.0,5.0,5,5
strawberry,5.0,3.0,6,10
chestnut,,3.0,6,10


In [34]:
# 특정 값으로 결측값 채우기
df4.fillna(0)

Unnamed: 0,spring,summer,fall,winter
apple,6.0,0,8.0,7.0
peach,4.0,9,5.0,0.0
melon,5.0,5,5.0,5.0
strawberry,0.0,3,6.0,10.0
chestnut,0.0,0,0.0,0.0


In [35]:
# 컬럼별로 다르게 결측값 채우기
df4.fillna({'spring':'A', 'summer':'B', 'fall':'C', 'winter':'D'})

Unnamed: 0,spring,summer,fall,winter
apple,6.0,B,8.0,7.0
peach,4.0,9,5.0,D
melon,5.0,5,5.0,5.0
strawberry,A,3,6.0,10.0
chestnut,A,B,C,D


### 집계 행/열 추가

In [36]:
# 데이터프레임 재활용
df5 = pd.DataFrame(dict3)
df5

Unnamed: 0,Name,Korean,Math,Society
0,John Harbor,60,90,70
1,San Miguel,80,80,90
2,Sir Otto,70,80,95


In [37]:
# 컬럼 합계 추가
df5['Sum'] = df5.sum(axis=1, numeric_only=None)
df5

  df5['Sum'] = df5.sum(axis=1, numeric_only=None)


Unnamed: 0,Name,Korean,Math,Society,Sum
0,John Harbor,60,90,70,220
1,San Miguel,80,80,90,250
2,Sir Otto,70,80,95,245


In [38]:
# 행 평균 추가
df5.set_index('Name', inplace=True)  # 숫자만 남도록 문자부분 인덱스 지정, 필요시 인덱스 다시 해제
df5.loc['Avg'] = df5.mean(axis=0)
df5

Unnamed: 0_level_0,Korean,Math,Society,Sum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
John Harbor,60.0,90.0,70.0,220.0
San Miguel,80.0,80.0,90.0,250.0
Sir Otto,70.0,80.0,95.0,245.0
Avg,70.0,83.333333,85.0,238.333333
