### Pandas
- Series and Dataframe
- 분석을 위한 전처리

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore")

print(np.__version__)
print(pd.__version__)

1.19.2
1.1.3


### Series 클래스
- 넘파이 1차원 배열과 비슷
- series = index + value

In [5]:
# NumPy 1-D array holding mixed Python objects (ints and a str).
# Use the builtin `object` as the dtype: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
ary = np.array([1, 2, 3, 4, 'ruby'], dtype=object)
print(ary)
print(ary.dtype)

[1 2 3 4 'ruby']
object


In [13]:
# pandas Series built from the same mixed values.
# Use the builtin `object` as the dtype: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
ary = pd.Series([1, 2, 3, 4, 'ruby'], dtype=object)
print(ary)
# A Series is values (a NumPy array) plus an index.
print(ary.values)
print(type(ary.values))
print(ary.index)
print(type(ary.index))

0       1
1       2
2       3
3       4
4    ruby
dtype: object
[1 2 3 4 'ruby']
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [21]:
def seriesInfo(ary) :
    """Print a Series together with its values, its index and their types.

    Parameters
    ----------
    ary : pandas.Series
        The Series to describe. Only its ``values`` and ``index`` attributes
        are read.

    Returns
    -------
    None
        The report is written to stdout; wrapping the call in ``print()``
        would only add a trailing "None" line.
    """
    # sep='\n' puts the Series on its own line. With the default separator a
    # space was inserted before the first row, misaligning it with the rest.
    print('index + value : ', ary, sep='\n')
    print('value : ', ary.values)
    print('value type : ', type(ary.values))
    print('index : ', ary.index)
    print('index type : ', type(ary.index))

- 인덱스의 라벨은 정수, 문자, 날짜, 시간으로 변경 가능

In [22]:
ary = pd.Series([1,2,3,4,5], dtype=np.int32, index=['강남','서초','방배','동작','사당'])

In [23]:
seriesInfo(ary)

index + value : 
 강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [24]:
# head추가
ary.index.name='구별'
seriesInfo(ary)

index + value : 
 구별
강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object', name='구별')
index type :  <class 'pandas.core.indexes.base.Index'>


In [27]:
# 두개 이상의 인덱스 [[]]
print(ary['서초'])
print(ary[['강남', '방배']])

2
구별
강남    1
방배    3
dtype: int32


In [28]:
# index, values 모두 가져올 때
for idx, value in ary.items():
    print('idx : {}, value : {}'.format(idx, value))

idx : 강남, value : 1
idx : 서초, value : 2
idx : 방배, value : 3
idx : 동작, value : 4
idx : 사당, value : 5


In [29]:
# index만 가져올 때
for idx in ary.keys():
    print('idx : {}'.format(idx))

idx : 강남
idx : 서초
idx : 방배
idx : 동작
idx : 사당


In [36]:
# value만 가져올 때
for value in ary.values:
    print('value : {}'.format(value))

value : 1.0
value : 2.0
value : 4.0


In [31]:
ary = pd.Series(range(10, 21,2))
seriesInfo(ary)

index + value : 
 0    10
1    12
2    14
3    16
4    18
5    20
dtype: int64
value :  [10 12 14 16 18 20]
value type :  <class 'numpy.ndarray'>
index :  RangeIndex(start=0, stop=6, step=1)
index type :  <class 'pandas.core.indexes.range.RangeIndex'>


In [37]:
# dict형태로 series만들기
ary = pd.Series({'c' : 1, 'b' : 5, 'a' : -8, 'k' : 10}, dtype=np.float64)
seriesInfo(ary)

index + value : 
 c     1.0
b     5.0
a    -8.0
k    10.0
dtype: float64
value :  [ 1.  5. -8. 10.]
value type :  <class 'numpy.ndarray'>
index :  Index(['c', 'b', 'a', 'k'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [38]:
# 연산이 되더라도 index는 변함없음
ary*10

c     10.0
b     50.0
a    -80.0
k    100.0
dtype: float64

- fancy indexing & boolean indexing

In [41]:
# Positional fancy indexing: pick the 1st and 3rd elements.
# .iloc states the positional intent explicitly; integer keys passed to plain
# [] on a string-labelled index rely on a deprecated positional fallback.
print('fancy [0,2] indexing : \n{}'.format(ary.iloc[[0, 2]]))

fancy [0,2] indexing : 
c    1.0
a   -8.0
dtype: float64


In [46]:
# 2의 배수
print('boolean ary % 2 == 0 :\n{}'.format(ary[ary % 2 == 0]))

boolean ary % 2 == 0 :
a    -8.0
k    10.0
dtype: float64


### 시간, 날짜 불러오기

In [54]:
from datetime import date, datetime, timedelta
from dateutil.parser import parse

In [103]:
strDate = datetime(2021,2,25)
print(strDate)
print(strDate + timedelta(days=1))

2021-02-25 00:00:00
2021-02-26 00:00:00


In [104]:
# Build 10 days of data drawn from a normal distribution with mean 50 and
# standard deviation 5. Each sample is truncated to an int via int(x), and the
# index is strDate plus 0..9 days.
# NOTE(review): no random seed is set in this cell, so the values differ on every run.
fac01 = pd.Series([int(x) for x in np.random.normal(50, 5, (10))], index=[strDate + timedelta(days=day) for day in range(10)])
seriesInfo(fac01)

index + value : 
 2021-02-25    40
2021-02-26    51
2021-02-27    57
2021-02-28    54
2021-03-01    46
2021-03-02    58
2021-03-03    49
2021-03-04    46
2021-03-05    51
2021-03-06    51
dtype: int64
value :  [40 51 57 54 46 58 49 46 51 51]
value type :  <class 'numpy.ndarray'>
index :  DatetimeIndex(['2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28',
               '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq=None)
index type :  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [105]:
# Build 10 days of data drawn from a normal distribution with mean 70 and
# standard deviation 8. Each sample is truncated to an int via int(x), and the
# index is strDate plus 0..9 days (the same construction used for fac01).
# NOTE(review): no random seed is set in this cell, so the values differ on every run.
fac02 = pd.Series([int(x) for x in np.random.normal(70, 8, (10))], index=[strDate + timedelta(days=day) for day in range(10)])
seriesInfo(fac02)

index + value : 
 2021-02-25    59
2021-02-26    74
2021-02-27    67
2021-02-28    82
2021-03-01    61
2021-03-02    73
2021-03-03    57
2021-03-04    63
2021-03-05    63
2021-03-06    80
dtype: int64
value :  [59 74 67 82 61 73 57 63 63 80]
value type :  <class 'numpy.ndarray'>
index :  DatetimeIndex(['2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28',
               '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq=None)
index type :  <class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [106]:
fac01 + fac02

2021-02-25     99
2021-02-26    125
2021-02-27    124
2021-02-28    136
2021-03-01    107
2021-03-02    131
2021-03-03    106
2021-03-04    109
2021-03-05    114
2021-03-06    131
dtype: int64

In [107]:
# casting
print(set(fac01.index))
print(list(fac01.index))

{Timestamp('2021-03-02 00:00:00'), Timestamp('2021-03-06 00:00:00'), Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-05 00:00:00'), Timestamp('2021-02-28 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-27 00:00:00')}
[Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-02-27 00:00:00'), Timestamp('2021-02-28 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-03-02 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-05 00:00:00'), Timestamp('2021-03-06 00:00:00')]


In [108]:
for idx in fac01.index :
    print(idx)

2021-02-25 00:00:00
2021-02-26 00:00:00
2021-02-27 00:00:00
2021-02-28 00:00:00
2021-03-01 00:00:00
2021-03-02 00:00:00
2021-03-03 00:00:00
2021-03-04 00:00:00
2021-03-05 00:00:00
2021-03-06 00:00:00


- Series indexing

In [109]:
fac01

2021-02-25    40
2021-02-26    51
2021-02-27    57
2021-02-28    54
2021-03-01    46
2021-03-02    58
2021-03-03    49
2021-03-04    46
2021-03-05    51
2021-03-06    51
dtype: int64

In [110]:
# Positional access to the second element. Use .iloc: an integer key passed to
# [] on a Series that is not integer-indexed is a deprecated positional fallback.
fac01.iloc[1]

51

In [112]:
fac01[datetime.strptime('2021-02-25', '%Y-%m-%d')]

40

In [113]:
price_series = pd.Series([4000, 3000, 3500, 2000], index=['a', 'b', 'c', 'd'])
seriesInfo(price_series)

index + value : 
 a    4000
b    3000
c    3500
d    2000
dtype: int64
value :  [4000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [116]:
# Change existing values.
# seriesInfo prints its report and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
price_series['a'] = 5000       # assign by label
seriesInfo(price_series)
# Assign by position through .iloc; an integer key passed to [] on a
# string-labelled index relies on a deprecated positional fallback.
price_series.iloc[0] = 6000
seriesInfo(price_series)

index + value : 
 a    5000
b    3000
c    3500
d    2000
dtype: int64
value :  [5000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
index + value : 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
value :  [6000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None


In [117]:
# index, value값 추가
price_series['e'] = 1000
seriesInfo(price_series)

index + value : 
 a    6000
b    3000
c    3500
d    2000
e    1000
dtype: int64
value :  [6000 3000 3500 2000 1000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [118]:
# index, value값 삭제
del price_series['e']
seriesInfo(price_series)

index + value : 
 a    6000
b    3000
c    3500
d    2000
dtype: int64
value :  [6000 3000 3500 2000]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [120]:
# Add a missing value under a new label.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
price_series['e'] = np.nan
seriesInfo(price_series)

index + value : 
 a    6000.0
b    3000.0
c    3500.0
d    2000.0
e       NaN
dtype: float64
value :  [6000. 3000. 3500. 2000.   nan]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [121]:
# null o --> True
pd.isnull(price_series)

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [126]:
ser01 = pd.Series([100, 200, 300, 400], index=['a','b','c','d'])
ser02 = pd.Series([500, 600, 700, 800], index=['a','b','e','d'])

In [129]:
# 연산--> +
ser03 = ser01 + ser02
ser03

a     600.0
b     800.0
c       NaN
d    1200.0
e       NaN
dtype: float64

In [130]:
# 연산--> .add
ser04 = ser01.add(ser02, fill_value=0)
ser04

a     600.0
b     800.0
c     300.0
d    1200.0
e     700.0
dtype: float64

In [135]:
# Missing values: fill NaN with 0, then with the mean of the non-missing values.
# seriesInfo prints its report and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
zser = ser03.fillna(0)
seriesInfo(zser)
zser = ser03.fillna(ser03.mean())
seriesInfo(zser)

index + value : 
 a     600.0
b     800.0
c       0.0
d    1200.0
e       0.0
dtype: float64
value :  [ 600.  800.    0. 1200.    0.]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
index + value : 
 a     600.000000
b     800.000000
c     866.666667
d    1200.000000
e     866.666667
dtype: float64
value :  [ 600.          800.          866.66666667 1200.          866.66666667]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None


In [140]:
# subset
print(pd.isnull(ser03))
subset = ser03[pd.isnull(ser03)]
subset

a    False
b    False
c     True
d    False
e     True
dtype: bool


c   NaN
e   NaN
dtype: float64

In [139]:
print(pd.notnull(ser03))
subset = ser03[pd.notnull(ser03)]
subset

a     True
b     True
c    False
d     True
e    False
dtype: bool


a     600.0
b     800.0
d    1200.0
dtype: float64

In [155]:
# seriesInfo prints its report and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
seriesInfo(ser04)
# Positional slicing (end-exclusive); .iloc states the positional intent.
print(ser04.iloc[0:2])
# Label indexing: a list of labels is returned in the requested order.
print(ser04[['a', 'c']])
print(ser04[['c', 'a']])
# Label slicing: the end label is included.
print(ser04['a':'c'])
# Positional fancy indexing. Integer keys passed to [] on a string-labelled
# index rely on a deprecated positional fallback, so use .iloc.
print(ser04.iloc[[3, 1]])

index + value : 
 a     600.0
b     800.0
c     300.0
d    1200.0
e     700.0
dtype: float64
value :  [ 600.  800.  300. 1200.  700.]
value type :  <class 'numpy.ndarray'>
index :  Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>
None
a    600.0
b    800.0
dtype: float64
a    600.0
c    300.0
dtype: float64
c    300.0
a    600.0
dtype: float64
a    600.0
b    800.0
c    300.0
dtype: float64
d    1200.0
b     800.0
dtype: float64


In [156]:
tuple_ser = pd.Series((10, 20, 30, 40))
tuple_ser

0    10
1    20
2    30
3    40
dtype: int64

In [159]:
# set은 순서가 없기때문에 list로 형변환을 시켜 순서를 넣어줘야 함
set_ser = pd.Series(list({10, 20, 30, 40}))
set_ser

0    40
1    10
2    20
3    30
dtype: int64

### DataFrame

In [162]:
data = {'name' : ['ruby', 'dia', '은영', '녕', 'jslim'], 'birth' : [2000,2001,2002,2003,2004]}
userDF = pd.DataFrame(data)
display(userDF)

Unnamed: 0,name,birth
0,ruby,2000
1,dia,2001
2,은영,2002
3,녕,2003
4,jslim,2004


In [178]:
print(userDF.shape)
print(userDF.size)
print(userDF.ndim)
print(userDF.index)
print(userDF.columns)
print(userDF.values)
print(type(userDF.values))

(5, 2)
10
2
RangeIndex(start=0, stop=5, step=1)
Index(['name', 'birth'], dtype='object')
[['ruby' 2000]
 ['dia' 2001]
 ['은영' 2002]
 ['녕' 2003]
 ['jslim' 2004]]
<class 'numpy.ndarray'>


In [175]:
def frameInfo(df) :
    """Print a structural summary of a DataFrame.

    Writes shape, size and ndim, followed by the row index and the column
    index, each accompanied by its type. Returns None.
    """
    # Scalar-like structural attributes, printed in a fixed order.
    for attr in ('shape', 'size', 'ndim'):
        print(f'{attr} : {getattr(df, attr)}')
    # Both axes: the axis object first, then its concrete index class.
    for attr in ('index', 'columns'):
        axis = getattr(df, attr)
        print(f'{attr} : {axis}')
        print(f'{attr} type : {type(axis)}')

In [183]:
data = {
    "2021" : [9910293, 8384050, 2938485, 1203948],
    "2018" : [8910293, 7384050, 5938485, 3203948],
    "2016" : [7910293, 5384050, 7938485, 6203948],
    "2014" : [5910293, 3384050, 4938485, 4203948],
    "지역" : ['수도권' , '경상권' , '수도권' , '경상권'],
    "증가율" : [0.2343 , 0.0434 , 0.0944 , 0.0034]
}

In [186]:
columns = ["지역","2014","2016","2018","2021","증가율",]
popDF = pd.DataFrame(data, index=["서울","부산","경기","대구"], columns = columns)
popDF

Unnamed: 0,지역,2014,2016,2018,2021,증가율
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [187]:
frameInfo(popDF)

shape : (4, 6)
size : 24
ndim : 2
index : Index(['서울', '부산', '경기', '대구'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
columns : Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object')
columns type : <class 'pandas.core.indexes.base.Index'>


In [189]:
popDF.index

Index(['서울', '부산', '경기', '대구'], dtype='object')

In [192]:
popDF.columns

Index(['지역', '2014', '2016', '2018', '2021', '증가율'], dtype='object')

In [198]:
# index의 name 설정
popDF.index.name="city"
popDF.columns.name = "feature"
display(popDF)
display(popDF.T)

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


city,서울,부산,경기,대구
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2014,5910293,3384050,4938485,4203948
2016,7910293,5384050,7938485,6203948
2018,8910293,7384050,5938485,3203948
2021,9910293,8384050,2938485,1203948
증가율,0.2343,0.0434,0.0944,0.0034


### 다음 조건을 만족하는 임의의 데이터프레임을 만들어보자
- 열의 개수와 행의 개수가 각각 5개 이상이어야 한다.
- 열에는 정수, 문자열, 실수, 날짜 데이터가 각각 1개 이상 포함되어야 한다.

In [199]:
# Integer column: 10 random integers from np.random.randint(1, 100, 10)
# NOTE(review): no random seed is set in this cell, so the numeric columns differ on every run.
random_int = np.random.randint(1, 100, 10)
# Float column (standard normal distribution)
random_gao = np.random.randn(10)
# Float column from np.random.rand
random_uni = np.random.rand(10)
# Date column: 10 consecutive days starting at startDay
# (the list is named `years` but holds daily datetimes)
startDay = datetime(2021,2,25)
years = [startDay + timedelta(day) for day in range(0, 10)]
# String column
random_moon = ["apple", "banana", "mango", "berry", "melon", "cherry", "pineapple", "peach", "blueberry", "avocado"]

In [202]:
data = {
    '정수' : random_int,
    '실수1': random_gao,
    '실수2':random_uni,
    '문자': random_moon,
    '날짜': years,
}

testDF = pd.DataFrame(data)
testDF

Unnamed: 0,정수,실수1,실수2,문자,날짜
0,6,-0.523359,0.659371,apple,2021-02-25
1,48,-0.403826,0.925632,banana,2021-02-26
2,36,-0.089557,0.258317,mango,2021-02-27
3,23,0.254172,0.488167,berry,2021-02-28
4,54,0.222466,0.301571,melon,2021-03-01
5,59,0.381257,0.801714,cherry,2021-03-02
6,87,0.259845,0.772216,pineapple,2021-03-03
7,54,-0.198669,0.909934,peach,2021-03-04
8,15,-0.126438,0.138101,blueberry,2021-03-05
9,45,-0.320419,0.940905,avocado,2021-03-06


In [257]:
# 새로운 컬럼 추가
popDF['2014-2016 증가율'] = ((popDF['2016'] - popDF['2014']) / popDF['2014'] * 100).round(2)
popDF

feature,지역,2014,2016,2018,2021,증가율,2014-2016 증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343,33.84
부산,경상권,3384050,5384050,7384050,8384050,0.0434,59.1
경기,수도권,4938485,7938485,5938485,2938485,0.0944,60.75
대구,경상권,4203948,6203948,3203948,1203948,0.0034,47.57


In [258]:
# 컬럼 삭제
del popDF['2014-2016 증가율']
popDF

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [219]:
print(type(popDF[['지역', '증가율']]))
print(type(popDF['지역']))
print(type(popDF['지역'].values))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


- 행(row) indexing
- 슬라이싱만 가능
- 배열 인덱스, 라벨인덱스 가능

In [222]:
popDF

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944
대구,경상권,4203948,6203948,3203948,1203948,0.0034


In [231]:
# 배열인덱싱
display(popDF[:1])
# 라벨인덱싱
display(popDF[: '서울'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343


In [236]:
# 배열인덱싱
display(popDF[0:3])
# 라벨인덱싱
display(popDF['서울': '경기'])

feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


feature,지역,2014,2016,2018,2021,증가율
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5910293,7910293,8910293,9910293,0.2343
부산,경상권,3384050,5384050,7384050,8384050,0.0434
경기,수도권,4938485,7938485,5938485,2938485,0.0944


- 개별 인덱싱 : 특정 행에 대한 특정 컬럼

In [244]:
print(popDF['2021']['서울'])
print(popDF['2021'][:'서울'])
print(type(popDF['2021']['서울']))
print(type(popDF['2021'][:'서울']))

9910293
city
서울    9910293
Name: 2021, dtype: int64
<class 'numpy.int64'>
<class 'pandas.core.series.Series'>


In [243]:
popDF['2021'][['서울', '대구']]

city
서울    9910293
대구    1203948
Name: 2021, dtype: int64

### 실습

In [286]:
# Row labels (students) and column order (subjects) for the score table.
index   = ['김지은', '황인범', '김정수', '최호진']
columns = ['kor', 'eng', 'math']

# Scores per subject; each list is ordered like `index`.
score_data = dict(
    kor=[80, 90, 70, 30],
    eng=[90, 70, 60, 40],
    math=[90, 60, 90, 70],
)

exec_df = pd.DataFrame(data=score_data, columns=columns, index=index)
exec_df

Unnamed: 0,kor,eng,math
김지은,80,90,90
황인범,90,70,60
김정수,70,60,90
최호진,30,40,70


In [287]:
# 위 데이터를 보고 모든 학생의 수학 점수를 시리즈로 출력하라
display(exec_df["math"])
display(type(exec_df["math"]))

김지은    90
황인범    60
김정수    90
최호진    70
Name: math, dtype: int64

pandas.core.series.Series

In [288]:
# 모든 학생의 국어와 영어 점수를 데이터 프레임으로  만들어라
display(exec_df[["kor","eng"]])

Unnamed: 0,kor,eng
김지은,80,90
황인범,90,70
김정수,70,60
최호진,30,40


In [289]:
# 모든 학생의 각 과목 평균 점수를 새로운 열로 추가하라
exec_df['평균'] = ((exec_df["kor"] + exec_df["eng"] + exec_df["math"]) / 3).round(1)
exec_df

Unnamed: 0,kor,eng,math,평균
김지은,80,90,90,86.7
황인범,90,70,60,73.3
김정수,70,60,90,73.3
최호진,30,40,70,46.7


In [290]:
# Set 최호진's English score to 90 and recompute the average column.
# One .loc[row, column] assignment writes into exec_df itself. The chained form
# exec_df["eng"]["최호진"] = 90 assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and is not guaranteed to update the frame.
exec_df.loc["최호진", "eng"] = 90
exec_df['평균'] = ((exec_df["kor"] + exec_df["eng"] + exec_df["math"]) / 3).round(1)
exec_df


Unnamed: 0,kor,eng,math,평균
김지은,80,90,90,86.7
황인범,90,70,60,73.3
김정수,70,60,90,73.3
최호진,30,90,70,63.3


In [297]:
# Build a one-row DataFrame with 김지은's scores.
# Selecting with a list of labels keeps the result a DataFrame and picks the
# student by name; exec_df[:1] only worked because she is the first row.
display(exec_df.loc[["김지은"]])

Unnamed: 0,kor,eng,math,평균
김지은,80,90,90,86.7


In [298]:
# Show 김정수's scores as a Series.
# .loc with a single row label returns that row as a Series directly, so the
# whole frame does not have to be transposed (twice) just to read one row.
display(exec_df.loc["김정수"])
display(type(exec_df.loc["김정수"]))

kor     70.0
eng     60.0
math    90.0
평균      73.3
Name: 김정수, dtype: float64

pandas.core.series.Series

In [304]:
# Set 황인범's Korean and math scores to 100 and recompute the average column.
# One .loc assignment over both columns writes into exec_df itself. The chained
# form exec_df["kor"]["황인범"] = 100 assigns through an intermediate Series,
# which triggers SettingWithCopyWarning and is not guaranteed to update the frame.
exec_df.loc["황인범", ["kor", "math"]] = 100
exec_df['평균'] = ((exec_df["kor"] + exec_df["eng"] + exec_df["math"]) / 3).round(1)
exec_df

Unnamed: 0,kor,eng,math,평균
김지은,80,90,90,86.7
황인범,100,70,100,90.0
김정수,70,60,90,73.3
최호진,30,90,70,63.3


### pandas 문자함수
- 함수앞에 str붙이기

In [307]:
# head() , tail() : default가 5로 설정되어 있음
display(testDF.head())
display(testDF.tail())

Unnamed: 0,정수,실수1,실수2,문자,날짜
0,6,-0.523359,0.659371,apple,2021-02-25
1,48,-0.403826,0.925632,banana,2021-02-26
2,36,-0.089557,0.258317,mango,2021-02-27
3,23,0.254172,0.488167,berry,2021-02-28
4,54,0.222466,0.301571,melon,2021-03-01


Unnamed: 0,정수,실수1,실수2,문자,날짜
5,59,0.381257,0.801714,cherry,2021-03-02
6,87,0.259845,0.772216,pineapple,2021-03-03
7,54,-0.198669,0.909934,peach,2021-03-04
8,15,-0.126438,0.138101,blueberry,2021-03-05
9,45,-0.320419,0.940905,avocado,2021-03-06


In [308]:
courtDF = pd.read_csv('../data/court_code.txt', sep='\t', encoding='cp949')
courtDF.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [311]:
court_subset_df = courtDF[courtDF["폐지여부"]=="존재"]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재
...,...,...,...
46175,5013032022,제주특별자치도 서귀포시 표선면 하천리,존재
46176,5013032023,제주특별자치도 서귀포시 표선면 성읍리,존재
46177,5013032024,제주특별자치도 서귀포시 표선면 가시리,존재
46178,5013032025,제주특별자치도 서귀포시 표선면 세화리,존재


In [312]:
court_subset_df = courtDF[courtDF["폐지여부"]=="폐지"]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
89,1111090100,서울특별시 종로구 창신1동,폐지
90,1111090200,서울특별시 종로구 창신2동,폐지
91,1111090300,서울특별시 종로구 창신3동,폐지
92,1111090400,서울특별시 종로구 숭인1동,폐지
93,1111090500,서울특별시 종로구 숭인2동,폐지
...,...,...,...
45962,4972032025,제주도 남제주군 표선면 세화리,폐지
45963,4972032026,제주도 남제주군 표선면 토산리,폐지
46025,5011025305,제주특별자치도 제주시 애월읍 귀일리,폐지
46026,5011025306,제주특별자치도 제주시 애월읍 어도리,폐지


In [317]:
court_subset_df = courtDF[courtDF["법정동명"].str[:5] == "서울특별시"]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재
...,...,...,...
1107,1174010600,서울특별시 강동구 둔촌동,존재
1108,1174010700,서울특별시 강동구 암사동,존재
1109,1174010800,서울특별시 강동구 성내동,존재
1110,1174010900,서울특별시 강동구 천호동,존재


In [324]:
court_subset_df = courtDF["법정동명"].str[-1]
court_subset_df

0        시
1        구
2        동
3        동
4        동
        ..
46175    리
46176    리
46177    리
46178    리
46179    리
Name: 법정동명, Length: 46180, dtype: object

- 분할 : str.split()

In [323]:
# str.split()
# series 표현
court_subset_df = courtDF["법정동명"].str.split(" ")
court_subset_df

0                          [서울특별시]
1                     [서울특별시, 종로구]
2                [서울특별시, 종로구, 청운동]
3                [서울특별시, 종로구, 신교동]
4                [서울특별시, 종로구, 궁정동]
                   ...            
46175    [제주특별자치도, 서귀포시, 표선면, 하천리]
46176    [제주특별자치도, 서귀포시, 표선면, 성읍리]
46177    [제주특별자치도, 서귀포시, 표선면, 가시리]
46178    [제주특별자치도, 서귀포시, 표선면, 세화리]
46179    [제주특별자치도, 서귀포시, 표선면, 토산리]
Name: 법정동명, Length: 46180, dtype: object

In [326]:
# str.split()
# DataFrame 표현
court_subset_df = courtDF["법정동명"].str.split(" ", expand=True)
court_subset_df

Unnamed: 0,0,1,2,3,4
0,서울특별시,,,,
1,서울특별시,종로구,,,
2,서울특별시,종로구,청운동,,
3,서울특별시,종로구,신교동,,
4,서울특별시,종로구,궁정동,,
...,...,...,...,...,...
46175,제주특별자치도,서귀포시,표선면,하천리,
46176,제주특별자치도,서귀포시,표선면,성읍리,
46177,제주특별자치도,서귀포시,표선면,가시리,
46178,제주특별자치도,서귀포시,표선면,세화리,


- str.startswith() : ~로 시작되는
- str.endswith() : ~로 끝나는
- str.contains() : ~을 담고있는
- str.replace() : ~로 대체

In [327]:
# str.startswith()
# series 표현
court_subset_df = courtDF["법정동명"].str.startswith("제주")
court_subset_df

0        False
1        False
2        False
3        False
4        False
         ...  
46175     True
46176     True
46177     True
46178     True
46179     True
Name: 법정동명, Length: 46180, dtype: bool

In [332]:
# str.startswith()
# DataFrame 표현
court_subset_df = courtDF[courtDF["법정동명"].str.startswith("제주")]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
45607,4900000000,제주도,폐지
45608,4911000000,제주도 제주시,폐지
45609,4911010100,제주도 제주시 일도일동,폐지
45610,4911010200,제주도 제주시 일도이동,폐지
45611,4911010300,제주도 제주시 이도일동,폐지
...,...,...,...
46175,5013032022,제주특별자치도 서귀포시 표선면 하천리,존재
46176,5013032023,제주특별자치도 서귀포시 표선면 성읍리,존재
46177,5013032024,제주특별자치도 서귀포시 표선면 가시리,존재
46178,5013032025,제주특별자치도 서귀포시 표선면 세화리,존재


In [334]:
# str.endswith()
court_subset_df = courtDF[courtDF["법정동명"].str.endswith("시")]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1112,2100000000,부산직할시,폐지
1566,2200000000,대구직할시,폐지
1900,2300000000,인천직할시,폐지
2123,2400000000,광주직할시,폐지
...,...,...,...
42476,4833000000,경상남도 양산시,존재
45608,4911000000,제주도 제주시,폐지
45656,4913000000,제주도 서귀포시,폐지
45965,5011000000,제주특별자치도 제주시,존재


In [336]:
# str.contains()
court_subset_df = courtDF[courtDF["법정동명"].str.contains("강서") & courtDF["법정동명"].str.contains("부산")]
court_subset_df

Unnamed: 0,법정동코드,법정동명,폐지여부
1541,2144000000,부산직할시 강서구,폐지
1542,2144010100,부산직할시 강서구 대저일동,폐지
1543,2144010200,부산직할시 강서구 대저이동,폐지
1544,2144010300,부산직할시 강서구 강동동,폐지
1545,2144010400,부산직할시 강서구 명지동,폐지
1546,2144010500,부산직할시 강서구 죽림동,폐지
1547,2144010600,부산직할시 강서구 식만동,폐지
1548,2144010700,부산직할시 강서구 죽동동,폐지
1549,2144010800,부산직할시 강서구 봉림동,폐지
1550,2144010900,부산직할시 강서구 송정동,폐지


In [337]:
# str.replace(): replace every space in the name with a hyphen.
# regex=False makes this a literal replacement regardless of the pandas
# version (the default for `regex` differs between pandas 1.x and 2.x).
court_subset_df = courtDF["법정동명"].str.replace(" ", "-", regex=False)
court_subset_df

0                       서울특별시
1                   서울특별시-종로구
2               서울특별시-종로구-청운동
3               서울특별시-종로구-신교동
4               서울특별시-종로구-궁정동
                 ...         
46175    제주특별자치도-서귀포시-표선면-하천리
46176    제주특별자치도-서귀포시-표선면-성읍리
46177    제주특별자치도-서귀포시-표선면-가시리
46178    제주특별자치도-서귀포시-표선면-세화리
46179    제주특별자치도-서귀포시-표선면-토산리
Name: 법정동명, Length: 46180, dtype: object

- str.strip() : 좌우공백 없애기
- str.lstrip() : 왼쪽 공백 없애기
- str.rstrip() : 오른쪽 공백 없애기
- str.lower() : 소문자 전환
- str.upper() : 대문자 전환
- str.swapcase() : 대문자는 소문자로, 소문자는 대문자로 전환

In [338]:
emptyDF = pd.DataFrame({
    'col01' : ['abcd   ' , ' FFFght '  , 'abCCe    '],
    'col02' : ['   fjHij' , ' ffght '  , 'Ibcce    '],
        
})

emptyDF

Unnamed: 0,col01,col02
0,abcd,fjHij
1,FFFght,ffght
2,abCCe,Ibcce


In [341]:
# str.strip()
print(emptyDF['col01'].str.strip())
# str.lower()
print(emptyDF['col01'].str.lower())
# str.strip() , str.upper()
print(emptyDF['col01'].str.strip().str.upper())

0      abcd
1    FFFght
2     abCCe
Name: col01, dtype: object
0      abcd   
1      fffght 
2    abcce    
Name: col01, dtype: object
0      ABCD
1    FFFGHT
2     ABCCE
Name: col01, dtype: object
