# Pandas
- 구조화된 데이터의 처리를 지원하는 Python 라이브러리
- Python계의 엑셀!
- 고성능 Array 계산 라이브러리인 Numpy와 통합하여, 강력한 '스프레드시트' 처리 기능을 제공
- 인덱싱, 연산용 함수, 전처리 함수 등을 제공함

In [1]:
import pandas as pd # 라이브러리 호출

In [2]:
# UCI housing data set: whitespace-delimited text without a header row.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# The separator is a regular expression, so it is written as a raw string:
# '\s' in a plain string literal is an invalid escape sequence and triggers a
# DeprecationWarning (SyntaxWarning on newer Python versions).
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [3]:
df_data.head() # show the first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
df_data.values # the table's contents as a NumPy array

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 3.9690e+02, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 3.9283e+02, 4.0300e+00,
        3.4700e+01],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

# Pandas의 구성
![image.png](attachment:image.png)
- DataFrame : Data Table 전체를 포함하는 Object
- Series : DataFrame 중 하나의 Column에 해당하는 데이터의 모음 Object

## Series
Column Vector를 표현하는 object

In [5]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [6]:
# Build a Series from a plain Python list; the index defaults to 0..4.
list_data = [1, 2, 3, 4, 5]
example_obj = Series(list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
# Same values, this time with explicit string labels as the index.
list_data = [1, 2, 3, 4, 5]
list_name = list('abcde')
example_obj = Series(list_data, index=list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
# A dict supplies the index labels (keys) and the values in one go;
# the dtype and the Series name are set explicitly.
dict_data = dict(zip('abcde', range(1, 6)))
example_obj = Series(data=dict_data, name='example_data', dtype=np.float32)
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [9]:
example_obj['a'] # read a value by its index label

1.0

In [10]:
example_obj['a']=3.2 # assign a new value at index label 'a'
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

## DataFrame
Series를 모아서 만든 Data Table = 기본 2차원

In [11]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [12]:
# Each key is a column name and each list holds that column's values.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Miner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Mianmi', 'Douglas', 'Boston'],
}
# `columns` fixes the column order of the resulting table.
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Mianmi
3,Jake,Miner,24,Douglas
4,Amy,Cooze,73,Boston


In [13]:
DataFrame(raw_data, columns=['age','city']) # keep only the listed columns

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Mianmi
3,24,Douglas
4,73,Boston


In [14]:
DataFrame(raw_data, columns=['first_name','last_name','age','city','debt']) # a listed column that is not in the data ('debt') is added with missing values

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Mianmi,
3,Jake,Miner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [15]:
df=DataFrame(raw_data, columns=['first_name','last_name','age','city','debt'])
df.first_name # attribute access: returns the column as a Series

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [16]:
df['first_name'] # bracket access: returns the same column as a Series

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [17]:
df.loc[1] # loc selects by index label (here the row labelled 1)

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [18]:
df['age'].iloc[1:] # iloc selects by integer position (everything from the second entry on)

1    52
2    36
3    24
4    73
Name: age, dtype: int64

loc는 index 이름(label) 기준, iloc는 index 위치(0부터 시작하는 정수 번호) 기준으로 선택

In [19]:
# All-NaN Series whose integer labels (49..45, then 1..5) do not follow
# positional order.
s = pd.Series(data=np.nan, index=[*range(49, 44, -1), *range(1, 6)])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [20]:
s.loc[:3] # label-based slice: everything up to and including the label 3

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [21]:
s.iloc[:3] # position-based slice: the first three values

49   NaN
48   NaN
47   NaN
dtype: float64

In [22]:
# Overwrite the 'debt' column with a boolean Series (True where age > 40).
# Bracket indexing is used for the assignment: attribute-style assignment
# (df.debt = ...) only updates a column that already exists; for a new name
# pandas would set a plain attribute instead of creating a column.
df['debt'] = df['age'] > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Mianmi,False
3,Jake,Miner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [23]:
df.T # transpose: rows and columns are swapped

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Miner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Mianmi,Douglas,Boston
debt,True,True,False,False,True


In [24]:
df.values # the table's contents as a NumPy array (dtype=object for these mixed-type columns)

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Mianmi', False],
       ['Jake', 'Miner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [25]:
df.to_csv() # called without a path, returns the CSV text as a string

',first_name,last_name,age,city,debt\r\n0,Jason,Miller,42,San Francisco,True\r\n1,Molly,Jacobson,52,Baltimore,True\r\n2,Tina,Ali,36,Mianmi,False\r\n3,Jake,Miner,24,Douglas,False\r\n4,Amy,Cooze,73,Boston,True\r\n'

In [26]:
del df['debt'] # remove the column from df
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Mianmi
3,Jake,Miner,24,Douglas
4,Amy,Cooze,73,Boston


In [27]:
# Nested dict: outer keys become columns, inner keys become index labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
DataFrame(data=pop)

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


# Selection & Drop

## Selection with column names

In [28]:
df['city'].head(3) # select a single column (first three entries)

0    San Francisco
1        Baltimore
2           Mianmi
Name: city, dtype: object

In [29]:
df['age'].head(3) # select a single column (first three entries)

0    42
1    52
2    36
Name: age, dtype: int64

In [30]:
df[['first_name','last_name','age']].head(3) # select several columns with a list of names

Unnamed: 0,first_name,last_name,age
0,Jason,Miller,42
1,Molly,Jacobson,52
2,Tina,Ali,36


In [31]:
df[:3] # a bare slice without a column name selects rows (here the first three)

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Mianmi


In [32]:
df['city'][:3] # column name plus a row slice: rows of that column only

0    San Francisco
1        Baltimore
2           Mianmi
Name: city, dtype: object

In [33]:
age_series=df['age'] # keep the 'age' column as a Series
age_series[:3] # first three entries

0    42
1    52
2    36
Name: age, dtype: int64

In [34]:
age_series[[0,1,2]] # select several entries by passing a list of index values

0    42
1    52
2    36
Name: age, dtype: int64

In [35]:
age_series[age_series<40] # boolean mask: keep only the entries below 40

2    36
3    24
Name: age, dtype: int64

## Index 변경

In [36]:
df.index=df['age'] # use the values of the 'age' column as the index
del df['age'] # the column is now redundant, so remove it
df.head()

Unnamed: 0_level_0,first_name,last_name,city
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,Jason,Miller,San Francisco
52,Molly,Jacobson,Baltimore
36,Tina,Ali,Mianmi
24,Jake,Miner,Douglas
73,Amy,Cooze,Boston


## Basic, loc, iloc selection

In [37]:
df[['first_name','last_name']][:2] # columns by name, then the first two rows by slice

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [38]:
df.loc[[42,52],['first_name','last_name']] # loc: rows by index label (42, 52), columns by name

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [39]:
df.iloc[:2,:2] # iloc: first two rows and first two columns by integer position

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


## index 재설정

In [40]:
# Replace the index with consecutive integers 0..n-1. The length is taken
# from the frame itself instead of being hard-coded, so the assignment stays
# valid for any number of rows.
df.index = list(range(len(df)))
df.head()

Unnamed: 0,first_name,last_name,city
0,Jason,Miller,San Francisco
1,Molly,Jacobson,Baltimore
2,Tina,Ali,Mianmi
3,Jake,Miner,Douglas
4,Amy,Cooze,Boston


## Data drop

In [41]:
df.drop(1) # drop the row whose index label is 1; the result is a new table, df is unchanged

Unnamed: 0,first_name,last_name,city
0,Jason,Miller,San Francisco
2,Tina,Ali,Mianmi
3,Jake,Miner,Douglas
4,Amy,Cooze,Boston


In [42]:
df.drop([0,1,2,3]) # drop several rows by passing a list of index labels

Unnamed: 0,first_name,last_name,city
4,Amy,Cooze,Boston


In [43]:
df.drop('city',axis=1) # axis=1 drops along the columns: remove the 'city' column

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson
2,Tina,Ali
3,Jake,Miner
4,Amy,Cooze


In [44]:
df.drop(['last_name','city'],axis=1) # drop several columns at once

Unnamed: 0,first_name
0,Jason
1,Molly
2,Tina
3,Jake
4,Amy


## Series operation

In [45]:
# Values 1..5 labelled 'a'..'e'.
s1 = Series(data=range(1, 6), index=['a', 'b', 'c', 'd', 'e'])
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [46]:
# Values 5..10; note that the label 'e' appears twice in this index.
s2 = Series(data=range(5, 11), index=['b', 'c', 'e', 'd', 'e', 'f'])
s2

b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64

In [47]:
s1.add(s2) # addition is aligned on index labels, not on position

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

In [48]:
s1+s2 # same result as add(); labels present in only one Series ('a', 'f') give NaN

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

## Dataframe operation

In [49]:
# 3x3 table holding 0..8 with columns 'a'..'c'.
df1 = DataFrame(data=np.arange(9).reshape(3, 3), columns=['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [50]:
# 4x4 table holding 0..15 with columns 'a'..'d'.
df2 = DataFrame(data=np.arange(16).reshape(4, 4), columns=['a', 'b', 'c', 'd'])
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [51]:
df1+df2 # aligned on both index and columns; cells missing from either table become NaN

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [52]:
df1.add(df2,fill_value=0) # fill_value=0 stands in for a cell missing from one table, so the sum is not NaN

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


## Series + Dataframe

In [76]:
# 4x4 table holding 0..15 with columns 'a'..'d'.
df = DataFrame(data=np.arange(16).reshape(4, 4), columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [77]:
# Values 10..13 labelled with the same names as the table's columns.
s = Series(data=range(10, 14), index=['a', 'b', 'c', 'd'])
s

a    10
b    11
c    12
d    13
dtype: int64

In [78]:
df+s # the Series labels are matched to the column names and the Series is added to every row

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


In [79]:
df.add(s) # same result as df+s; by default add() matches the Series against the columns (axis=1)

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


In [56]:
# Fresh 4x4 table holding 0..15 with columns 'a'..'d'.
df = DataFrame(data=np.arange(16).reshape(4, 4), columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [57]:
# Values 10..13 with the default integer index 0..3.
s2 = Series(data=range(10, 14))
s2

0    10
1    11
2    12
3    13
dtype: int64

In [58]:
df+s2 # s2's labels (0..3) match none of the column names, so every cell of the result is NaN

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [59]:
df.add(s2, axis=0) # axis=0 matches the Series against the row index: one value is added per row

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


# Lambda, map, apply

## Lambda 함수
- 한 줄로 함수를 표현하는 익명 함수 기법
- Lisp 언어에서 시작된 기법으로 오늘날 현대언어에 많이 사용
- `lambda argument : expression`

In [60]:
f=lambda x,y: x+y # lambda taking two arguments and returning their sum
f(1,4)

5

In [61]:
f=lambda x: x/2 # lambda taking a single argument
f(3)

1.5

In [62]:
f=lambda x: x**2 # lambda taking a single argument
f(3)

9

In [63]:
(lambda x: x+1)(5) # a lambda can be called directly without binding it to a name

6

## map 함수
- 함수와 sequence 형 데이터를 인자로 받아
- 각 element마다 입력받은 함수를 적용한 결과를 반환 (Python 3에서는 list가 아닌 map 객체(iterator)를 반환하므로, list로 보려면 `list()`로 감싸야 함)
- 일반적으로 함수를 lambda 형태로 표현함
- `map(function, sequence)`

In [64]:
ex=[1,2,3,4,5]
f=lambda x: x**2 # function applied to each element
list(map(f, ex)) # the map() result is wrapped in list() to obtain a list

[1, 4, 9, 16, 25]

In [65]:
f=lambda x,y: x+y # a two-argument function needs one sequence per argument
list(map(f, ex, ex))

[2, 4, 6, 8, 10]

In [66]:
list(map(lambda x: x+x, ex)) # an anonymous lambda can be passed to map() directly

[2, 4, 6, 8, 10]

### map for series
- Pandas의 series type의 데이터에도 map 함수 사용가능
- function 대신 dict, sequence형 자료 등으로 대체 가능

In [67]:
s1=Series(np.arange(10)) # Series holding 0..9
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [68]:
s1.map(lambda x: x**2).head(5) # Series.map applies the function to each element

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [69]:
z={1:'A', 2:'B', 3:'C'}
s1.map(z).head(5) # values are replaced through the dict; values with no matching key become NaN

0    NaN
1      A
2      B
3      C
4    NaN
dtype: object

In [70]:
s2=Series(np.arange(10,20))
s1.map(s2).head(5) # each value in s1 is replaced by the s2 entry whose index label equals that value

0    10
1    11
2    12
3    13
4    14
dtype: int32

### apply for dataframe
- map과 달리, series 전체(column)에 해당 함수를 적용
- 입력 값이 series 데이터로 입력받아 handling 가능
![image.png](attachment:image.png)

- 내장 연산 함수를 사용할 때도 똑같은 효과를 거둘 수 있음
- mean, std 등 사용 가능
![image.png](attachment:image.png)

- scalar 값 이외에 series 값의 반환도 가능함
![image.png](attachment:image.png)

- series 단위가 아닌 element 단위로 함수를 적용함
- series 단위에 apply를 적용시킬 때와 같은 효과
![image.png](attachment:image.png)

# Pandas Built-in functions

## describe
- Numeric type 데이터의 요약 정보를 보여줌

In [71]:
# Sample table for the built-in function examples: one numeric column (age)
# and three text columns.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Miner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Mianmi', 'Douglas', 'Boston'],
}
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Mianmi
3,Jake,Miner,24,Douglas
4,Amy,Cooze,73,Boston


In [72]:
df.describe() # summary statistics; only the numeric column (age) appears in the result

Unnamed: 0,age
count,5.0
mean,45.4
std,18.460769
min,24.0
25%,36.0
50%,42.0
75%,52.0
max,73.0


In [73]:
df.city.unique() # array of the distinct values in the column

array(['San Francisco', 'Baltimore', 'Mianmi', 'Douglas', 'Boston'],
      dtype=object)

In [74]:
np.array(dict(enumerate(df['city'].unique()))) # {number: label} dict built with enumerate; np.array wraps the whole dict in a single object array

array({0: 'San Francisco', 1: 'Baltimore', 2: 'Mianmi', 3: 'Douglas', 4: 'Boston'},
      dtype=object)

In [75]:
# Split the unique city labels into two parallel lists: the labels themselves
# (as str) and their integer codes 0..n-1. Building them directly avoids
# pushing (code, label) pairs through a NumPy string array and converting the
# codes back to int, and it also works when the column is empty.
key = [str(label) for label in df['city'].unique()]
value = list(range(len(key)))

value, key # integer code of each label, and the label text

([0, 1, 2, 3, 4],
 ['San Francisco', 'Baltimore', 'Mianmi', 'Douglas', 'Boston'])