# Pandas

### 구조화된 데이터의 처리를 지원하는 Python 라이브러리 => Python의 엑셀

- 구조화된 데이터의 처리를 지원하는 Python 라이브러리
- 고성능 Array 계산 라이브러리인 Numpy와 통합하여, 강력한 "스프레드시트" 처리 기능을 제공
- 인덱싱, 연산용 함수, 전처리 함수 등을 제공함

## 1. data loading

In [1]:
import pandas as pd

In [3]:
# URL of the whitespace-delimited housing data file on the UCI archive server
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# Load the file with the CSV reader: fields are separated by runs of
# whitespace and there is no header row, so columns get integer labels.
# The separator is a regex, so it is written as a raw string; a plain '\s+'
# is an invalid string escape and triggers a warning on modern Python.
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [4]:
# Display the first five rows of the loaded table
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


![1](nb_images/pd1.png)

![2](nb_images/pd2.png)

![3](nb_images/pd3.png)

## <span class="mark">Series = Numpy + Index</span>
### Numpy가 할 수 있는 모든 연산은 다 지원한다.

In [12]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
# Create an empty Series. The dtype is given explicitly because the default
# dtype of an empty Series differs between pandas versions (float64 in older
# releases, object in newer ones), and some releases warn when it is omitted.
example_obj = Series(dtype="float64")

![4](nb_images/pd4.png)

In [4]:
# Build a Series from a plain list. No index is supplied, so the rows are
# labelled with the default integer positions.
list_data = [1, 2, 3, 4, 5]
example_obj = Series(list_data)
print(example_obj)

0    1
1    2
2    3
3    4
4    5
dtype: int64


![5](nb_images/pd5.png)

![6](nb_images/pd6.png)

In [5]:
# Same values as before, but each one is paired with an explicit string
# label that replaces the default integer index.
list_data = [1, 2, 3, 4, 5]
list_name = ["a", "b", "c", "d", "e"]
example_obj = Series(list_data, index=list_name)
print(example_obj)

a    1
b    2
c    3
d    4
e    5
dtype: int64


![7](nb_images/pd7.png)

In [13]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data. The element dtype and the Series name are set
# explicitly at construction time.
dict_data = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
example_obj = Series(dict_data, dtype=np.float32, name="example_data")
print(example_obj)

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32


![8](nb_images/pd8.png)

In [14]:
# Look up a single element by its index label.
example_obj["a"]

1.0

In [15]:
# Overwrite the element stored under label "a", then print the updated Series.
example_obj["a"] = 3.2
print(example_obj)

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32


![9](nb_images/pd9.png)

In [16]:
# `.values` exposes the data without the index; the second print shows the
# Python type of that object (a NumPy ndarray in the recorded output).
print(example_obj.values)
print(type(example_obj.values))

[3.2 2.  3.  4.  5. ]
<class 'numpy.ndarray'>


In [17]:
# `.index` holds the row labels; the second print shows the Python type of
# the index object.
print(example_obj.index)
print(type(example_obj.index))

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
<class 'pandas.indexes.base.Index'>


In [18]:
# Rename the Series itself and give its index a name; both names appear in
# the printed representation.
example_obj.name = "number"
example_obj.index.name = "alphabet"
print(example_obj)

alphabet
a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32


![10](nb_images/pd10.png)

In [23]:
# The requested index lists more labels than the dict has keys. Labels with
# no matching key ("f", "g", "h") end up as NaN in the resulting Series.
dict_data_1 = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
indexes = ["a", "b", "c", "d", "e", "f", "g", "h"]
series_obj_1 = Series(dict_data_1, index=indexes)
print(series_obj_1)

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64


In [25]:
# Boolean indexing: `example_obj > 2` is evaluated element-wise, and indexing
# with the result keeps only the rows where the comparison is True.
example_obj[example_obj > 2]

alphabet
a    3.2
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32

In [26]:
# Arithmetic with a scalar is applied to every element; the index labels and
# the Series name are carried over to the result.
example_obj * 2

alphabet
a     6.4
b     4.0
c     6.0
d     8.0
e    10.0
Name: number, dtype: float32