# Pandas Class
- Series 
- DataFrame
- Grouped
- ...

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

In [3]:
# Multiples of ten from 10 through 50; sample data for the Series examples below.
my_list = list(range(10, 60, 10))
my_list

[10, 20, 30, 40, 50]

In [4]:
# Letter keys "a".."e" mapped to 100..500 in steps of 100 (insertion order a -> e).
my_dict = dict(zip("abcde", range(100, 600, 100)))
my_dict

{'a': 100, 'b': 200, 'c': 300, 'd': 400, 'e': 500}

# Series Indexing
- 위치 (Python 인덱싱):
  - iloc
- 라벨, 값의 이름, 인덱스:
  - loc
  - dict 형식
  - . 연산자

In [15]:
my_ss = pd.Series(my_list)
my_ss

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
my_ss[0] # anti-pattern: a bare integer in [] is ambiguous (label vs. position); prefer .loc / .iloc

10

In [7]:
my_ss.iloc[0] # first value (position 0)

10

In [8]:
my_ss.iloc[-1] # last value (negative positions count from the end)

50

In [9]:
my_ss1 = my_ss.copy()
my_ss1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [10]:
my_ss1.index = ['a', 'b', 'c', 'd', 'e']
my_ss1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [11]:
my_ss1['a']

10

In [12]:
my_ss1.loc['a']

10

In [13]:
my_ss1.a

10

In [14]:
# Boolean mask: one flag per element; only the elements flagged True are kept.
my_ss[[True, False, True, False, True]]

0    10
2    30
4    50
dtype: int64

In [16]:
my_ss[my_ss > 30]

3    40
4    50
dtype: int64

In [None]:
my_ss[[0, 2]] # anti-pattern: a list of integers in bare [] is ambiguous (labels vs. positions); prefer .iloc[[0, 2]]

In [18]:
my_ss.iloc[[0, 2]]

0    10
2    30
dtype: int64

In [19]:
my_ss = pd.Series(my_list,
                  index=['a', 'b', 'c', 'd', 'e'])
my_ss

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [20]:
my_ss['a']

10

In [21]:
my_ss.a

10

In [22]:
my_ss.loc['a']

10

In [23]:
my_ss[[True, False, True, False, True]]

a    10
c    30
e    50
dtype: int64

In [24]:
my_ss[['a', 'b']]

a    10
b    20
dtype: int64

In [26]:
# Reassign the index so the last label is the integer 0 while the others stay strings,
# giving a mixed-type index for the lookups that follow.
my_ss.index = ['a', 'b', 'c', 'd', 0]
my_ss

a    10
b    20
c    30
d    40
0    50
dtype: int64

In [None]:
# Anti-pattern: a list of integers in bare [] on an index that mixes strings with
# the integer label 0. On current pandas versions the integers are resolved as
# labels, and since the label 2 does not exist the lookup raises KeyError; use
# .iloc[[0, 2]] when positions are meant. The error is caught so the notebook
# still runs top to bottom, and the message is shown as the cell result.
try:
    lookup_result = my_ss[[0, 2]]
except KeyError as exc:
    lookup_result = f"KeyError: {exc}"
lookup_result

In [28]:
my_ss.iloc[[0, 2]]

a    10
c    30
dtype: int64

In [29]:
my_ss.iloc[0]

10

In [30]:
my_ss.iloc[-1]

50

In [31]:
my_ss.iloc[2]

30

## 슬라이싱
- 순서:
  - iloc (파이썬의 슬라이싱과 동일) 
  - iloc[start:stop:step]: [start, stop)
- 라벨을 이용한 슬라이싱:
  - loc
    - my_ss.loc[start:stop]: [start, stop]
  - dict 형식
    - my_ss[start:stop]: 라벨로 슬라이싱하면 [start, stop] (단, 정수로 슬라이싱하면 iloc처럼 위치 기준 [start, stop)로 동작하므로 안티패턴)

In [32]:
my_ss = pd.Series(my_list)
my_ss

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [34]:
my_ss.iloc[2:] # from position 2 through the end

2    30
3    40
4    50
dtype: int64

In [35]:
my_ss.iloc[:2]

0    10
1    20
dtype: int64

In [36]:
my_ss.iloc[2:4]

2    30
3    40
dtype: int64

In [37]:
my_ss.iloc[:]

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [39]:
my_ss[:3] # anti-pattern: an integer slice in bare [] is positional (stop excluded); write .iloc[:3] to be explicit

0    10
1    20
2    30
dtype: int64

In [40]:
my_ss = pd.Series(my_list, index=list('abcde'))
my_ss

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [41]:
my_ss.iloc[:2] # from the start through position 1 (stop position 2 is excluded)

a    10
b    20
dtype: int64

In [42]:
my_ss.iloc[2:] # from position 2 through the last value

c    30
d    40
e    50
dtype: int64

In [43]:
my_ss.iloc[2:4]

c    30
d    40
dtype: int64

In [44]:
my_ss.iloc[:]

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [45]:
my_ss['a':'b']

a    10
b    20
dtype: int64

In [46]:
my_ss[:'c']

a    10
b    20
c    30
dtype: int64

In [47]:
my_ss['c':]

c    30
d    40
e    50
dtype: int64

In [48]:
my_ss.loc['a':'b']

a    10
b    20
dtype: int64

In [49]:
my_ss.loc[:'c']

a    10
b    20
c    30
dtype: int64

In [50]:
my_ss.loc['c':]

c    30
d    40
e    50
dtype: int64

In [51]:
my_ss

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [52]:
my_ss1 = my_ss * 3 + 2
my_ss1

a     32
b     62
c     92
d    122
e    152
dtype: int64

In [53]:
# A plain Python list does not support element-wise arithmetic: list + int raises
# TypeError (contrast with the Series arithmetic in the surrounding cells).
# The error is caught and shown as the cell result so that Restart & Run All
# does not stop at this demonstration.
try:
    list_plus_int = my_list + 3
except TypeError as exc:
    list_plus_int = f"TypeError: {exc}"
list_plus_int

TypeError: ignored

In [54]:
my_list + [3]

[10, 20, 30, 40, 50, 3]

In [55]:
my_ss + 2 # the scalar is broadcast to every element (numpy-style): same result as my_ss + [2, 2, 2, 2, 2]

a    12
b    22
c    32
d    42
e    52
dtype: int64

In [56]:
my_ss + my_ss1

a     42
b     82
c    122
d    162
e    202
dtype: int64

In [57]:
my_ss - 2 # scalar broadcast again: same result as my_ss - [2, 2, 2, 2, 2]

a     8
b    18
c    28
d    38
e    48
dtype: int64

In [58]:
my_ss1 - my_ss

a     22
b     42
c     62
d     82
e    102
dtype: int64

In [59]:
my_ss / 3

a     3.333333
b     6.666667
c    10.000000
d    13.333333
e    16.666667
dtype: float64

In [60]:
my_ss * 3

a     30
b     60
c     90
d    120
e    150
dtype: int64

In [61]:
my_ss // 3

a     3
b     6
c    10
d    13
e    16
dtype: int64

In [62]:
my_ss % 3

a    1
b    2
c    0
d    1
e    2
dtype: int64

In [63]:
# NaN is a float, so an int64 Series cannot hold it. Cast to float64 explicitly
# first: relying on item assignment to upcast the dtype is deprecated in
# pandas >= 2.1 (PDEP-6). Rebinding my_ss1 also keeps this cell safe to re-run.
my_ss1 = my_ss1.astype("float64")
my_ss1.loc['d'] = np.nan
my_ss1

a     32.0
b     62.0
c     92.0
d      NaN
e    152.0
dtype: float64

In [64]:
my_ss + my_ss1

a     42.0
b     82.0
c    122.0
d      NaN
e    202.0
dtype: float64

In [65]:
# fill_value=0 stands in for the missing value before adding, so 'd' becomes
# 40 + 0 = 40.0 instead of NaN as with the plain + operator.
my_ss.add(my_ss1, fill_value=0)

a     42.0
b     82.0
c    122.0
d     40.0
e    202.0
dtype: float64

In [68]:
my_ss >= 30

a    False
b    False
c     True
d     True
e     True
dtype: bool

In [69]:
my_ss[my_ss >= 30]

c    30
d    40
e    50
dtype: int64

In [72]:
my_ss[(my_ss >= 30) | (my_ss <= 10)]

a    10
c    30
d    40
e    50
dtype: int64

In [74]:
my_ss.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [75]:
my_ss.name # the Series was created without a name, so this is None and nothing is displayed

In [76]:
my_ss.values

array([10, 20, 30, 40, 50])

In [77]:
my_ss.dtype

dtype('int64')

In [78]:
# Common dtypes seen in a Series:
# np.int64, np.int32      - integers
# np.float64, np.float32  - floating point
# np.nan                  - not a dtype itself, but a float value marking missing data
# object, 'O'             - Python objects, e.g. str

In [79]:
my_ss.size

5

In [81]:
my_ss.ndim # number of dimensions: 1 for a Series (index only); a DataFrame has 2 (index, columns)

1

In [82]:
# Load seaborn's "iris" example dataset into a DataFrame.
# NOTE: per the seaborn docs, load_dataset downloads the file from seaborn's online
# data repository on first use and caches it, so a fresh environment needs network access.
my_iris = sns.load_dataset("iris")
my_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [83]:
# Draw 10 random rows; the fixed random_state makes the same rows come back on
# every Restart & Run All, so the displayed output stays reproducible.
my_iris.sample(10, random_state=42)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
22,4.6,3.6,1.0,0.2,setosa
106,4.9,2.5,4.5,1.7,virginica
73,6.1,2.8,4.7,1.2,versicolor
58,6.6,2.9,4.6,1.3,versicolor
139,6.9,3.1,5.4,2.1,virginica
121,5.6,2.8,4.9,2.0,virginica
69,5.6,2.5,3.9,1.1,versicolor
46,5.1,3.8,1.6,0.2,setosa
74,6.4,2.9,4.3,1.3,versicolor
63,6.1,2.9,4.7,1.4,versicolor


In [85]:
my_iris.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [86]:
my_titanic = sns.load_dataset('titanic')
my_titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [88]:
my_titanic.pclass.unique()

array([3, 1, 2])

In [89]:
my_titanic.age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [90]:
my_titanic.alive

0       no
1      yes
2      yes
3      yes
4       no
      ... 
886     no
887    yes
888     no
889    yes
890     no
Name: alive, Length: 891, dtype: object