# Intro to Data Structures
- 출처 : https://pandas.pydata.org/pandas-docs/stable/dsintro.html

In [3]:
import numpy as np
import pandas as pd

## Series

- **Series**는 일차원 구조의 데이터이며 다양한 데이터 타입을 가질 수 있음 (int, str, float, Python object, etc.)
- index라는 축 라벨이 존재

<br/>
```python
s = pd.Series(data, index = index)
```  
Here, data can be many different things:
- a Python dict
- an ndarray
- a scalar value (like 5)

### From ndarray

In [2]:
# Build a 5-element Series from an ndarray, labelled 'a'..'e'.
# The values come from a seeded generator so that every Restart & Run All
# produces the same numbers instead of a fresh random draw.
s = pd.Series(np.random.default_rng(0).standard_normal(5), index=list('abcde'))
s

a    0.868649
b    0.907287
c    0.479810
d    1.102607
e   -0.907131
dtype: float64

In [15]:
# The axis labels of a Series are exposed as an Index object.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

- Pandas는 unique하지 않은 index values를 지원
- 중복된 index를 지원하지 않는 연산을 수행할 시 error 발생
- The reason for checking lazily (only when such an operation is attempted, rather than when the index is created) is nearly all performance-based (there are many instances in computations, like parts of GroupBy, where the index is not used).

### From dict

In [17]:
# Dict keys become the index labels and dict values become the data.
d = dict(b=1, a=0, c=2)
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [19]:
# With an explicit index, values are pulled from the dict by label in the
# order given; 'd' has no entry in the dict and shows up as NaN, and the
# displayed dtype is float64 rather than int64.
pd.Series(d, index = list('bcda'))

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

### From scalar value

In [20]:
# A scalar is repeated once for every label in the supplied index.
pd.Series(5., index = list('abcde'))

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

### Series is ndarray-like

In [27]:
# Show the current contents of `s` before indexing into it.
# NOTE(review): these values differ from the first display of `s` above, so
# the cell that creates `s` was re-run in between (unseeded random data).
s

a    0.041993
b   -0.934759
c   -0.594902
d    0.093635
e   -1.021400
dtype: float64

In [21]:
# Select the first element by position. `.iloc` states the positional intent
# explicitly; a bare integer key such as `s[0]` on a label-indexed Series
# relies on a positional fallback that pandas has deprecated.
s.iloc[0]

0.041992802228601626

In [22]:
# First three elements by position; the labels travel with the values.
s.iloc[:3]

a    0.041993
b   -0.934759
c   -0.594902
dtype: float64

In [23]:
# Boolean-mask selection: keep only the entries greater than the median.
s[s > s.median()]

a    0.041993
d    0.093635
dtype: float64

In [26]:
# Pick elements by position in an arbitrary order. `.iloc` is used because
# a list of integers passed to plain `[]` on a label-indexed Series depends
# on the deprecated positional fallback.
s.iloc[[4, 3, 1]]

e   -1.021400
d    0.093635
b   -0.934759
dtype: float64

In [28]:
# A NumPy ufunc applied to a Series is evaluated element-wise; the result
# shown is again a Series with the same labels.
np.exp(s)

a    1.042887
b    0.392680
c    0.551617
d    1.098159
e    0.360090
dtype: float64

### Series is dict-like

In [29]:
# Dict-style access: fetch a value by its index label.
s['a']

0.041992802228601626

In [30]:
# Same label-based lookup, here for the label 'e'.
s['e']

-1.021400331734986

In [32]:
# Overwrite the value stored under label 'e', then read it back.
s.loc['e'] = 12.0
s.loc['e']

12.0

In [33]:
# Like a dict, `in` checks whether a label is present in the index.
'e' in s

True

In [34]:
# 'f' is not one of the index labels, so this is False.
'f' in s

False

In [43]:
# Membership looks at index labels, not at values: 12.0 was stored under
# 'e' above, yet it is not a label, so the result is False.
12.0 in s

False

In [46]:
# get() returns None for the missing label 'f', which is why this cell
# shows no output.
s.get('f')

In [48]:
# The second argument is the value returned when the label is missing.
s.get('f', np.nan)

nan

### Vectorized operations 

In [4]:
# Element-wise addition of the Series with itself; no explicit loop needed.
s + s

a    1.737297
b    1.814574
c    0.959620
d    2.205213
e   -1.814263
dtype: float64

In [5]:
# Multiplying by a scalar applies to every element (same result as s + s).
s * 2

a    1.737297
b    1.814574
c    0.959620
d    2.205213
e   -1.814263
dtype: float64

In [6]:
# NOTE(review): this exponentiates the scalar 2, not the Series, so it does
# not demonstrate a vectorized operation. Presumably `np.exp(s)` was
# intended here -- confirm and re-run the cell.
np.exp(2)

7.38905609893065

In [7]:
# The two slices are aligned by label before adding. Labels present in only
# one operand ('a' and 'e') come out as NaN in the result.
s[1:] + s[:-1]

a         NaN
b    1.814574
c    0.959620
d    2.205213
e         NaN
dtype: float64

### Name attribute

In [4]:
# A Series can carry a name. No index is passed, so the default integer
# labels 0..4 are used. The data comes from a seeded generator so the values
# are reproducible on every run.
s = pd.Series(np.random.default_rng(1).standard_normal(5), name='something')
s

0   -0.408718
1   -0.623660
2   -0.773496
3   -0.029521
4   -1.096467
Name: something, dtype: float64

In [5]:
# The name given at construction is available as an attribute.
s.name

'something'

In [6]:
# rename() with a scalar returns a Series carrying the new name; the result
# is bound to s2 and its name is displayed.
s2 = s.rename('different')
s2.name

'different'

## DataFrame
 - 선택적으로 index(row labels)와 columns(column labels) 인수 지정가능
 - index만 활용하여 필터링 가능

### From dict of Series or dicts

In [7]:
# Each column is a Series with its own index. The resulting frame is indexed
# by every label that appears in any column ('a'..'d'); 'one' has no value
# for 'd', so that cell is NaN.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=['a', 'b', 'c']),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [8]:
# An explicit index selects and orders the rows: only 'd', 'b' and 'a' are
# kept, in that order.
pd.DataFrame(d, index = list('dba'))

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [9]:
# `columns` chooses which columns appear: 'one' is left out, and 'three',
# which has no data in the dict, becomes a column of missing values.
pd.DataFrame(d, index = list('dba'), columns = ['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [10]:
# Row labels of the frame.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
# Column labels of the frame.
df.columns

Index(['one', 'two'], dtype='object')

### From dict of ndarrays / list

In [13]:
# Plain lists of equal length serve as columns. No index is given, so the
# frame is displayed with the default integer labels 0..3.
d = dict(one=[1.0, 2.0, 3.0, 4.0], two=[4.0, 3.0, 2.0, 1.0])

pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [14]:
# Here `index` supplies labels for the four list positions in place of the
# default integers.
pd.DataFrame(d, index = list('abcd'))

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


### From structured or record array

In [15]:
# Zero-filled structured array with two records and three fields: a 4-byte
# int ('A'), a 4-byte float ('B') and a 10-byte string ('c').
# 'S10' is the canonical type code for a fixed-width byte string; the 'a'
# alias used before is deprecated in NumPy 2.0 and produces the same dtype.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('c', 'S10')])

In [16]:
# Freshly created: every field holds its zero value (0, 0., b'').
data

array([(0, 0., b''), (0, 0., b'')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('c', 'S10')])

In [17]:
# Fill both records in place, one tuple per record in field order (A, B, c).
# The str values end up as byte strings in the 'c' field, as the following
# display of `data` shows.
data[:] = [(1,2.,'hello'), (2,3.,'world')]

In [18]:
# Show the records after the assignment.
data

array([(1, 2., b'hello'), (2, 3., b'world')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('c', 'S10')])

In [19]:
# The field names of the structured array become the column labels.
pd.DataFrame(data)

Unnamed: 0,A,B,c
0,1,2.0,b'hello'
1,2,3.0,b'world'


### Columns selection, addition, deletion

In [48]:
# Rebuild the two-column frame from scratch so the column edits in this
# section start from a known state.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=['a', 'b', 'c']),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=['a', 'b', 'c', 'd']),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [49]:
# Selecting a column by name returns it as a Series named after the column.
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [50]:
# Derive two new columns from existing ones: an element-wise product and a
# boolean flag. Row 'd' has no 'one' value, so its product is NaN and its
# flag is False.
df['three'] = df['one'].mul(df['two'])
df['flag'] = df['one'].gt(2)
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [51]:
# Columns can be deleted like dict keys; this removes 'two' from df in place.
del df['two']

In [52]:
# Confirm that column 'two' is gone.
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [53]:
# pop() removes the column from df and hands it back as a Series.
three = df.pop('three')

In [54]:
# The popped column, still labelled by row and named 'three'.
three

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [55]:
# 'three' is no longer part of df after the pop.
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [56]:
# Assigning a scalar fills the new column with that value in every row.
df['foo'] = 'bar'
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [57]:
# Insert a Series that covers only the first two rows. It is matched to df
# by row label, so rows 'c' and 'd' have nothing to take and become NaN.
df['one_trunc'] = df['one'].iloc[:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [58]:
# insert() places a new column at a chosen position instead of appending it:
# here a copy of 'one' goes in at position 2 under the name 'bar'.
df.insert(loc=2, column='bar', value=df['one'])
df

Unnamed: 0,one,flag,bar,foo,one_trunc
a,1.0,False,1.0,bar,1.0
b,2.0,False,2.0,bar,2.0
c,3.0,True,3.0,bar,
d,,False,,bar,
