# 개요

1. numpy 
2. pandas
3. torch로 vector, matrix 등 만들어보기

## 참고자료
* Official NumPy reference: https://numpy.org/doc/stable/reference/
* Stanford Univ. CS231 tutorial: http://cs231n.github.io/python-numpy-tutorial/

## 1. numpy, array, matrix 등

In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides NumPy/pandas deprecation warnings raised by
# the cells below, so API removals only show up as hard errors.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np

### 리스트, 행렬 선언

In [6]:
# A list literal has the form: name = [item_1, item_2, ..., item_n].
# Items can be numbers, strings, or other lists of differing lengths.
a, b = [1, 2, 3], ["a", "b", "c"]
c = [
    ["a"],
    ["a", "b"],
    ["a", "c"],
    ["a", "b", "c"],
]

# Print each list on its own line.
print(a, b, c, sep="\n")

[1, 2, 3]
['a', 'b', 'c']
[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]


In [7]:
# Add items to a list with append(): each call adds exactly one element to
# the end, so appending a list nests that whole list as a single element.
c.append('new_item')
c.append(['new_item_list'])

print(c)

[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], 'new_item', ['new_item_list']]


In [9]:
# Convert Python lists to NumPy arrays.
a = np.array([1, 2, 3])
b = np.array(["a", "b", "c"])
# The sub-lists have different lengths, so they cannot form a regular 2-D
# array. dtype=object must be requested explicitly: NumPy >= 1.24 raises
# ValueError for ragged input otherwise (older versions only warned).
# The result is a 1-D array whose four elements are Python lists.
c = np.array([["a"], ["a", "b"], ["a", "c"], ["a", "b", "c"]], dtype=object)

print(a)
print(b)
print(c)

[1 2 3]
['a' 'b' 'c']
[list(['a']) list(['a', 'b']) list(['a', 'c']) list(['a', 'b', 'c'])]


In [10]:
# Create arrays (or matrices) filled with ones; the argument is the
# (rows, columns) shape.
print(np.ones((1, 5)))
print(np.ones((3, 2)))

[[1. 1. 1. 1. 1.]]
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [11]:
# Create arrays (or matrices) filled with zeros; the argument is the
# (rows, columns) shape.
print(np.zeros((1, 5)))
print(np.zeros((3, 2)))

[[0. 0. 0. 0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]


In [12]:
# Build a sequence that increases step by step within a given range.
# np.arange([start,] stop, [step,] dtype=None) -- stop is excluded.

print(np.arange(10))
# np.float was only an alias of the builtin float and was removed in
# NumPy 1.24; passing float produces the same float64 array.
print(np.arange(3, 7, dtype=float))
print(np.arange(3, 10, 2))

[0 1 2 3 4 5 6 7 8 9]
[3. 4. 5. 6.]
[3 5 7 9]


In [15]:
# Declare a matrix explicitly from nested lists.
mat1 = np.array([[1,2,3],[4,5,6]])

# Build matrices from random values.
mat2 = np.random.randint(low=1, high=10, size=(3,2)) # 3x2 random integers from 1 to 9 (high is exclusive)
mat3 = np.random.rand(3,2) # 3x2 drawn uniformly from [0, 1) -- not a normal distribution (np.random.randn samples the normal)

print(mat1)
print(mat2)
print(mat3)

[[1 2 3]
 [4 5 6]]
[[9 5]
 [3 3]
 [2 6]]
[[0.65621424 0.03783443]
 [0.72841212 0.54509803]
 [0.35756357 0.30584737]]


### 리스트 인덱싱(Indexing) & 슬라이싱(Slicing)

In [16]:
# Indexing: positions start at 0, and a negative index counts from the end
# (so a[5] and a[-1] are the same element here).
a = [1,3,5,7,9,11]
print(a[2], a[5], a[-1])

5 11 11


In [17]:
# Slicing: [start:stop] excludes stop; an omitted bound means "from the
# beginning" or "to the end".
b = [2,4,6,8,10]
print(b[2:])  # from index 2 to the end
print(b[:2])  # first two items
print(b[:])   # the whole list

[6, 8, 10]
[2, 4]
[2, 4, 6, 8, 10]


### numpy의 reshape (PyTorch의 view와 비교)

In [18]:
# The shape is given as (rows, columns): a 6x3 matrix of random values.
mat1 = np.random.rand(6,3)
print(mat1)

[[0.50760895 0.94128393 0.56133354]
 [0.46457844 0.84402729 0.59545319]
 [0.11062589 0.6834279  0.6105375 ]
 [0.70978288 0.91464269 0.97855119]
 [0.16630519 0.54375376 0.00149751]
 [0.26494625 0.42076476 0.32283679]]


In [19]:
# -1 means "infer this dimension": NumPy derives it from the total number of
# elements (18 here) and the other dimension given.
print(mat1.reshape(1, -1).shape)
print(mat1.reshape(1, -1))
print("=====")
print(mat1.reshape(-1, 1).shape)
print(mat1.reshape(-1, 1))

(1, 18)
[[0.50760895 0.94128393 0.56133354 0.46457844 0.84402729 0.59545319
  0.11062589 0.6834279  0.6105375  0.70978288 0.91464269 0.97855119
  0.16630519 0.54375376 0.00149751 0.26494625 0.42076476 0.32283679]]
=====
(18, 1)
[[0.50760895]
 [0.94128393]
 [0.56133354]
 [0.46457844]
 [0.84402729]
 [0.59545319]
 [0.11062589]
 [0.6834279 ]
 [0.6105375 ]
 [0.70978288]
 [0.91464269]
 [0.97855119]
 [0.16630519]
 [0.54375376]
 [0.00149751]
 [0.26494625]
 [0.42076476]
 [0.32283679]]


In [20]:
# Any shape whose dimensions multiply to 18 is valid.
# Note: the first line reports the shape of the 2x9 reshape, while the second
# prints the elements of the 9x2 reshape.
print(mat1.reshape(2, 9).shape)
print(mat1.reshape(9, 2))

(2, 9)
[[0.50760895 0.94128393]
 [0.56133354 0.46457844]
 [0.84402729 0.59545319]
 [0.11062589 0.6834279 ]
 [0.6105375  0.70978288]
 [0.91464269 0.97855119]
 [0.16630519 0.54375376]
 [0.00149751 0.26494625]
 [0.42076476 0.32283679]]


In [21]:
# Intentional failure: 2 * 5 = 10 does not match mat1's 18 elements, so
# reshape raises ValueError.
mat1.reshape(2,5)

ValueError: ignored

In [22]:
# Reshape into a 3-D array (tensor).
print(mat1.reshape(3,2,3).shape)
print(mat1.reshape(3,2,3)) # three 2x3 blocks

(3, 2, 3)
[[[0.50760895 0.94128393 0.56133354]
  [0.46457844 0.84402729 0.59545319]]

 [[0.11062589 0.6834279  0.6105375 ]
  [0.70978288 0.91464269 0.97855119]]

 [[0.16630519 0.54375376 0.00149751]
  [0.26494625 0.42076476 0.32283679]]]


### matrix 또는 tensor 형태에서도 slicing이 가능합니다!

In [26]:
# 24 consecutive integers arranged in rows of 4; -1 lets NumPy infer 6 rows.
mat2 = np.arange(24).reshape(-1, 4)
print(mat2)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]


In [27]:
mat2[:1,:3] # first index = rows, second index = columns

array([[0, 1, 2]])

In [28]:
# Row 3, columns 0 and 1. An integer row index drops that dimension, so the
# result is 1-D.
mat2[3, 0:2]

array([12, 13])

### Math Arithmetic Operations(사칙연산)

In [29]:
# Two 3x3 integer matrices: x holds the odd numbers 1..17 and y the even
# numbers 2..18, both filled row by row.
x = np.arange(1, 18, 2).reshape(3, 3)
y = x + 1

print(x, y, sep="\n")

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [30]:
# Element-wise addition
x + y

array([[ 3,  7, 11],
       [15, 19, 23],
       [27, 31, 35]])

In [31]:
# Element-wise subtraction
x - y

array([[-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])

In [32]:
# Element-wise multiplication (not a matrix product; see np.dot for that)
x * y

array([[  2,  12,  30],
       [ 56,  90, 132],
       [182, 240, 306]])

In [33]:
# Element-wise division; the result is floating point even for integer inputs
x / y

array([[0.5       , 0.75      , 0.83333333],
       [0.875     , 0.9       , 0.91666667],
       [0.92857143, 0.9375    , 0.94444444]])

In [34]:
# Element-wise square, written two equivalent ways
print(x ** 2)
print(np.power(x, 2))

[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]
[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]


In [35]:
# Matrix product of x and y (rows of x times columns of y)
np.dot(x, y)

array([[ 96, 114, 132],
       [240, 294, 348],
       [384, 474, 564]])

In [36]:
# Element-wise square root
np.sqrt(x)

array([[1.        , 1.73205081, 2.23606798],
       [2.64575131, 3.        , 3.31662479],
       [3.60555128, 3.87298335, 4.12310563]])

In [37]:
# More on matrix operations: broadcasting
z1 = np.array([[2,2,2]])
z2 = np.array([[2,2]])

# z1 has shape (1, 3), so it is broadcast across every row of the 3x3 x.
x * z1

array([[ 2,  6, 10],
       [14, 18, 22],
       [26, 30, 34]])

In [38]:
# Intentional failure: shapes (1, 2) and (3, 3) cannot be broadcast together,
# so the multiplication on the last line raises ValueError.
print(z2.shape)
print(x.shape)
print(x * z2)

(1, 2)
(3, 3)


ValueError: ignored

In [39]:
# Sum: axis=0 adds down each column, axis=1 adds across each row
print(x)
print(x.sum(axis = 0))
print(x.sum(axis = 1))

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[21 27 33]
[ 9 27 45]


In [40]:
# Mean: axis=0 averages each column, axis=1 averages each row
print(x)
print(x.mean(axis = 0))
print(x.mean(axis = 1))

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[ 7.  9. 11.]
[ 3.  9. 15.]


In [41]:
# Standard deviation per column (axis=0) and per row (axis=1).
# NumPy's default is the population form (ddof=0).
print(x)
print(x.std(axis = 0))
print(x.std(axis = 1))

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[4.89897949 4.89897949 4.89897949]
[1.63299316 1.63299316 1.63299316]


In [42]:
# .T transposes a matrix. z1.T is a (3, 1) column, so the matrix product of
# the 3x3 x with it has shape (3, 1).
print(x.T)
print(np.dot(x, z1.T))

[[ 1  7 13]
 [ 3  9 15]
 [ 5 11 17]]
[[18]
 [54]
 [90]]


### Other operations

In [51]:
# A 1-D array of 15 random values drawn uniformly from [0, 1)
xx = np.random.rand(15)
print(xx)

[0.72872144 0.56718499 0.33233493 0.1713851  0.44423526 0.24934422
 0.43851747 0.11118081 0.4168039  0.90190616 0.273644   0.80554543
 0.16555263 0.54773768 0.00986939]


In [53]:
# NOTE: xx.sort() below mutates xx, so re-running this cell prints an
# already-sorted array and an argsort of 0..n-1.
print(xx)
print(xx.argsort()) # indices that would sort xx; with the axis argument this also works on matrices
xx.sort() # sorts xx in place and returns None
print(xx)

[0.00986939 0.11118081 0.16555263 0.1713851  0.24934422 0.273644
 0.33233493 0.4168039  0.43851747 0.44423526 0.54773768 0.56718499
 0.72872144 0.80554543 0.90190616]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[0.00986939 0.11118081 0.16555263 0.1713851  0.24934422 0.273644
 0.33233493 0.4168039  0.43851747 0.44423526 0.54773768 0.56718499
 0.72872144 0.80554543 0.90190616]


## 2. Pandas

* https://pandas.pydata.org/pandas-docs/stable/

In [54]:
import pandas as pd

### Pandas Series 만들기

* pandas series는 1차원 데이터 집합

In [55]:
# A Series pairs each value with an index label: 'a'..'e' label 1..5.
pd_series = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
pd_series

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [56]:
# Basic properties of a pandas Series: number of dimensions, shape, element
# count, the underlying values, and the index labels.
print('차원:', pd_series.ndim)
print('형태: ', pd_series.shape)
print('총 원소의 수:', pd_series.size)

print('값:', pd_series.values)
print('인덱스:', pd_series.index)

차원: 1
형태:  (5,)
총 원소의 수: 5
값: [1 2 3 4 5]
인덱스: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


### index를 활용한 데이터 탐색

In [57]:
# loc accesses by index label; iloc accesses by integer position.
# A single key returns a scalar, a list of keys returns a Series.

print(pd_series.loc['a'])
print(pd_series.iloc[0])

print(pd_series.loc[['a','c']])
print(pd_series.iloc[[0,2]])

1
1
a    1
c    3
dtype: int64
a    1
c    3
dtype: int64


### 데이터 삭제

In [58]:
# drop() returns a new Series without 'b'; pd_series itself is unchanged.
pd_series.drop('b')

a    1
c    3
d    4
e    5
dtype: int64

In [59]:
# With inplace=True the label 'b' is removed from pd_series itself, as the
# before/after prints show.
print(pd_series)
pd_series.drop('b', inplace=True)
print(pd_series)

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
c    3
d    4
e    5
dtype: int64


### Pandas Dataframe 만들기

* 2차원 데이터 집합. 행렬과 비슷하게 row와 column을 갖고 있다

In [60]:
# Column data for a 15-row frame: A counts 0..14, B holds random integers in
# [0, 15), and C holds random floats in [0, 1).
data = dict(
    A=np.arange(15),
    B=np.random.randint(0, 15, size=15),
    C=np.random.rand(15),
)

# Each dict key becomes a column; rows get the default integer index.
data_df = pd.DataFrame(data=data)

In [62]:
# Display the DataFrame
data_df

Unnamed: 0,A,B,C
0,0,7,0.618879
1,1,4,0.128281
2,2,10,0.899273
3,3,9,0.109899
4,4,11,0.147529
5,5,13,0.118856
6,6,3,0.62369
7,7,0,0.638217
8,8,7,0.803902
9,9,11,0.281026


In [61]:
# First rows of the DataFrame (head() defaults to 5 rows)
data_df.head()

Unnamed: 0,A,B,C
0,0,7,0.618879
1,1,4,0.128281
2,2,10,0.899273
3,3,9,0.109899
4,4,11,0.147529


In [63]:
# Last rows of the DataFrame (tail() defaults to 5 rows)
data_df.tail()

Unnamed: 0,A,B,C
10,10,4,0.240241
11,11,6,0.736207
12,12,6,0.448411
13,13,1,0.652511
14,14,9,0.639383


In [64]:
# (number of rows, number of columns)
data_df.shape

(15, 3)

### Indexing and Slicing

In [65]:
# Slicing a DataFrame with [start:stop] selects rows; stop is excluded.
data_df[1:3]

Unnamed: 0,A,B,C
1,1,4,0.128281
2,2,10,0.899273


In [66]:
# Row with index label 1, returned as a Series; the mixed int/float columns
# are shown as float64.
data_df.loc[1]

A    1.000000
B    4.000000
C    0.128281
Name: 1, dtype: float64

In [67]:
# Single cell at row label 1, column 'C'. One .loc[row, column] call replaces
# the chained loc[1]['C'] lookup, which first builds an intermediate row
# Series; the value returned is the same.
data_df.loc[1, 'C']

0.12828136109562593

In [70]:
# Columns 'B' and 'C' of the row labelled 1, selected in a single
# .loc[row, columns] call instead of chaining two indexing operations.
data_df.loc[1, ['B','C']]

B    4.000000
C    0.128281
Name: 1, dtype: float64

In [68]:
# iloc selects by integer position; with the default index, position 1 is
# also the row labelled 1.
data_df.iloc[1]

A    1.000000
B    4.000000
C    0.128281
Name: 1, dtype: float64

### Add, Remove, etc.

In [71]:
# Add a boolean column D that is True wherever A >= 5.
data_df['D'] = data_df['A'] >= 5
data_df

Unnamed: 0,A,B,C,D
0,0,7,0.618879,False
1,1,4,0.128281,False
2,2,10,0.899273,False
3,3,9,0.109899,False
4,4,11,0.147529,False
5,5,13,0.118856,True
6,6,3,0.62369,True
7,7,0,0.638217,True
8,8,7,0.803902,True
9,9,11,0.281026,True


In [72]:
# Remove column D from data_df itself (axis=1 targets columns).
data_df.drop('D', axis=1, inplace=True)

In [74]:
# Confirm that column D is gone.
data_df.head()

Unnamed: 0,A,B,C
0,0,7,0.618879
1,1,4,0.128281
2,2,10,0.899273
3,3,9,0.109899
4,4,11,0.147529


In [75]:
data_df.sort_index(axis=0, ascending=False) # ascending=True: ascending order, False: descending order

Unnamed: 0,A,B,C
14,14,9,0.639383
13,13,1,0.652511
12,12,6,0.448411
11,11,6,0.736207
10,10,4,0.240241
9,9,11,0.281026
8,8,7,0.803902
7,7,0,0.638217
6,6,3,0.62369
5,5,13,0.118856
