### 데이터 분석 기초

#### ✔ 데이터 분석이란?

- 데이터를 수집하고 처리함으로써 유의미한 가치를 창출하는 과정.


---

#### ✔ 라이브러리 : pandas

> $ pip install pandas

### Pandas 기초



In [1]:
import pandas as pd
import numpy as np

#### 자료구조

**1. 시리즈(Series) : 인덱스와 값이 쌍을 이루는 1차원 자료구조**

In [2]:
# Build a Series from a plain list. No index is given, so the output below
# shows a default integer index 0..4 next to the values.
s = pd.Series([1, 3, 5, 7, 9])
s

0    1
1    3
2    5
3    7
4    9
dtype: int64

**2. 데이터 프레임**

In [3]:
# Six consecutive dates starting at 2021-01-31 (the index output further down
# reports freq='D'). The result is a DatetimeIndex, as the printed type shows.
dates = pd.date_range('20210131', periods=6)
print(type(dates))

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [4]:
# Build a 6-row x 3-column DataFrame of random samples from np.random.randn,
# using `dates` (a DatetimeIndex, not a Series) as the row index and
# 'A', 'B', 'C' as the column names.
# NOTE: no seed is set in the cells shown, so the values differ between runs.
df = pd.DataFrame(np.random.randn(6, 3), index = dates, columns = ['A','B','C'])
# head() with no argument displays only the first 5 rows.
df.head()

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,-0.004166,0.121869,0.280267
2021-02-03,-1.010709,1.097691,0.052087
2021-02-04,0.46646,0.765866,2.044951


In [5]:
# head(n) takes the number of rows to show; 6 covers the whole frame here.
df.head(6)

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,-0.004166,0.121869,0.280267
2021-02-03,-1.010709,1.097691,0.052087
2021-02-04,0.46646,0.765866,2.044951
2021-02-05,-0.420129,0.565969,-1.646932


In [6]:
# Row labels: the DatetimeIndex that was passed as `index` when df was built.
df.index

DatetimeIndex(['2021-01-31', '2021-02-01', '2021-02-02', '2021-02-03',
               '2021-02-04', '2021-02-05'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [8]:
# Structural summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2021-01-31 to 2021-02-05
Freq: D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
dtypes: float64(3)
memory usage: 192.0 bytes


In [9]:
# Per-column summary statistics: count, mean, std, min, quartiles, and max.
df.describe()

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,0.044938,0.761209,0.372783
std,0.688477,0.629108,1.268015
min,-1.010709,0.121869,-1.646932
25%,-0.316139,0.29346,0.072928
50%,0.149371,0.665918,0.20786
75%,0.425572,1.014735,1.098223
max,0.935266,1.813232,2.044951


In [10]:
# `ascending` selects the sort direction: True sorts in ascending order,
# False (used here) sorts in descending order of column 'B'.
df.sort_values(by = 'B', ascending = False)

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-03,-1.010709,1.097691,0.052087
2021-02-04,0.46646,0.765866,2.044951
2021-02-05,-0.420129,0.565969,-1.646932
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,-0.004166,0.121869,0.280267


In [11]:
# Selecting a single column by name returns that column as a Series.
df['A']

2021-01-31    0.302908
2021-02-01    0.935266
2021-02-02   -0.004166
2021-02-03   -1.010709
2021-02-04    0.466460
2021-02-05   -0.420129
Freq: D, Name: A, dtype: float64

In [12]:
# A DataFrame can be sliced as well: an integer slice inside [] selects rows
# by position, and the stop position is excluded (rows 0, 1 and 2 here).
df[0:3]

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,-0.004166,0.121869,0.280267


In [13]:
# Slicing with date strings selects rows by index label; unlike positional
# slicing, both endpoints are included in the result.
df['2021-01-31' : '2021-02-01']

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875


In [14]:
# loc selects by label: the row whose index label is dates[0], returned as a
# Series keyed by column name.
df.loc[dates[0]]

A    0.302908
B    1.813232
C    0.135453
Name: 2021-01-31 00:00:00, dtype: float64

In [15]:
# loc with a label slice for the rows (endpoints inclusive) and a list of
# column names for the columns.
df.loc['2021-01-31':'2021-02-01',['A', 'B']]

Unnamed: 0,A,B
2021-01-31,0.302908,1.813232
2021-02-01,0.935266,0.202624


In [16]:
# A single row label combined with a single column label yields a scalar.
df.loc[dates[0], 'A']

0.30290822216381563

In [17]:
# iloc selects by integer position: position 3 is the fourth row, returned
# as a Series.
df.iloc[3]

A   -1.010709
B    1.097691
C    0.052087
Name: 2021-02-03 00:00:00, dtype: float64

In [18]:
# Positional slices on both axes; stop positions are excluded, so this picks
# rows 1-3 and columns 1-2 (B and C).
df.iloc[1:4, 1:3]

Unnamed: 0,B,C
2021-02-01,0.202624,1.370875
2021-02-02,0.121869,0.280267
2021-02-03,1.097691,0.052087


In [19]:
# Lists of positions pick arbitrary, non-contiguous rows and columns.
df.iloc[[1,2,4], [0,1]]

Unnamed: 0,A,B
2021-02-01,0.935266,0.202624
2021-02-02,-0.004166,0.121869
2021-02-04,0.46646,0.765866


In [20]:
# Rows at positions 1-2 with every column (`:` selects the whole axis).
df.iloc[1:3, :]

Unnamed: 0,A,B,C
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,-0.004166,0.121869,0.280267


In [21]:
# Passing the column position as a list ([0]) keeps the result a one-column
# DataFrame instead of collapsing it to a Series.
df.iloc[:,[0]]

Unnamed: 0,A
2021-01-31,0.302908
2021-02-01,0.935266
2021-02-02,-0.004166
2021-02-03,-1.010709
2021-02-04,0.46646
2021-02-05,-0.420129


In [22]:
# Boolean indexing with a Series mask: keep only the rows where column A is
# positive.
df[df.A > 0]

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875
2021-02-04,0.46646,0.765866,2.044951


In [23]:
# Boolean indexing with a DataFrame mask: the shape is kept, and every
# element that fails the condition is shown as NaN (blank in the output).
df[df > 0] 

Unnamed: 0,A,B,C
2021-01-31,0.302908,1.813232,0.135453
2021-02-01,0.935266,0.202624,1.370875
2021-02-02,,0.121869,0.280267
2021-02-03,,1.097691,0.052087
2021-02-04,0.46646,0.765866,2.044951
2021-02-05,,0.565969,


In [24]:
# Mask non-positive values to NaN, then sort by column A in descending order.
# Boolean indexing already returns a new DataFrame, so `df` stays untouched
# and the earlier `df.copy()` (immediately overwritten) was a dead assignment.
# Rows whose A value is NaN are placed last by sort_values.
df2 = df[df > 0].sort_values(by="A", ascending=False)
df2

Unnamed: 0,A,B,C
2021-02-01,0.935266,0.202624,1.370875
2021-02-04,0.46646,0.765866,2.044951
2021-01-31,0.302908,1.813232,0.135453
2021-02-02,,0.121869,0.280267
2021-02-03,,1.097691,0.052087
2021-02-05,,0.565969,


In [25]:
# Add a new column by assigning a list with one value per row.
# The ranks are hard-coded: they match column A only because df2 was sorted
# by A in descending order in the previous cell, and the list length must
# equal the number of rows (6). Rows with NaN in A simply receive 4, 5, 6.
df2['A-Rank'] = [1, 2, 3, 4, 5, 6]
df2

Unnamed: 0,A,B,C,A-Rank
2021-02-01,0.935266,0.202624,1.370875,1
2021-02-04,0.46646,0.765866,2.044951,2
2021-01-31,0.302908,1.813232,0.135453,3
2021-02-02,,0.121869,0.280267,4
2021-02-03,,1.097691,0.052087,5
2021-02-05,,0.565969,,6


In [26]:
# Element-wise membership test: returns a boolean Series that is True where
# A-Rank is 1, 2 or 3.
df2['A-Rank'].isin([1, 2, 3])

2021-02-01     True
2021-02-04     True
2021-01-31     True
2021-02-02    False
2021-02-03    False
2021-02-05    False
Name: A-Rank, dtype: bool

In [27]:
# apply() runs the function passed as its argument over the DataFrame;
# here np.cumsum produces a running total down each column.
df2.apply(np.cumsum)

Unnamed: 0,A,B,C,A-Rank
2021-02-01,0.935266,0.202624,1.370875,1
2021-02-04,1.401726,0.96849,3.415826,3
2021-01-31,1.704635,2.781722,3.551279,6
2021-02-02,,2.903592,3.831547,10
2021-02-03,,4.001283,3.883633,15
2021-02-05,,4.567252,,21


In [28]:
# apply() returned a new frame; df2 itself still holds the original values.
df2

Unnamed: 0,A,B,C,A-Rank
2021-02-01,0.935266,0.202624,1.370875,1
2021-02-04,0.46646,0.765866,2.044951,2
2021-01-31,0.302908,1.813232,0.135453,3
2021-02-02,,0.121869,0.280267,4
2021-02-03,,1.097691,0.052087,5
2021-02-05,,0.565969,,6


In [29]:
# Custom function per column: the value range (max - min). Each column is
# reduced to a single number, so the result is a Series indexed by column
# name. NaN entries are ignored by max()/min() (see column A in the output).
df2.apply(lambda x: x.max() - x.min())

A         0.632358
B         1.691363
C         1.992864
A-Rank    5.000000
dtype: float64