# Pandas
파이썬 데이터 분석의 3대 라이브러리: 
* Numpy : 배열, 행렬을 다룸
* Pandas : 엑셀과 같은 데이터프레임을 다룸
* Matplotlib : 시각화

In [None]:
# pandas 사용하기
import numpy as np
import pandas as pd

In [None]:
# 2. Pandas 자료구조
# Pandas 에서는 기본적으로 정의되는 자료구조인 Series와 Data Frame을 사용한다.
# 이 자료구조들은 빅 데이터 분석에 있어서 높은 수준의 성능을 보여준다.

In [None]:
# 2-1. Series

In [85]:
# Series 정의하기
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [86]:
# Series의 값만 확인하기
obj.values

array([ 4,  7, -5,  3])

In [87]:
# Series의 인덱스만 확인하기
obj.index

RangeIndex(start=0, stop=4, step=1)

In [88]:
# Series의 자료형 확인하기
obj.dtypes

dtype('int64')

In [89]:
# 인덱스를 바꿀 수 있다
obj2 = pd.Series([4,7,-5,3], index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [90]:
# A plain Python dict can be converted directly into a Series:
# every dict key becomes an index label and its value the matching entry.
sdata = dict(kim=35000, park=67000, john=12000, choi=4000)
obj3 = pd.Series(data=sdata)
obj3

kim     35000
park    67000
john    12000
choi     4000
dtype: int64

In [91]:
# index 변경
obj3.index = ['A', 'B', 'C', 'D']
obj3

A    35000
B    67000
C    12000
D     4000
dtype: int64

In [None]:
# 2-2. Data Frame

In [93]:
# Defining a DataFrame.
# The data that goes into the frame is prepared first; it can be given as a
# Python dict (column name -> list of values) or as a NumPy array.

data = {
    'name': ['lee'] * 3 + ['kim', 'park'],
    'year': [2013, 2014, 2015, 2016, 2015],
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# With no explicit index the rows are labelled 0..4.
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,year,points
0,lee,2013,1.5
1,lee,2014,1.7
2,lee,2015,3.6
3,kim,2016,2.4
4,park,2015,2.9


In [None]:
# 행과 열의 구조를 가진 데이터가 생긴다.

In [95]:
# 행 번호
df.index

RangeIndex(start=0, stop=5, step=1)

In [96]:
# 열 이름
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [97]:
df.head(3)

Unnamed: 0,name,year,points
0,lee,2013,1.5
1,lee,2014,1.7
2,lee,2015,3.6


In [98]:
df.tail(2)

Unnamed: 0,name,year,points
3,kim,2016,2.4
4,park,2015,2.9


In [99]:
# 값 얻기
df.values

array([['lee', 2013, 1.5],
       ['lee', 2014, 1.7],
       ['lee', 2015, 3.6],
       ['kim', 2016, 2.4],
       ['park', 2015, 2.9]], dtype=object)

In [100]:
data

{'name': ['lee', 'lee', 'lee', 'kim', 'park'],
 'points': [1.5, 1.7, 3.6, 2.4, 2.9],
 'year': [2013, 2014, 2015, 2016, 2015]}

In [101]:
# DataFrame을 만들면서 columns와 index를 설정할 수 있다.

df2 = pd.DataFrame(data, columns=['year', 'name', 'points', 'penalty'], index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,name,points,penalty
one,2013,lee,1.5,
two,2014,lee,1.7,
three,2015,lee,3.6,
four,2016,kim,2.4,
five,2015,park,2.9,


In [None]:
# DataFrame을 정의하면서, data로 들어가는 python dictionary와 columns의
# 순서가 달라도 알아서 맞춰서 정의된다.
# 하지만 data에 포함되어 있지 않은 값은
# NaN(Not a Number)으로 나타나게 된다.
# 이는 null과 같은 개념(결측치)이다.
# NaN 값은 그대로 두면 연산과 분석 결과를 왜곡할 수 있는 데이터이다.
# 따라서 올바른 데이터 처리를 위해 값을 채워 넣거나(fillna) 해당 행/열을 제거(dropna)해야 한다.

In [103]:
# describe() 함수는 DataFrame의 계산 가능한 값들에 대한 요약통계 값을 보여준다.
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2014.6,2.42
std,1.140175,0.864292
min,2013.0,1.5
25%,2014.0,1.7
50%,2015.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


In [104]:
# info() 함수는 DataFrame의 요약정보를 보여준다.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, one to five
Data columns (total 4 columns):
year       5 non-null int64
name       5 non-null object
points     5 non-null float64
penalty    0 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


In [None]:
# 3. DataFrame Indexing

In [106]:
# Sample records for the indexing examples. 'penalty' is requested as a
# column but is not a key of the dict, so that column starts out all-NaN.
data = {
    'names': ['lee'] * 3 + ['park'] * 2,
    'year': [2014, 2015, 2016, 2015, 2016],
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}

df3 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df3

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [None]:
# 3-1. DataFrame 에서 열을 선택하고 조작하기

In [108]:
df3['names']

one       lee
two       lee
three     lee
four     park
five     park
Name: names, dtype: object

In [109]:
# 동일한 의미를 갖는, 다른 방법
df3.names

one       lee
two       lee
three     lee
four     park
five     park
Name: names, dtype: object

In [110]:
df3[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [111]:
# 특정 열에 대해 위와 같이 선택하고, 우리가 원하는 값을 대입할 수 있다.
df3.penalty = 0.5
df3

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,0.5
two,2015,lee,1.7,0.5
three,2016,lee,3.6,0.5
four,2015,park,2.4,0.5
five,2016,park,2.9,0.5


In [112]:
# 또는
# python의 List나 numpy의 array
df3.penalty = [0.2, 0.3, 0.5, 0.8, 0.1]
df3

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,0.2
two,2015,lee,1.7,0.3
three,2016,lee,3.6,0.5
four,2015,park,2.4,0.8
five,2016,park,2.9,0.1


In [113]:
# 새로운 열을 추가하기
df3['zeros'] = np.arange(5)
df3

Unnamed: 0,year,names,points,penalty,zeros
one,2014,lee,1.5,0.2,0
two,2015,lee,1.7,0.3,1
three,2016,lee,3.6,0.5,2
four,2015,park,2.4,0.8,3
five,2016,park,2.9,0.1,4


In [None]:
# Series를 추가할 수도 있다.
val = pd.Series([-1.2, -1.5, -1.7],
               index=['two', 'four', 'five'])

In [115]:
df3['depts'] = val
df3

Unnamed: 0,year,names,points,penalty,zeros,depts
one,2014,lee,1.5,0.2,0,
two,2015,lee,1.7,0.3,1,-1.2
three,2016,lee,3.6,0.5,2,
four,2015,park,2.4,0.8,3,-1.5
five,2016,park,2.9,0.1,4,-1.7


In [None]:
# 하지만 Series로 넣을 때는 val와 같이 넣으려는 data의 index에 맞춰서 
# 데이터가 들어간다.
# 이점이 python list나 numpy array로 데이터를 넣을 때와 가장 큰 차이점이다.

In [117]:
df3['net_points'] = df3['points'] - df3['penalty']
df3

Unnamed: 0,year,names,points,penalty,zeros,depts,net_points
one,2014,lee,1.5,0.2,0,,1.3
two,2015,lee,1.7,0.3,1,-1.2,1.4
three,2016,lee,3.6,0.5,2,,3.1
four,2015,park,2.4,0.8,3,-1.5,1.6
five,2016,park,2.9,0.1,4,-1.7,2.8


In [118]:
df3['high_points'] = df3['net_points'] > 2
df3

Unnamed: 0,year,names,points,penalty,zeros,depts,net_points,high_points
one,2014,lee,1.5,0.2,0,,1.3,False
two,2015,lee,1.7,0.3,1,-1.2,1.4,False
three,2016,lee,3.6,0.5,2,,3.1,True
four,2015,park,2.4,0.8,3,-1.5,1.6,False
five,2016,park,2.9,0.1,4,-1.7,2.8,True


In [119]:
# 열 삭제하기
del df3['high_points']
df3

Unnamed: 0,year,names,points,penalty,zeros,depts,net_points
one,2014,lee,1.5,0.2,0,,1.3
two,2015,lee,1.7,0.3,1,-1.2,1.4
three,2016,lee,3.6,0.5,2,,3.1
four,2015,park,2.4,0.8,3,-1.5,1.6
five,2016,park,2.9,0.1,4,-1.7,2.8


In [120]:
del df3['net_points']
df3

Unnamed: 0,year,names,points,penalty,zeros,depts
one,2014,lee,1.5,0.2,0,
two,2015,lee,1.7,0.3,1,-1.2
three,2016,lee,3.6,0.5,2,
four,2015,park,2.4,0.8,3,-1.5
five,2016,park,2.9,0.1,4,-1.7


In [None]:
# 3-2. DataFrame에서 행을 선택하고 조작하기
# Pandas에서는 DataFrame에서 행을 인덱싱하는 방법이 무수히 많다.
# 물론 위에서 소개했던 열을 선택하는 방법도 수많은 방법 중에 하나에 불과하다.

In [122]:
# 0번째 부터 2(3-1)번째 까지 가져온다.
# 뒤에 써준 숫자번째의 행은 뺀다.
df3[0:3]

Unnamed: 0,year,names,points,penalty,zeros,depts
one,2014,lee,1.5,0.2,0,
two,2015,lee,1.7,0.3,1,-1.2
three,2016,lee,3.6,0.5,2,


In [123]:
# 아래 방법을 권장한다.
# .loc 또는 .iloc 함수를 사용하는 방법
df3

Unnamed: 0,year,names,points,penalty,zeros,depts
one,2014,lee,1.5,0.2,0,
two,2015,lee,1.7,0.3,1,-1.2
three,2016,lee,3.6,0.5,2,
four,2015,park,2.4,0.8,3,-1.5
five,2016,park,2.9,0.1,4,-1.7


In [124]:
df3.loc[:,'year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [125]:
df3.loc['one','year']

2014

In [126]:
df3.loc['two']

year       2015
names       lee
points      1.7
penalty     0.3
zeros         1
depts      -1.2
Name: two, dtype: object

In [127]:
df3.loc['two':'four']

Unnamed: 0,year,names,points,penalty,zeros,depts
two,2015,lee,1.7,0.3,1,-1.2
three,2016,lee,3.6,0.5,2,
four,2015,park,2.4,0.8,3,-1.5


In [128]:
# How to duplicate a DataFrame: first build the frame that will be copied.

data = {
    'names': ['lee', 'lee', 'lee', 'park', 'park'],
    'year': [2014, 2015, 2016, 2015, 2016],
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# Row labels and column order are given explicitly; 'penalty' has no source
# data in the dict and therefore comes out as NaN.
df = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [129]:
# DataFrame의 copy() 메서드를 이용해서 복제하면 각각의 DataFrame이 독립적이다.
df_copy = df.copy() 
df_copy

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [130]:
del df_copy['penalty']
print(df)
print(df_copy)

       year names  points penalty
one    2014   lee     1.5     NaN
two    2015   lee     1.7     NaN
three  2016   lee     3.6     NaN
four   2015  park     2.4     NaN
five   2016  park     2.9     NaN
       year names  points
one    2014   lee     1.5
two    2015   lee     1.7
three  2016   lee     3.6
four   2015  park     2.4
five   2016  park     2.9


In [131]:
# '=' 로 대입하면 복제되는 것이 아니라 같은 객체를 가리키는 이름이 하나 더 생길 뿐이므로, 한쪽을 변경하면 다른 쪽도 같이 변경된다.
df_copy = df
df_copy

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [132]:
del df_copy['penalty']
print(df)
print(df_copy)

       year names  points
one    2014   lee     1.5
two    2015   lee     1.7
three  2016   lee     3.6
four   2015  park     2.4
five   2016  park     2.9
       year names  points
one    2014   lee     1.5
two    2015   lee     1.7
three  2016   lee     3.6
four   2015  park     2.4
five   2016  park     2.9


In [133]:
data = {'names':['lee', 'lee', 'lee', 'park', 'park'],
       'year': [2014, 2015, 2016, 2015, 2016],
       'points':[1.5, 1.7, 3.6, 2.4, 2.9]}

df = pd.DataFrame(data, 
                   columns=['year', 'names', 'points', 'penalty'], 
                   index=['one', 'two', 'three', 'four', 'five'])
df

Unnamed: 0,year,names,points,penalty
one,2014,lee,1.5,
two,2015,lee,1.7,
three,2016,lee,3.6,
four,2015,park,2.4,
five,2016,park,2.9,


In [134]:
# 새로운 행 삽입하기
df.loc['six',:] = [2013, 'june', 4.0, 0.1]
df

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1


In [135]:
# .iloc 사용 :: index 번호를 사용한다.
df.iloc[3]

year       2015
names      park
points      2.4
penalty     NaN
Name: four, dtype: object

In [136]:
df.iloc[3:5, 0:2]

Unnamed: 0,year,names
four,2015.0,park
five,2016.0,park


In [137]:
df.iloc[[1,3,4],[1,2]]

Unnamed: 0,names,points
two,lee,1.7
four,park,2.4
five,park,2.9


In [138]:
df.iloc[:,1:4]

Unnamed: 0,names,points,penalty
one,lee,1.5,
two,lee,1.7,
three,lee,3.6,
four,park,2.4,
five,park,2.9,
six,june,4.0,0.1


In [139]:
df.loc['seven',:] = [2018, 'may', 5.0, 0.8]
df

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1
seven,2018.0,may,5.0,0.8


In [140]:
df.drop('seven')

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1


In [141]:
df.iloc[5,1]

'june'

In [None]:
# 4.DataFrame에서의 boolean Indexing

In [143]:
df

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1
seven,2018.0,may,5.0,0.8


In [144]:
# year가 2014보다 큰 boolean data
df['year']>2014

one      False
two       True
three     True
four      True
five      True
six      False
seven     True
Name: year, dtype: bool

In [145]:
# year가 2014보다 큰 모든 행의 값

df.loc[df['year']>2014,:]   # df.loc[df['year']>2014] 와 동일

Unnamed: 0,year,names,points,penalty
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
seven,2018.0,may,5.0,0.8


In [146]:
df[df['year']>2014]

Unnamed: 0,year,names,points,penalty
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
seven,2018.0,may,5.0,0.8


In [147]:
df

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1
seven,2018.0,may,5.0,0.8


In [148]:
df.loc[df['names'] == 'lee',['points','penalty']]

Unnamed: 0,points,penalty
one,1.5,
two,1.7,
three,3.6,


In [149]:
df

Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
two,2015.0,lee,1.7,
three,2016.0,lee,3.6,
four,2015.0,park,2.4,
five,2016.0,park,2.9,
six,2013.0,june,4.0,0.1
seven,2018.0,may,5.0,0.8


In [150]:
# As with NumPy, boolean conditions can be combined with logical operators.
# Select the rows where points < 1.7 or points > 2.9.

# Each comparison is wrapped in parentheses because `|` binds more tightly
# than the comparison operators `<` and `>`.

# Combine the masks with `|` (element-wise OR) instead of `+`: adding two
# boolean Series selects the same rows but makes pandas emit a warning that
# `+` is not supported for bool data and that `|` should be used instead.
df.loc[(df['points'] < 1.7) | (df['points'] > 2.9), :]


Unnamed: 0,year,names,points,penalty
one,2014.0,lee,1.5,
three,2016.0,lee,3.6,
six,2013.0,june,4.0,0.1
seven,2018.0,may,5.0,0.8


In [None]:
# 5.Data

In [162]:
# DataFrame을 만들 때 index, column을 설정하지 않으면 
# 기본 값으로 0부터 시작하는 정수형 숫자로 입력된다.
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,0.87329,1.07292,1.845569,0.389144
1,0.086825,-1.396704,-0.94127,0.885149
2,2.104669,0.870225,0.911261,-0.02258
3,-1.86536,0.043917,-1.393309,1.866914
4,-0.443593,-2.102058,0.588712,-0.360892
5,0.176431,0.899418,-1.204459,0.662099


In [163]:
# pandas에서 제공하는 date_range 함수는 datetime 자료형으로 구성된,
# 날짜, 시각 등을 알 수 있는 자료형을 만드는 함수
df.columns = ['A', 'B', 'C', 'D']
df.index = pd.date_range('20190911', periods=6)
df

Unnamed: 0,A,B,C,D
2019-09-11,0.87329,1.07292,1.845569,0.389144
2019-09-12,0.086825,-1.396704,-0.94127,0.885149
2019-09-13,2.104669,0.870225,0.911261,-0.02258
2019-09-14,-1.86536,0.043917,-1.393309,1.866914
2019-09-15,-0.443593,-2.102058,0.588712,-0.360892
2019-09-16,0.176431,0.899418,-1.204459,0.662099


In [164]:
# np.nan은 NaN값을 의미한다.
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2019-09-11,0.87329,1.07292,1.845569,0.389144,1.0
2019-09-12,0.086825,-1.396704,-0.94127,0.885149,
2019-09-13,2.104669,0.870225,0.911261,-0.02258,3.5
2019-09-14,-1.86536,0.043917,-1.393309,1.866914,6.1
2019-09-15,-0.443593,-2.102058,0.588712,-0.360892,
2019-09-16,0.176431,0.899418,-1.204459,0.662099,7.0


In [None]:
# NaN 없애기

In [165]:
# 행의 값 중 하나라도 nan인 경우 그 행을 없앤다.
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2019-09-11,0.87329,1.07292,1.845569,0.389144,1.0
2019-09-13,2.104669,0.870225,0.911261,-0.02258,3.5
2019-09-14,-1.86536,0.043917,-1.393309,1.866914,6.1
2019-09-16,0.176431,0.899418,-1.204459,0.662099,7.0


In [166]:
df

Unnamed: 0,A,B,C,D,F
2019-09-11,0.87329,1.07292,1.845569,0.389144,1.0
2019-09-12,0.086825,-1.396704,-0.94127,0.885149,
2019-09-13,2.104669,0.870225,0.911261,-0.02258,3.5
2019-09-14,-1.86536,0.043917,-1.393309,1.866914,6.1
2019-09-15,-0.443593,-2.102058,0.588712,-0.360892,
2019-09-16,0.176431,0.899418,-1.204459,0.662099,7.0


In [167]:
# 행의 값이 모두 nan인 경우 그행을 없앤다.
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2019-09-11,0.87329,1.07292,1.845569,0.389144,1.0
2019-09-12,0.086825,-1.396704,-0.94127,0.885149,
2019-09-13,2.104669,0.870225,0.911261,-0.02258,3.5
2019-09-14,-1.86536,0.043917,-1.393309,1.866914,6.1
2019-09-15,-0.443593,-2.102058,0.588712,-0.360892,
2019-09-16,0.176431,0.899418,-1.204459,0.662099,7.0


In [None]:
# 주의: drop, dropna 함수는 특정 행 또는 열을 제거하고 난 새로운 DataFrame을 반환한다.
# 즉, 반환을 받지 않으면 기존의 DataFrame은 그대로이다.
# 아니면, inplace = True 라는 인자를 추가하여, 반환을 받지 않고서도
# 기존의 DataFrame이 변경되도록 한다.

In [171]:
# nan 값에 값 넣기
df.fillna(value=0.5, inplace=True)
df

Unnamed: 0,A,B,C,D,F
2019-09-11,0.87329,1.07292,1.845569,0.389144,1.0
2019-09-12,0.086825,-1.396704,-0.94127,0.885149,0.5
2019-09-13,2.104669,0.870225,0.911261,-0.02258,3.5
2019-09-14,-1.86536,0.043917,-1.393309,1.866914,6.1
2019-09-15,-0.443593,-2.102058,0.588712,-0.360892,0.5
2019-09-16,0.176431,0.899418,-1.204459,0.662099,7.0


In [174]:
# nan 값인지 확인하기
df.isnull()

Unnamed: 0,A,B,C,D,F
2019-09-11,False,False,False,False,False
2019-09-12,False,False,False,False,False
2019-09-13,False,False,False,False,False
2019-09-14,False,False,False,False,False
2019-09-15,False,False,False,False,False
2019-09-16,False,False,False,False,False


In [177]:
# Rebuild the 6x4 random frame (columns A-D, six consecutive days starting
# 2019-09-11) in a single constructor call, then add column F containing
# two NaN entries for the missing-value examples.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range('20190911', periods=6),
    columns=['A', 'B', 'C', 'D'],
)
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2019-09-11,0.041398,2.337962,-0.146295,0.578089,1.0
2019-09-12,1.515453,-0.332878,1.190386,-0.598531,
2019-09-13,0.254239,-0.090602,-1.146521,1.090934,3.5
2019-09-14,-0.355535,1.481052,-0.089459,-0.99371,6.1
2019-09-15,-0.337676,-0.37944,0.702928,-0.573629,
2019-09-16,0.02592,-0.881973,-0.447439,0.769509,7.0


In [178]:
# F열에서 nan값을 포함하는 행만 추출하기
df.loc[df.isnull()['F'],:]

Unnamed: 0,A,B,C,D,F
2019-09-12,1.515453,-0.332878,1.190386,-0.598531,
2019-09-15,-0.337676,-0.37944,0.702928,-0.573629,


In [180]:
pd.to_datetime('20190911')

Timestamp('2019-09-11 00:00:00')

In [181]:
# 특정 행 drop 하기
df.drop(pd.to_datetime('20190911'))

Unnamed: 0,A,B,C,D,F
2019-09-12,1.515453,-0.332878,1.190386,-0.598531,
2019-09-13,0.254239,-0.090602,-1.146521,1.090934,3.5
2019-09-14,-0.355535,1.481052,-0.089459,-0.99371,6.1
2019-09-15,-0.337676,-0.37944,0.702928,-0.573629,
2019-09-16,0.02592,-0.881973,-0.447439,0.769509,7.0


In [185]:
# 2개 이상도 가능
df.drop([pd.to_datetime('20190912'), pd.to_datetime('20190914')])

Unnamed: 0,A,B,C,D,F
2019-09-11,0.041398,2.337962,-0.146295,0.578089,1.0
2019-09-13,0.254239,-0.090602,-1.146521,1.090934,3.5
2019-09-15,-0.337676,-0.37944,0.702928,-0.573629,
2019-09-16,0.02592,-0.881973,-0.447439,0.769509,7.0


In [186]:
# 특정 열 삭제하기
df.drop('F', axis=1)

Unnamed: 0,A,B,C,D
2019-09-11,0.041398,2.337962,-0.146295,0.578089
2019-09-12,1.515453,-0.332878,1.190386,-0.598531
2019-09-13,0.254239,-0.090602,-1.146521,1.090934
2019-09-14,-0.355535,1.481052,-0.089459,-0.99371
2019-09-15,-0.337676,-0.37944,0.702928,-0.573629
2019-09-16,0.02592,-0.881973,-0.447439,0.769509


In [187]:
# 2개 이상의 열도 가능
df.drop(['A', 'C'], axis=1)

Unnamed: 0,B,D,F
2019-09-11,2.337962,0.578089,1.0
2019-09-12,-0.332878,-0.598531,
2019-09-13,-0.090602,1.090934,3.5
2019-09-14,1.481052,-0.99371,6.1
2019-09-15,-0.37944,-0.573629,
2019-09-16,-0.881973,0.769509,7.0


In [None]:
# 6.Data 분석용 함수들

In [189]:
# Small 4x2 frame containing missing values, used to show how the
# aggregation functions treat NaN.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]

df = pd.DataFrame(data=data, index=list('abcd'), columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [193]:
# 행 방향으로의 합(즉, 각 열의 합)
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [194]:
# 열방향으로의 합(즉, 각 행의 합)
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [None]:
# 이 때, 위에서 볼 수 있듯이 NaN값은 배제하고 계산한다.
# NaN 값을 배제하지 않고 계산하려면 아래와 같이 skipna=False 를 지정해 준다.

In [None]:
# axis=0 은 행 방향(위에서 아래)으로 연산하여 열마다 결과를 내고, axis=1 은 열 방향(왼쪽에서 오른쪽)으로 연산하여 행마다 결과를 낸다.

In [195]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [196]:
# 특정 행 또는 특정 열에서만 계산하기
df['one'].sum()

9.25

In [201]:
df.loc['b',:].sum()

2.5999999999999996

In [None]:
# 열을 찾을 때는 인덱싱 하는 것처럼 하면 되지만, 행을 찾을 때는 .loc 메소드를 사용한다.
# 데이터프레임에서 열은 라벨이라는 같은 속성으로 값들을 가지지만, 행은 인덱스만 같을 뿐 다른 속성으로 값을 가진다.
# 딕셔너리에서 key값으로 value를 찾을수는 있지만 value로 Key를 찾을수 없는 것과 같다.

In [None]:
# pandas에서 DataFrame에 적용되는 함수들
# sum()함수 이외에도 pandas에서 DataFrame에 적용되는 함수는 다음의 것들이 있다.
# count 전체 성분의(NaN이 아닌) 값의 갯수를 계산
# min, max 전체 성분의 최솟, 최댓값을 계산
# argmin, argmax 전체 성분의 최솟값, 최댓값이 위치한 (정수)인덱스를 반환
# idxmin, idxmax 최솟값, 최댓값이 위치한 인덱스(라벨)를 반환
# quantile 전체 성분의 특정 사분위수에 해당하는 값을 반환 (0~1 사이)
# sum 전체 성분의 합을 계산
# mean 전체 성분의 평균을 계산
# median 전체 성분의 중간값을 반환
# mad 전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산 (pandas 2.0부터는 제거되어 사용할 수 없음)
# std, var 전체 성분의 표준편차, 분산을 계산
# cumsum 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산(0에서부터 계속 더해짐)
# cumprod 맨 첫번째 성분부터 각 성분까지의 누적곱을 계산(1에서부터 계속 곱해짐)

In [204]:
# 6x4 frame of standard-normal random values: columns A-D, indexed by six
# consecutive days starting 2016-07-01.
df2 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=pd.date_range('20160701', periods=6),
    columns=list('ABCD'),
)
df2

Unnamed: 0,A,B,C,D
2016-07-01,1.135445,0.133351,0.081907,0.671343
2016-07-02,1.095508,-0.68483,1.038377,1.466279
2016-07-03,1.384512,0.537389,-0.250072,0.905375
2016-07-04,1.084587,0.22967,0.383284,-0.086406
2016-07-05,0.530772,-0.80513,0.205894,0.287849
2016-07-06,0.402503,-1.068011,-1.679904,1.17654


In [205]:
# A열과 B열의 상관계수 구하기
df2['A'].corr(df2['B'])

0.8665563324877111

In [206]:
# 정렬함수 및 기타함수

Unnamed: 0,A,B,C,D
2016-07-01,1.135445,0.133351,0.081907,0.671343
2016-07-02,1.095508,-0.68483,1.038377,1.466279
2016-07-03,1.384512,0.537389,-0.250072,0.905375
2016-07-04,1.084587,0.22967,0.383284,-0.086406
2016-07-05,0.530772,-0.80513,0.205894,0.287849
2016-07-06,0.402503,-1.068011,-1.679904,1.17654


In [207]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns = ['D', 'B', 'C', 'A'])
df2

Unnamed: 0,D,B,C,A
2016-07-01,0.671343,0.133351,0.081907,1.135445
2016-07-04,-0.086406,0.22967,0.383284,1.084587
2016-07-06,1.17654,-1.068011,-1.679904,0.402503
2016-07-02,1.466279,-0.68483,1.038377,1.095508
2016-07-03,0.905375,0.537389,-0.250072,1.384512
2016-07-05,0.287849,-0.80513,0.205894,0.530772


In [208]:
# index와 column의 순서가 섞여있다.
# 이 때 index가 오름차순이 되도록 정렬해보자
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,0.671343,0.133351,0.081907,1.135445
2016-07-02,1.466279,-0.68483,1.038377,1.095508
2016-07-03,0.905375,0.537389,-0.250072,1.384512
2016-07-04,-0.086406,0.22967,0.383284,1.084587
2016-07-05,0.287849,-0.80513,0.205894,0.530772
2016-07-06,1.17654,-1.068011,-1.679904,0.402503


In [209]:
# column을 기준으로?
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-07-01,1.135445,0.133351,0.081907,0.671343
2016-07-04,1.084587,0.22967,0.383284,-0.086406
2016-07-06,0.402503,-1.068011,-1.679904,1.17654
2016-07-02,1.095508,-0.68483,1.038377,1.466279
2016-07-03,1.384512,0.537389,-0.250072,0.905375
2016-07-05,0.530772,-0.80513,0.205894,0.287849


In [210]:
# 내림차순으로는?
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2016-07-06,1.17654,-1.068011,-1.679904,0.402503
2016-07-05,0.287849,-0.80513,0.205894,0.530772
2016-07-04,-0.086406,0.22967,0.383284,1.084587
2016-07-03,0.905375,0.537389,-0.250072,1.384512
2016-07-02,1.466279,-0.68483,1.038377,1.095508
2016-07-01,0.671343,0.133351,0.081907,1.135445


In [211]:
# 값 기준 정렬하기
# D열의 값이 오름차순이 되도록 정렬하기
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2016-07-04,-0.086406,0.22967,0.383284,1.084587
2016-07-05,0.287849,-0.80513,0.205894,0.530772
2016-07-01,0.671343,0.133351,0.081907,1.135445
2016-07-03,0.905375,0.537389,-0.250072,1.384512
2016-07-06,1.17654,-1.068011,-1.679904,0.402503
2016-07-02,1.466279,-0.68483,1.038377,1.095508


In [213]:
# B열의 값이 내림차순이 되도록 정렬하기
df2.sort_values(by='B', ascending=False)

Unnamed: 0,D,B,C,A
2016-07-03,0.905375,0.537389,-0.250072,1.384512
2016-07-04,-0.086406,0.22967,0.383284,1.084587
2016-07-01,0.671343,0.133351,0.081907,1.135445
2016-07-02,1.466279,-0.68483,1.038377,1.095508
2016-07-05,0.287849,-0.80513,0.205894,0.530772
2016-07-06,1.17654,-1.068011,-1.679904,0.402503


In [214]:
df2['E'] = np.random.randint(0, 6, size=6)
df2['F'] = ['a', 'b', 'g', 'g', 'a', 'g']
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-01,0.671343,0.133351,0.081907,1.135445,3,a
2016-07-04,-0.086406,0.22967,0.383284,1.084587,4,b
2016-07-06,1.17654,-1.068011,-1.679904,0.402503,4,g
2016-07-02,1.466279,-0.68483,1.038377,1.095508,2,g
2016-07-03,0.905375,0.537389,-0.250072,1.384512,4,a
2016-07-05,0.287849,-0.80513,0.205894,0.530772,2,g


In [215]:
# E열과 F열을 동시에 고려하여, 오름차순으로 하려면?
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2016-07-02,1.466279,-0.68483,1.038377,1.095508,2,g
2016-07-05,0.287849,-0.80513,0.205894,0.530772,2,g
2016-07-01,0.671343,0.133351,0.081907,1.135445,3,a
2016-07-03,0.905375,0.537389,-0.250072,1.384512,4,a
2016-07-04,-0.086406,0.22967,0.383284,1.084587,4,b
2016-07-06,1.17654,-1.068011,-1.679904,0.402503,4,g


In [216]:
df2.sort_values(by=['F', 'E'])

Unnamed: 0,D,B,C,A,E,F
2016-07-01,0.671343,0.133351,0.081907,1.135445,3,a
2016-07-03,0.905375,0.537389,-0.250072,1.384512,4,a
2016-07-04,-0.086406,0.22967,0.383284,1.084587,4,b
2016-07-02,1.466279,-0.68483,1.038377,1.095508,2,g
2016-07-05,0.287849,-0.80513,0.205894,0.530772,2,g
2016-07-06,1.17654,-1.068011,-1.679904,0.402503,4,g


In [217]:
# 지정한 행 또는 열에서 중복값을 제외한 유니크한 값만 얻기
df2['F'].unique()

array(['a', 'b', 'g'], dtype=object)

In [218]:
# 지정한 행 또는 열에서 값에 따른 개수 얻기
df2['F'].value_counts()

g    3
a    2
b    1
Name: F, dtype: int64

In [219]:
# 지정한 행 또는 열에서 입력한 값이 있는지 확인하기
df2['F'].isin(['a', 'b'])

2016-07-01     True
2016-07-04     True
2016-07-06    False
2016-07-02    False
2016-07-03     True
2016-07-05    False
Name: F, dtype: bool

In [222]:
# F열의 값이 a나 b인 모든 행 구하기
df2.loc[df2['F'].isin(['a', 'b']),:]

Unnamed: 0,D,B,C,A,E,F
2016-07-01,0.671343,0.133351,0.081907,1.135445,3,a
2016-07-04,-0.086406,0.22967,0.383284,1.084587,4,b
2016-07-03,0.905375,0.537389,-0.250072,1.384512,4,a


In [None]:
# Applying a user-defined function.

def func(x):
    """Return the range (maximum minus minimum) of ``x``.

    ``x`` must provide ``max()`` and ``min()`` methods, e.g. a pandas Series
    or a NumPy array. When passed to ``DataFrame.apply`` it is called once
    per column (``axis=0``) or once per row (``axis=1``).
    """
    return x.max() - x.min()

In [None]:
del df2['F']

In [227]:
df2.apply(func, axis=0)

D    1.552685
B    1.605400
C    2.718281
A    0.982009
E    2.000000
dtype: float64