# Pandas

In [133]:
import numpy as np
import pandas as pd

## Series

In [134]:
obj = pd.Series([4,7,-5,3])

In [135]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [136]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [137]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [138]:
obj.dtypes

dtype('int64')

In [139]:
obj2 = pd.Series([4,7,-5,3], index = ["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [140]:
sdata = {'Kim':35000, 'Beomwoo': 67000, 'Joan' : 12000 , 'Choi':4000}

In [141]:
obj3 = pd.Series(sdata)
obj3

Kim        35000
Beomwoo    67000
Joan       12000
Choi        4000
dtype: int64

In [142]:
# Label the Series itself; the label is printed as "Name: Salary" in the repr.
obj3.name = 'Salary'
# Label the index; it is printed as a header line above the index values.
obj3.index.name = "Names"
# Left disabled: obj3.value.name = "salary"
# NOTE(review): the cells above use `.values`, not `.value`, so this line would
# presumably fail if re-enabled — confirm before restoring it.
obj3

Names
Kim        35000
Beomwoo    67000
Joan       12000
Choi        4000
Name: Salary, dtype: int64

In [143]:
obj3.index = ['A','B','C','D']
obj3

A    35000
B    67000
C    12000
D     4000
Name: Salary, dtype: int64

## DataFrame

### 생성

In [144]:
data = {'name':['Beomwoo','Beomwoo','Beomwoo','Kim','Park'],
       'year':[2013,2014,2015,2016,2015],
       'points':[1.5,1.7,3.6,2.4,2.9]}
df = pd.DataFrame(data)
df

Unnamed: 0,name,year,points
0,Beomwoo,2013,1.5
1,Beomwoo,2014,1.7
2,Beomwoo,2015,3.6
3,Kim,2016,2.4
4,Park,2015,2.9


### Attributes (index, columns, values)

In [145]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [146]:
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [147]:
df.values

array([['Beomwoo', 2013, 1.5],
       ['Beomwoo', 2014, 1.7],
       ['Beomwoo', 2015, 3.6],
       ['Kim', 2016, 2.4],
       ['Park', 2015, 2.9]], dtype=object)

In [148]:
df.index.name='Num'
df.columns.name = 'Info'
df

Info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Beomwoo,2013,1.5
1,Beomwoo,2014,1.7
2,Beomwoo,2015,3.6
3,Kim,2016,2.4
4,Park,2015,2.9


In [149]:
df2 = pd.DataFrame(data,columns=['year','name','points','penalty'],
                  index=['one','two','three','four','five'])
df2

Unnamed: 0,year,name,points,penalty
one,2013,Beomwoo,1.5,
two,2014,Beomwoo,1.7,
three,2015,Beomwoo,3.6,
four,2016,Kim,2.4,
five,2015,Park,2.9,


In [150]:
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2014.6,2.42
std,1.140175,0.864292
min,2013.0,1.5
25%,2014.0,1.7
50%,2015.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


In [151]:
# NOTE: this alias rebinds the name `df`, which until now referred to the
# DataFrame instance created earlier, to the DataFrame class itself.
from pandas import DataFrame as df

In [152]:
# Build a 3x4 integer frame (rows r0..r2, columns c0..c3) from 0..11.
# The constructor is reached through the `pd` namespace instead of the `df`
# alias, so this cell does not depend on `df` having been rebound to a class.
df_1 = pd.DataFrame(data=np.arange(12).reshape(3, 4),
                    index=['r0', 'r1', 'r2'],
                    columns=['c0', 'c1', 'c2', 'c3'],
                    dtype='int', copy=False)

In [153]:
df_1

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7
r2,8,9,10,11


In [154]:
df_1.T

Unnamed: 0,r0,r1,r2
c0,0,4,8
c1,1,5,9
c2,2,6,10
c3,3,7,11


In [155]:
df_1.axes

[Index(['r0', 'r1', 'r2'], dtype='object'),
 Index(['c0', 'c1', 'c2', 'c3'], dtype='object')]

In [156]:
df_1.dtypes

c0    int32
c1    int32
c2    int32
c3    int32
dtype: object

In [157]:
df_1.shape

(3, 4)

In [158]:
df_1.shape

(3, 4)

In [159]:
df_1.shape[0]

3

In [160]:
df_1.size

12

In [161]:
df_1.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

### DF 조회

In [162]:
df_1.head(2)

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7


In [163]:
df_1.tail(2)

Unnamed: 0,c0,c1,c2,c3
r1,4,5,6,7
r2,8,9,10,11


---------------------------------------------

### pd.concat

In [164]:
import pandas as pd
from pandas import DataFrame

In [165]:
# Rows labelled 0..2; every cell is its column letter followed by a number 0..2.
df_1 = pd.DataFrame(
    {col: [col + str(num) for num in range(3)] for col in 'ABCD'},
    index=[0, 1, 2],
)

In [166]:
# Rows labelled 0..2; every cell is its column letter followed by a number 3..5.
df_2 = pd.DataFrame(
    {col: [col + str(num) for num in range(3, 6)] for col in 'ABCD'},
    index=[0, 1, 2],
)

In [167]:
df_1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [168]:
df_2

Unnamed: 0,A,B,C,D
0,A3,B3,C3,D3
1,A4,B4,C4,D4
2,A5,B5,C5,D5


#### rbind (axis = 0)

In [169]:
df_12_axis0 = pd.concat([df_1,df_2])

In [170]:
df_12_axis0

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
0,A3,B3,C3,D3
1,A4,B4,C4,D4
2,A5,B5,C5,D5


In [171]:
# Rows labelled 0..2; every cell is its column letter followed by a number 6..8.
df_3 = pd.DataFrame(
    {col: [col + str(num) for num in range(6, 9)] for col in 'ABCD'},
    index=[0, 1, 2],
)

In [172]:
df_1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [173]:
df_3

Unnamed: 0,A,B,C,D
0,A6,B6,C6,D6
1,A7,B7,C7,D7
2,A8,B8,C8,D8


#### cbind (axis=1)

In [174]:
df_13_axis1 = pd.concat([df_1,df_3],axis=1)

In [175]:
df_13_axis1

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A6,B6,C6,D6
1,A1,B1,C1,D1,A7,B7,C7,D7
2,A2,B2,C2,D2,A8,B8,C8,D8


In [176]:
df_12_axis1 = pd.concat([df_1,df_2],axis=1)

In [177]:
df_12_axis1

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A3,B3,C3,D3
1,A1,B1,C1,D1,A4,B4,C4,D4
2,A2,B2,C2,D2,A5,B5,C5,D5


#### union (join=outer)

In [178]:
# Columns A, B, C, E (no D) and row labels 0, 1, 3 (no 2), so this frame only
# partly overlaps the other frames in both columns and index.
df_4 = pd.DataFrame(
    {col: [col + str(num) for num in range(3)] for col in 'ABCE'},
    index=[0, 1, 3],
)

In [179]:
df_1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [180]:
df_4

Unnamed: 0,A,B,C,E
0,A0,B0,C0,E0
1,A1,B1,C1,E1
3,A2,B2,C2,E2


In [181]:
df_14_outer = pd.concat([df_1,df_4],join='outer')

In [182]:
df_14_outer

Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
0,A0,B0,C0,,E0
1,A1,B1,C1,,E1
3,A2,B2,C2,,E2


#### intersection (join=inner)

In [183]:
df_14_inner = pd.concat([df_1,df_4],join='inner')

In [184]:
df_14_inner

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A0,B0,C0
1,A1,B1,C1
3,A2,B2,C2


### index기준으로 병합

    * merge
    * join or index

In [185]:
# Left-hand frame for the index-based merge/join examples (row keys K0..K3).
df_left = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    },
    index=['K0', 'K1', 'K2', 'K3'],
)

In [186]:
# Right-hand frame for the index-based merge/join examples (row keys K2..K5).
df_right = pd.DataFrame(
    {
        'C': ['C2', 'C3', 'C4', 'C5'],
        'D': ['D2', 'D3', 'D4', 'D5'],
    },
    index=['K2', 'K3', 'K4', 'K5'],
)

In [187]:
df_left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,B3


In [188]:
df_right

Unnamed: 0,C,D
K2,C2,D2
K3,C3,D3
K4,C4,D4
K5,C5,D5


#### left

In [189]:
pd.merge(df_left,df_right,
        left_index=True, right_index=True,
        how='left')

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3


In [190]:
df_left.join(df_right, how='left')

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3


p.74- 90

#### right 

In [191]:
pd.merge(df_left,df_right,
        left_index=True, right_index=True,
        how='right')

Unnamed: 0,A,B,C,D
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3
K4,,,C4,D4
K5,,,C5,D5


In [192]:
df_left.join(df_right, how='right')

Unnamed: 0,A,B,C,D
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3
K4,,,C4,D4
K5,,,C5,D5


#### inner

In [193]:
pd.merge(df_left,df_right,
        left_index=True, right_index=True,
        how='inner')

Unnamed: 0,A,B,C,D
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3


In [194]:
df_left.join(df_right, how='inner')

Unnamed: 0,A,B,C,D
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3


#### outer

In [195]:
pd.merge(df_left,df_right,
        left_index=True, right_index=True,
        how='outer')

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3
K4,,,C4,D4
K5,,,C5,D5


In [196]:
df_left.join(df_right, how='outer')

Unnamed: 0,A,B,C,D
K0,A0,B0,,
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,A3,B3,C3,D3
K4,,,C4,D4
K5,,,C5,D5


_______________________________________


### 열만 추출 및 조작

In [197]:
data = {"names":["Kilho","Kilho","Kilho","Charles","Charles"],
       "year": [2014,2015,2016,2015,2016],
       'points':[1.5,1.7,3.6,2.4,2.9]}
df = pd.DataFrame(data,columns=['year','names','points','penalty'],
                 index = ['one','two','three','four','five'])
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,
two,2015,Kilho,1.7,
three,2016,Kilho,3.6,
four,2015,Charles,2.4,
five,2016,Charles,2.9,


In [198]:
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [199]:
df.year


one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [200]:
df[['year','points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [201]:
df['penalty']

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
Name: penalty, dtype: object

In [202]:
df['penalty']=0.5

In [203]:
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.5
two,2015,Kilho,1.7,0.5
three,2016,Kilho,3.6,0.5
four,2015,Charles,2.4,0.5
five,2016,Charles,2.9,0.5


In [204]:
df['penalty']=[0.1,0.2,0.3,0.4,0.5]

In [205]:
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.1
two,2015,Kilho,1.7,0.2
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [206]:
df['zeros'] = np.arange(5)

In [207]:
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,Kilho,1.5,0.1,0
two,2015,Kilho,1.7,0.2,1
three,2016,Kilho,3.6,0.3,2
four,2015,Charles,2.4,0.4,3
five,2016,Charles,2.9,0.5,4


In [208]:
val = pd.Series([-1.2,-1.5,-1.7],index = ['two','four','five'])

In [209]:
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [210]:
df['debt'] = val

In [211]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,Kilho,1.5,0.1,0,
two,2015,Kilho,1.7,0.2,1,-1.2
three,2016,Kilho,3.6,0.3,2,
four,2015,Charles,2.4,0.4,3,-1.5
five,2016,Charles,2.9,0.5,4,-1.7


In [212]:
df['net_points'] = df['points'] - df['penalty']

In [213]:
df[['net_points']]

Unnamed: 0,net_points
one,1.4
two,1.5
three,3.3
four,2.0
five,2.4


In [214]:
df['net_points'] > 2.0

one      False
two      False
three     True
four     False
five      True
Name: net_points, dtype: bool

In [215]:
df['high_points'] = df['net_points'] > 2.0

In [216]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,Kilho,1.5,0.1,0,,1.4,False
two,2015,Kilho,1.7,0.2,1,-1.2,1.5,False
three,2016,Kilho,3.6,0.3,2,,3.3,True
four,2015,Charles,2.4,0.4,3,-1.5,2.0,False
five,2016,Charles,2.9,0.5,4,-1.7,2.4,True


In [217]:
# Delete a column from the frame in place
del df['high_points']

In [218]:
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points
one,2014,Kilho,1.5,0.1,0,,1.4
two,2015,Kilho,1.7,0.2,1,-1.2,1.5
three,2016,Kilho,3.6,0.3,2,,3.3
four,2015,Charles,2.4,0.4,3,-1.5,2.0
five,2016,Charles,2.9,0.5,4,-1.7,2.4


In [219]:
del df['net_points']
del df['zeros']

In [220]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2014,Kilho,1.5,0.1,
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5
five,2016,Charles,2.9,0.5,-1.7


In [221]:
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt'], dtype='object')

### 행만 선택 조작

#### 비추천: 행과 열을 동시에 조건으로 추출할 수 없음

In [222]:
df[0:3]

Unnamed: 0,year,names,points,penalty,debt
one,2014,Kilho,1.5,0.1,
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,


In [223]:
df['two':'four']

Unnamed: 0,year,names,points,penalty,debt
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5


### 행 열 동시에 추출 가능 (loc, iloc)

#### df.loc[]  : 이름으로 행열 추출

In [224]:
df.loc['two','year']

2015

In [225]:
df.loc['two':'four']

Unnamed: 0,year,names,points,penalty,debt
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5


In [226]:
df.loc['two':'four','points']

two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [227]:
df.loc[:,'year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [228]:
# Kept disabled: df.loc[,'year'] is not valid Python syntax. The row selector
# must be written explicitly, e.g. df.loc[:,'year'] as in the previous cell.

In [229]:
df.loc[:,['year','names']]

Unnamed: 0,year,names
one,2014,Kilho
two,2015,Kilho
three,2016,Kilho
four,2015,Charles
five,2016,Charles


In [230]:
df.loc['three':'five','year':'penalty']

Unnamed: 0,year,names,points,penalty
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [231]:
# Insert a new row by assigning to the row label 'six', which does not exist yet.
# In the output shown below, 'year' is displayed as floats (2014.0, ...) after this assignment.
df.loc['six',:]=[2013,'Jun',4.0,0.1,2.1]

In [232]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


#### df.iloc[]  : 정수 위치(position)로 행열 추출

In [233]:
df.iloc[3]

year          2015
names      Charles
points         2.4
penalty        0.4
debt          -1.5
Name: four, dtype: object

In [234]:
df.iloc[3:5,0:2]

Unnamed: 0,year,names
four,2015.0,Charles
five,2016.0,Charles


In [235]:
df.iloc[[0,1,3],[1,2]]

Unnamed: 0,names,points
one,Kilho,1.5
two,Kilho,1.7
four,Charles,2.4


In [236]:
df.iloc[:,1:4]

Unnamed: 0,names,points,penalty
one,Kilho,1.5,0.1
two,Kilho,1.7,0.2
three,Kilho,3.6,0.3
four,Charles,2.4,0.4
five,Charles,2.9,0.5
six,Jun,4.0,0.1


In [237]:
df.iloc[1,1]

'Kilho'

#### 논리 인덱싱

In [238]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


In [239]:
df['year'] > 2014

one      False
two       True
three     True
four      True
five      True
six      False
Name: year, dtype: bool

In [240]:
# Explicit .loc spelling of the boolean row filter used in the next cell:
# df.loc[df['year']>2014,:]

In [241]:
df[df['year']>2014]

Unnamed: 0,year,names,points,penalty,debt
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7


In [242]:
df.loc[df['names']=='Kilho',['names','points']]

Unnamed: 0,names,points
one,Kilho,1.5
two,Kilho,1.7
three,Kilho,3.6


In [243]:
df.loc[(df['points']>2)&(df['points']<3),:]

Unnamed: 0,year,names,points,penalty,debt
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7


In [244]:
df.loc[df['points']>3, 'penalty'] = 0

In [245]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.0,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.0,2.1


--------------------------------------------------------

### DF 날짜 

In [246]:
# Seed NumPy's global generator so this random frame, and the random draws in
# later cells, come out the same on every Restart & Run All.
np.random.seed(42)
# 6 rows x 4 columns of standard-normal samples with default integer labels.
df = pd.DataFrame(np.random.randn(6,4))

In [247]:
df

Unnamed: 0,0,1,2,3
0,-1.169137,-0.75287,-0.144583,-0.696182
1,0.537385,-1.41871,1.075533,-0.509228
2,0.028343,-0.047708,-0.2928,-0.218272
3,-0.217681,-1.090517,0.503141,0.854906
4,-0.00318,1.497773,1.146637,0.495832
5,1.073954,0.935282,0.491765,-1.231107


In [248]:
df.columns = ['A','B','C','D']
df.index = pd.date_range('20160701', periods = 6)
df.index

DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')

In [249]:
df

Unnamed: 0,A,B,C,D
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182
2016-07-02,0.537385,-1.41871,1.075533,-0.509228
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272
2016-07-04,-0.217681,-1.090517,0.503141,0.854906
2016-07-05,-0.00318,1.497773,1.146637,0.495832
2016-07-06,1.073954,0.935282,0.491765,-1.231107


In [250]:
m = pd.date_range('20160701', periods = 40, freq='M') # 'M' = month-end frequency
# `periods` is the number of dates to generate
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of 'ME' — confirm against the installed version

In [251]:
m

DatetimeIndex(['2016-07-31', '2016-08-31', '2016-09-30', '2016-10-31',
               '2016-11-30', '2016-12-31', '2017-01-31', '2017-02-28',
               '2017-03-31', '2017-04-30', '2017-05-31', '2017-06-30',
               '2017-07-31', '2017-08-31', '2017-09-30', '2017-10-31',
               '2017-11-30', '2017-12-31', '2018-01-31', '2018-02-28',
               '2018-03-31', '2018-04-30', '2018-05-31', '2018-06-30',
               '2018-07-31', '2018-08-31', '2018-09-30', '2018-10-31',
               '2018-11-30', '2018-12-31', '2019-01-31', '2019-02-28',
               '2019-03-31', '2019-04-30', '2019-05-31', '2019-06-30',
               '2019-07-31', '2019-08-31', '2019-09-30', '2019-10-31'],
              dtype='datetime64[ns]', freq='M')

In [252]:
ms = pd.date_range('20160701', periods = 60, freq='MS') # Month Start

In [253]:
ms

DatetimeIndex(['2016-07-01', '2016-08-01', '2016-09-01', '2016-10-01',
               '2016-11-01', '2016-12-01', '2017-01-01', '2017-02-01',
               '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
               '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01',
               '2017-11-01', '2017-12-01', '2018-01-01', '2018-02-01',
               '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01',
               '2018-07-01', '2018-08-01', '2018-09-01', '2018-10-01',
               '2018-11-01', '2018-12-01', '2019-01-01', '2019-02-01',
               '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01',
               '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01',
               '2019-11-01', '2019-12-01', '2020-01-01', '2020-02-01',
               '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
               '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
      

### 결측값(NaN) 조작

In [254]:
df['F'] = [1.0,np.nan,3.5,6.1,np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-05,-0.00318,1.497773,1.146637,0.495832,
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [255]:
df.dropna(how='any')
# Drop every row that contains at least one NaN

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [256]:
df.dropna(how='all')
# Drop a row only when all of its values are NaN

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-05,-0.00318,1.497773,1.146637,0.495832,
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [257]:
df

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-05,-0.00318,1.497773,1.146637,0.495832,
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [258]:
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,0.5
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-05,-0.00318,1.497773,1.146637,0.495832,0.5
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [259]:
df.isnull()

Unnamed: 0,A,B,C,D,F
2016-07-01,False,False,False,False,False
2016-07-02,False,False,False,False,True
2016-07-03,False,False,False,False,False
2016-07-04,False,False,False,False,False
2016-07-05,False,False,False,False,True
2016-07-06,False,False,False,False,False


In [260]:
# Select the rows whose 'F' value is missing. The mask is computed on the 'F'
# column alone instead of null-checking the whole frame and discarding the rest.
df.loc[df['F'].isnull(), :]

Unnamed: 0,A,B,C,D,F
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,
2016-07-05,-0.00318,1.497773,1.146637,0.495832,


In [261]:
# Boolean mask of the missing values in column 'F', computed on that column
# only rather than on every column of the frame.
df['F'].isnull()

2016-07-01    False
2016-07-02     True
2016-07-03    False
2016-07-04    False
2016-07-05     True
2016-07-06    False
Freq: D, Name: F, dtype: bool

### 행 열 삭제

In [262]:
pd.to_datetime('20160701')

Timestamp('2016-07-01 00:00:00')

In [263]:
df.drop(pd.to_datetime('20160701'))

Unnamed: 0,A,B,C,D,F
2016-07-02,0.537385,-1.41871,1.075533,-0.509228,
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-04,-0.217681,-1.090517,0.503141,0.854906,6.1
2016-07-05,-0.00318,1.497773,1.146637,0.495832,
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [264]:
# drop() removes rows by default (axis=0); both date labels are parsed in one call.
df.drop(pd.to_datetime(['20160702', '20160704']))

Unnamed: 0,A,B,C,D,F
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182,1.0
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272,3.5
2016-07-05,-0.00318,1.497773,1.146637,0.495832,
2016-07-06,1.073954,0.935282,0.491765,-1.231107,7.0


In [265]:
df.drop('F',axis=1) # axis=1 drops the column 'F' (a column, not a row)

Unnamed: 0,A,B,C,D
2016-07-01,-1.169137,-0.75287,-0.144583,-0.696182
2016-07-02,0.537385,-1.41871,1.075533,-0.509228
2016-07-03,0.028343,-0.047708,-0.2928,-0.218272
2016-07-04,-0.217681,-1.090517,0.503141,0.854906
2016-07-05,-0.00318,1.497773,1.146637,0.495832
2016-07-06,1.073954,0.935282,0.491765,-1.231107


In [127]:
df.drop(['B','D'],axis=1)

Unnamed: 0,A,C,F
2016-07-01,0.697795,-0.412307,1.0
2016-07-02,0.947892,-0.3134,
2016-07-03,-0.712625,0.600084,3.5
2016-07-04,-0.345576,-1.448574,6.1
2016-07-05,-0.440028,0.746841,
2016-07-06,1.328865,-0.332754,7.0


### pandas Func

In [127]:
# A DataFrame can also be built from a list of rows (one inner list per row).
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(
    data,
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

In [128]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [130]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [131]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [132]:
df.sum(axis=1,skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [133]:
df['one'].sum()

9.25

In [134]:
df.loc['b'].sum()

2.5999999999999996

In [135]:
df['one'].mean()

3.0833333333333335

In [136]:
one_mean = df["one"].mean()

In [137]:
two_min = df['two'].min()

In [138]:
df['one']= df['one'].fillna(value = one_mean)

In [141]:
df['two']= df['two'].fillna(value = two_min)

In [142]:
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


In [277]:
# Six consecutive days starting 2016-07-01, four columns of standard-normal noise.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range("20160701", periods=6),
    columns=list("ABCD"),
)

In [278]:
df2

Unnamed: 0,A,B,C,D
2016-07-01,-1.059553,-1.037434,-0.57864,-0.400516
2016-07-02,-0.618495,0.961035,-1.059195,0.980528
2016-07-03,-1.911473,0.978056,-0.680416,0.214674
2016-07-04,2.185288,0.10629,0.591556,0.571122
2016-07-05,-0.063058,0.124124,2.074285,-0.573853
2016-07-06,2.220763,-0.91641,-1.538874,1.825827


In [146]:
df2['A'].corr(df2['B'])

-0.02132678750254424

In [147]:
df2['B'].cov(df2['C'])

-0.22322345854859338

In [268]:
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,0.542499,0.448992,-0.458036
B,0.542499,1.0,0.418247,-0.849659
C,0.448992,0.418247,1.0,-0.691197
D,-0.458036,-0.849659,-0.691197,1.0


In [149]:
df2.cov()

Unnamed: 0,A,B,C,D
A,0.421358,-0.014085,-0.02923,0.373285
B,-0.014085,1.035161,-0.223223,-0.695623
C,-0.02923,-0.223223,0.2599,0.0746
D,0.373285,-0.695623,0.0746,1.164127


In [279]:
dates = df2.index

In [280]:
random_dates = np.random.permutation(dates) # shuffle the dates into a random order

In [281]:
df2= df2.reindex(index = random_dates, columns = ['D','B','C','A'])
df2

Unnamed: 0,D,B,C,A
2016-07-03,0.214674,0.978056,-0.680416,-1.911473
2016-07-01,-0.400516,-1.037434,-0.57864,-1.059553
2016-07-06,1.825827,-0.91641,-1.538874,2.220763
2016-07-04,0.571122,0.10629,0.591556,2.185288
2016-07-05,-0.573853,0.124124,2.074285,-0.063058
2016-07-02,0.980528,0.961035,-1.059195,-0.618495


#### sort_index() 인덱스 정렬

In [282]:
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,-0.400516,-1.037434,-0.57864,-1.059553
2016-07-02,0.980528,0.961035,-1.059195,-0.618495
2016-07-03,0.214674,0.978056,-0.680416,-1.911473
2016-07-04,0.571122,0.10629,0.591556,2.185288
2016-07-05,-0.573853,0.124124,2.074285,-0.063058
2016-07-06,1.825827,-0.91641,-1.538874,2.220763


In [283]:
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-07-03,-1.911473,0.978056,-0.680416,0.214674
2016-07-01,-1.059553,-1.037434,-0.57864,-0.400516
2016-07-06,2.220763,-0.91641,-1.538874,1.825827
2016-07-04,2.185288,0.10629,0.591556,0.571122
2016-07-05,-0.063058,0.124124,2.074285,-0.573853
2016-07-02,-0.618495,0.961035,-1.059195,0.980528


In [284]:
df2.sort_index(axis=1, ascending = False)

Unnamed: 0,D,C,B,A
2016-07-03,0.214674,-0.680416,0.978056,-1.911473
2016-07-01,-0.400516,-0.57864,-1.037434,-1.059553
2016-07-06,1.825827,-1.538874,-0.91641,2.220763
2016-07-04,0.571122,0.591556,0.10629,2.185288
2016-07-05,-0.573853,2.074285,0.124124,-0.063058
2016-07-02,0.980528,-1.059195,0.961035,-0.618495


In [285]:
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2016-07-05,-0.573853,0.124124,2.074285,-0.063058
2016-07-01,-0.400516,-1.037434,-0.57864,-1.059553
2016-07-03,0.214674,0.978056,-0.680416,-1.911473
2016-07-04,0.571122,0.10629,0.591556,2.185288
2016-07-02,0.980528,0.961035,-1.059195,-0.618495
2016-07-06,1.825827,-0.91641,-1.538874,2.220763


In [286]:
df2.sort_values(by='B',ascending=False)

Unnamed: 0,D,B,C,A
2016-07-03,0.214674,0.978056,-0.680416,-1.911473
2016-07-02,0.980528,0.961035,-1.059195,-0.618495
2016-07-05,-0.573853,0.124124,2.074285,-0.063058
2016-07-04,0.571122,0.10629,0.591556,2.185288
2016-07-06,1.825827,-0.91641,-1.538874,2.220763
2016-07-01,-0.400516,-1.037434,-0.57864,-1.059553


In [287]:
df2['E'] = np.random.randint(0,6,size = 6) # six random integers in the range 0 to 5

In [288]:
df2['F'] = ['alpha','beta','gamma','gamma','alpha','gamma']
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-03,0.214674,0.978056,-0.680416,-1.911473,2,alpha
2016-07-01,-0.400516,-1.037434,-0.57864,-1.059553,1,beta
2016-07-06,1.825827,-0.91641,-1.538874,2.220763,4,gamma
2016-07-04,0.571122,0.10629,0.591556,2.185288,3,gamma
2016-07-05,-0.573853,0.124124,2.074285,-0.063058,1,alpha
2016-07-02,0.980528,0.961035,-1.059195,-0.618495,2,gamma


In [160]:
df2['F'].unique()

array(['alpha', 'beta', 'gamma'], dtype=object)

In [161]:
df2['F'].value_counts()

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [162]:
df2['F'].isin(['alpha','beta'])

2016-07-06     True
2016-07-03     True
2016-07-01    False
2016-07-04    False
2016-07-02     True
2016-07-05    False
Name: F, dtype: bool

In [163]:
df2.loc[df2['F'].isin(['alpha','beta']),:]

Unnamed: 0,D,B,C,A,E,F
2016-07-06,-0.477434,-0.733172,0.135929,-0.492171,5,alpha
2016-07-03,0.874406,-1.008058,0.980876,-0.67834,4,beta
2016-07-02,2.023203,-1.193125,0.223953,0.999608,2,alpha


In [165]:
# Four cities (rows) by three columns of standard-normal noise.
df3 = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Seoul', 'Incheon', 'Busan', 'Daegu'],
    columns=['b', 'd', 'e'],
)
df3

Unnamed: 0,b,d,e
Seoul,0.254189,0.238857,-1.285643
Incheon,-0.476705,-0.808729,-0.403328
Busan,0.530577,-1.195103,-0.649934
Daegu,1.430485,0.511266,-0.708442


In [166]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest

In [167]:
df3.apply(func,axis=0)

b    1.907190
d    1.706369
e    0.882315
dtype: float64

## lambda

In [168]:
# Anonymous function that squares its argument (two-argument pow is the same operation as **).
my_pow2 = lambda x: pow(x, 2)

In [169]:
my_pow2(3)

9

In [170]:
my_list=[1,2,3,4]

In [171]:
# Squaring a plain list is expected to fail, because lists do not support `**`.
# The error is caught and printed so the notebook still runs from top to bottom.
try:
    my_pow2(my_list)
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

### map() + lambda

In [172]:
list(map(lambda i: i**2, my_list))

[1, 4, 9, 16]

##### map() 활용 X

In [173]:
a = [1.2,2.5,3.7,4.6]


In [174]:
# Without map(): convert each element to int, overwriting the list in place.
for position, number in enumerate(a):
    a[position] = int(number)

In [175]:
a

[1, 2, 3, 4]

In [176]:
a = [1.2, 2.5, 3.7, 4.6]

##### map() 활용 O

###### int로 변환

In [177]:
a = list(map(int,a))

In [178]:
a

[1, 2, 3, 4]

###### str로 변환

In [179]:
a = list(map(str,range(10)))

In [180]:
a

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']