# Handling Missing Data

`df.dropna()` — drop the rows that contain missing values\
`df.fillna(value)` — replace missing values with `value`

# Make New Columns

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Demo frame: 'A' holds the integers 1..10, 'B' holds 10 standard-normal draws.
# The generator is seeded so that 'B' is identical on every "Restart & Run All";
# an unseeded draw would produce different values (and outputs) on each run.
df = pd.DataFrame({
    'A': range(1, 11),
    'B': np.random.default_rng(42).standard_normal(10),
})
df

Unnamed: 0,A,B
0,1,0.457063
1,2,0.745689
2,3,-0.178034
3,4,-0.707746
4,5,0.69364
5,6,-0.366194
6,7,-0.456222
7,8,2.13597
8,9,-0.103601
9,10,1.642166


In [3]:
# Return a copy of df with a derived 'Area' column (A * B); df itself is not modified.
df.assign(Area=lambda frame: frame['A'] * frame['B'])

Unnamed: 0,A,B,Area
0,1,0.457063,0.457063
1,2,0.745689,1.491378
2,3,-0.178034,-0.534102
3,4,-0.707746,-2.830986
4,5,0.69364,3.468198
5,6,-0.366194,-2.197165
6,7,-0.456222,-3.193553
7,8,2.13597,17.087757
8,9,-0.103601,-0.93241
9,10,1.642166,16.421657


In [4]:
# Preview the first rows of a copy that carries the natural log of 'A' as 'ln_A'.
df.assign(ln_A=lambda frame: np.log(frame['A'])).head()

Unnamed: 0,A,B,ln_A
0,1,0.457063,0.0
1,2,0.745689,0.693147
2,3,-0.178034,1.098612
3,4,-0.707746,1.386294
4,5,0.69364,1.609438


In [5]:
# Store the natural log of 'A' as a new column on df (this mutates df in place).
# Bracket assignment is the only form that can create a column; attribute-style
# assignment (df.ln_A = ...) only overwrites a column that already exists, so the
# redundant second assignment of the same values has been dropped.
df['ln_A'] = np.log(df['A'])
df

Unnamed: 0,A,B,ln_A
0,1,0.457063,0.0
1,2,0.745689,0.693147
2,3,-0.178034,1.098612
3,4,-0.707746,1.386294
4,5,0.69364,1.609438
5,6,-0.366194,1.791759
6,7,-0.456222,1.94591
7,8,2.13597,2.079442
8,9,-0.103601,2.197225
9,10,1.642166,2.302585


In [6]:
# Bin 'A' into 3 quantile-based groups, labelled from the lowest bin to the highest.
pd.qcut(df['A'], q=3, labels=["good", "medium", "bad"])

0      good
1      good
2      good
3      good
4    medium
5    medium
6    medium
7       bad
8       bad
9       bad
Name: A, dtype: category
Categories (3, object): ['good' < 'medium' < 'bad']

In [7]:
# Convert a numeric column into categorical data: split 'B' into 2 quantile bins.
pd.qcut(df['B'], q=2, labels=["good", "bad"])

0     bad
1     bad
2    good
3    good
4     bad
5    good
6    good
7     bad
8    good
9     bad
Name: B, dtype: category
Categories (2, object): ['good' < 'bad']

In [8]:
# Minimum of every column (reduce along the row index).
df.min(axis='index')

A       1.000000
B      -0.707746
ln_A    0.000000
dtype: float64

In [9]:
# Apply thresholds: values below 3 become 3 and values above 8 become 8.
df.A.clip(lower=3, upper=8)

0    3
1    3
2    3
3    4
4    5
5    6
6    7
7    8
8    8
9    8
Name: A, dtype: int64

In [10]:
# Element-wise absolute value of column 'B'.
abs(df['B'])

0    0.457063
1    0.745689
2    0.178034
3    0.707746
4    0.693640
5    0.366194
6    0.456222
7    2.135970
8    0.103601
9    1.642166
Name: B, dtype: float64

# Reshaping data

## Melt & Pivot

In [11]:
# IPython help syntax: shows the documentation for pd.melt.
# This line is only valid inside IPython/Jupyter, not in a plain Python script.
pd.melt?

In [12]:
# Small wide-format frame for the melt examples: a label column 'A' and two
# numeric columns 'B' and 'C', with explicit integer row labels 0..2.
df = pd.DataFrame(
    {'A': ['a', 'b', 'c'], 'B': [1, 3, 5], 'C': [2, 4, 6]},
    index=[0, 1, 2],
)
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [13]:
# Turn column 'B' into rows: keep 'A' as the identifier, one output row per (A, B) value.
df.melt(id_vars=['A'], value_vars=['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [14]:
# Unpivot both 'B' and 'C' into long form, keeping 'A' as the identifier column.
df.melt(id_vars=['A'], value_vars=['B', 'C'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [15]:
# Unpivot every column with no identifier column, naming the two output
# columns 'var' and 'val' directly instead of renaming them afterwards.
pd.melt(
    df,
    value_vars=['A', 'B', 'C'],
    var_name='var',
    value_name='val',
)

Unnamed: 0,var,val
0,A,a
1,A,b
2,A,c
3,B,1
4,B,3
5,B,5
6,C,2
7,C,4
8,C,6


In [16]:
# Long-format frame for the pivot examples: every ('foo', 'bar') pair occurs
# exactly once and carries one 'baz' value.
df2 = pd.DataFrame({
    'foo': ['one'] * 3 + ['two'] * 3,
    'bar': ['A', 'B', 'C'] * 2,
    'baz': list(range(1, 7)),
})
df2

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
2,one,C,3
3,two,A,4
4,two,B,5
5,two,C,6


In [17]:
# Reshape long -> wide: one row per 'foo', one column per 'bar', cells filled from 'baz'.
df2.pivot(
    columns='bar',
    index='foo',
    values='baz',
)

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [18]:
# Pivot to wide form, then move the 'foo' index back into an ordinary column
# so the result gets a default integer index.
df3 = (
    df2
    .pivot(index='foo', columns='bar', values='baz')
    .reset_index()
)
df3

bar,foo,A,B,C
0,one,1,2,3
1,two,4,5,6


In [19]:
# Unpivot the wide frame back to long form, keeping 'foo' as the identifier column.
pd.melt(df3, id_vars=['foo'], value_vars=['A', 'B', 'C'])

Unnamed: 0,foo,bar,value
0,one,A,1
1,two,A,4
2,one,B,2
3,two,B,5
4,one,C,3
5,two,C,6


In [20]:
# Unpivot to long form and order the rows by 'foo'.
pd.melt(df3, id_vars=['foo'], value_vars=['A', 'B', 'C']).sort_values(by='foo')

Unnamed: 0,foo,bar,value
0,one,A,1
2,one,B,2
4,one,C,3
1,two,A,4
3,two,B,5
5,two,C,6


In [21]:
# Unpivot to long form with the value column named 'baz', then order the rows
# by 'foo' and 'bar'.
(
    df3
    .melt(id_vars=['foo'], value_vars=['A', 'B', 'C'], value_name='baz')
    .sort_values(by=['foo', 'bar'])
)

Unnamed: 0,foo,bar,baz
0,one,A,1
2,one,B,2
4,one,C,3
1,two,A,4
3,two,B,5
5,two,C,6


## Pandas Concat

In [22]:
# IPython help syntax: shows the documentation for pd.concat.
# This line is only valid inside IPython/Jupyter, not in a plain Python script.
pd.concat?

In [23]:
# First two-element Series of single-letter strings (default integer index).
s1 = pd.Series(list('ab'))
s1

0    a
1    b
dtype: object

In [24]:
# Second two-element Series of single-letter strings (default integer index).
s2 = pd.Series(list('cd'))
s2

0    c
1    d
dtype: object

In [25]:
# Stack the two Series end to end; each one keeps its own index labels.
pd.concat([s1, s2], axis=0)

0    a
1    b
0    c
1    d
dtype: object

In [26]:
# Stack the two Series and discard their labels in favour of a fresh 0..n-1 index.
pd.concat([s1, s2], axis=0, ignore_index=True)

0    a
1    b
2    c
3    d
dtype: object

In [27]:
# Stack the two Series under an outer index level; passing a mapping uses its
# keys ('s1', 's2') as the labels of that outer level.
pd.concat({'s1': s1, 's2': s2})

s1  0    a
    1    b
s2  0    c
    1    d
dtype: object

In [28]:
# Same two-level stacking, with both index levels given names; the mapping keys
# ('s1', 's2') label the outer level.
pd.concat(
    {'s1': s1, 's2': s2},
    names=['Series name', 'Row ID'],
)

Series name  Row ID
s1           0         a
             1         b
s2           0         c
             1         d
dtype: object

In [29]:
# Two-row frame built column by column: a 'letter' column and a 'number' column.
df1 = pd.DataFrame({
    'letter': ['a', 'b'],
    'number': [1, 2],
})
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [30]:
# Two-row frame with 'letter' and 'number' columns, built column by column.
df2 = pd.DataFrame({
    'letter': ['c', 'd'],
    'number': [3, 4],
})
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [31]:
# Stack the two frames row-wise; each frame keeps its original row labels.
pd.concat([df1, df2], axis=0)

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [32]:
# Two-row frame with 'letter', 'number' and an additional 'animal' column.
df3 = pd.DataFrame({
    'letter': ['c', 'd'],
    'number': [3, 4],
    'animal': ['cat', 'dog'],
})
df3

Unnamed: 0,letter,number,animal
0,c,3,cat
1,d,4,dog


In [33]:
# Row-wise concat keeping the union of columns (the default, join='outer');
# rows from df1 have no 'animal' value, so that cell is left missing.
pd.concat([df1, df3], join='outer')

Unnamed: 0,letter,number,animal
0,a,1,
1,b,2,
0,c,3,cat
1,d,4,dog


In [34]:
# Row-wise concat keeping only the columns that both frames share.
pd.concat([df1, df3], axis=0, join='inner')

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


In [35]:
# Two-row frame pairing an 'animal' with its 'name'.
df4 = pd.DataFrame({
    'animal': ['bird', 'monkey'],
    'name': ['polly', 'george'],
})
df4

Unnamed: 0,animal,name
0,bird,polly
1,monkey,george


In [36]:
# One-cell frame: value 1 in the default column 0, with the row labelled 'a'.
df5 = pd.DataFrame(data=[1], index=pd.Index(['a']))
df5

Unnamed: 0,0
a,1


In [37]:
# One-cell frame: value 2 in the default column 0, with the row labelled 'a'.
df6 = pd.DataFrame(data=[2], index=pd.Index(['a']))
df6

Unnamed: 0,0
a,2


In [38]:
# Row-wise concat of two frames that share the row label 'a'; by default the
# duplicated label is kept as-is.
pd.concat([df5, df6], axis=0)

Unnamed: 0,0
a,1
a,2


In [39]:
# verify_integrity=True makes concat check the combined index for duplicates,
# which lets you validate the result while merging.
# df5 and df6 both use the row label 'a', so this call raises ValueError.
# The error is the point of the demonstration, so it is caught and printed
# rather than left uncaught, which would stop a "Restart & Run All".
try:
    pd.concat([df5, df6], verify_integrity=True)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Indexes have overlapping values: Index(['a'], dtype='object')