In [3]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

## 重複データ処理

In [2]:
# Small frame with repeated rows, used to demonstrate duplicate handling.
df1 = DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'B'],
    'key2': [2, 2, 2, 3, 3],
})

In [3]:
# Display the frame so the repeated rows are visible.
df1

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


上から順に見て、それより前に同じ行がすでに現れている場合に True になる(最初の出現は False)

In [6]:
# Flag every row that repeats an earlier row (all columns compared);
# the first occurrence of each row stays False.
df1.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
dtype: bool

#### 重複を取り除く

In [7]:
# Return a copy without fully duplicated rows, keeping each first occurrence.
df1.drop_duplicates(keep='first')

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [8]:
# Compare only 'key1' when looking for duplicates; the first row per key is kept.
df1.drop_duplicates(subset=['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [12]:
# Same as above, but keep the last row for each 'key1' value instead of the first.
df1.drop_duplicates(subset=['key1'], keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


## マッピング

In [13]:
# City / altitude table used for the mapping examples.
df2 = DataFrame({
    'city': ['Alma', 'Brion', 'Fox'],
    'altitude': [3155, 55000, 2434],
})

In [14]:
# Display the city / altitude frame.
df2

Unnamed: 0,city,altitude
0,Alma,3155
1,Brion,55000
2,Fox,2434


In [15]:
# Lookup table: city name -> state name.
# NOTE(review): 'Myowing' looks like a typo for 'Wyoming'; left unchanged so
# the value keeps matching the outputs recorded further down.
state_map = {
    'Alma': 'Colorado',
    'Brion': 'Utah',
    'Fox': 'Myowing',
}

map を使うと、city の値をキーとして辞書から対応する値を引き、新しい列として追加できる

In [16]:
# Look up each city in state_map and store the result as a new 'state' column.
df2['state'] = df2['city'].map(state_map)

In [17]:
# Display df2 to check the mapped 'state' column.
df2

Unnamed: 0,city,altitude,state
0,Alma,3155,Colorado
1,Brion,55000,Utah
2,Fox,2434,Myowing


In [18]:
# Add a column from a plain list; the list length must match the number of rows.
df2['key1'] = [0,1,2]

In [19]:
# Display df2 with the added 'key1' column.
df2

Unnamed: 0,city,altitude,state,key1
0,Alma,3155,Colorado,0
1,Brion,55000,Utah,1
2,Fox,2434,Myowing,2


## 置換

In [20]:
# Integer series in which the values 1..4 each appear twice.
ser1 = Series([1, 2, 3, 4] * 2)

In [21]:
# Replace every 1 with NaN; the result is upcast to float to hold NaN.
ser1.replace(to_replace=1, value=np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [22]:
# Pairwise replacement: 1 -> 100 and 4 -> 400.
ser1.replace(to_replace=[1, 4], value=[100, 400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [23]:
# Replace several values (1 and 4) with the same single value.
ser1.replace(to_replace=[1, 4], value=100)

0    100
1      2
2      3
3    100
4    100
5      2
6      3
7    100
dtype: int64

In [25]:
# Dict form: each key is replaced by its value (here 4 -> NaN).
ser1.replace(to_replace={4: np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64

## Indexの変更

In [4]:
# 3x4 frame of the integers 0..11 with city codes as row labels.
df3 = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['NY', 'LA', 'SF'],
    columns=['A', 'B', 'C', 'D'],
)

In [30]:
# Display the frame with its original upper-case row labels.
df3

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [31]:
# Apply str.lower to every row label; this returns a new Index and leaves df3 as it is.
df3.index.map(str.lower)

Index(['ny', 'la', 'sf'], dtype='object')

In [5]:
# Assign the lower-cased labels back so the frame's index is actually changed.
df3.index = df3.index.map(str.lower)

In [6]:
# Display df3 with the lower-cased row labels.
df3

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


rename は変更後の新しい DataFrame を返す(元の df3 は変更されない)

In [9]:
# Title-case the row labels and lower-case the column labels.
# The renamed frame is returned as a new object.
df3.rename(
    index=str.title,
    columns=str.lower,
)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


inplace=True を指定すると、元のデータ自体を変更する

In [11]:
# Same renaming, applied to df3 itself. With inplace=True the call returns
# None, so this cell shows no output.
df3.rename(
    index=str.title,
    columns=str.lower,
    inplace=True,
)

In [12]:
# Display df3 to confirm the in-place rename took effect.
df3

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


## Binning(分類)

In [13]:
# Sample years to be sorted into bins.
years = [
    1990, 1991, 1992,
    2008, 2015, 1986,
    2013, 2008, 1999,
]

In [15]:
# Bin edges, one per decade boundary from 1960 to 2010.
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010]
# The original (misspelled) name is kept as an alias because later cells
# still refer to it.
decate_bins = decade_bins

In [16]:
# Assign each year to a decade interval. Intervals are right-closed, and
# years that fall outside the bin edges (e.g. 2013, 2015) come back as NaN.
decade_cat = pd.cut(x=years, bins=decate_bins)

In [17]:
# Display the categorical result: one interval (or NaN) per input year.
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], NaN, (1980, 1990], NaN, (2000, 2010], (1990, 2000]]
Categories (5, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010]]

In [18]:
# List the interval categories that pd.cut built from the bin edges.
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010]]
              closed='right',
              dtype='interval[int64]')

In [19]:
# Count how many years fall in each interval, most frequent first.
# The top-level pd.value_counts() is deprecated in recent pandas, so the
# categorical is wrapped in a Series and the method form is used instead.
pd.Series(decade_cat).value_counts()

(1990, 2000]    3
(2000, 2010]    2
(1980, 1990]    2
(1970, 1980]    0
(1960, 1970]    0
dtype: int64

In [20]:
# An integer `bins` asks for that many equal-width bins over the data's range;
# the lowest edge is pushed slightly below the minimum so it is included.
pd.cut(x=years, bins=2)

[(1985.971, 2000.5], (1985.971, 2000.5], (1985.971, 2000.5], (2000.5, 2015.0], (2000.5, 2015.0], (1985.971, 2000.5], (2000.5, 2015.0], (2000.5, 2015.0], (1985.971, 2000.5]]
Categories (2, interval[float64]): [(1985.971, 2000.5] < (2000.5, 2015.0]]