# Pandas Series Cheat Sheet

## Series(配列、辞書)の宣言

In [10]:
import pandas as pd
import numpy as np

# A Series built from a plain list gets a default integer index (0..n-1),
# as the displayed output shows.
obj = pd.Series(data=[4, 7, 5, 1])
obj

0    4
1    7
2    5
3    1
dtype: int64

### indexを指定するやりかた

In [15]:
# Pass explicit labels through `index` to replace the default integer index.
obj_index = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))

obj_index

a    1
b    2
c    3
d    4
dtype: int64

## Seriesから値だけを取り出す

In [11]:
# .values exposes the underlying data as a NumPy array, without the index.
print(obj.values)

[4 7 5 1]


## 指定したindexの要素を取り出す

In [12]:
# A list of labels selects several elements at once. `obj` carries the default
# integer index, so the labels requested here are 0, 1 and 3.
print(obj.loc[[0, 1, 3]])

0    4
1    7
3    1
dtype: int64


### その2（values の配列に対して位置やブール条件で取り出す）

In [13]:
# Index the underlying NumPy array with a list of positions; the result is a
# plain array, so the Series index is not part of the output.
print(obj.values[[0,1,3]])

[4 7 1]


In [17]:
# Keep only the elements greater than 4. Filtering the Series with the boolean
# mask first and then taking .values yields the same array of matching values.
print(obj[obj > 4].values)

[7 5]


## スカラ値との掛け算

In [20]:
# Arithmetic with a scalar is applied to every element; the index is kept.
obj*2

0     8
1    14
2    10
3     2
dtype: int64

## 指数への変換（全体への計算適用例）

In [26]:
import numpy as np
# A NumPy function applied to a Series works element-wise and returns a Series
# with the same index (dtype becomes float64 here).
np.exp(obj)

0      54.598150
1    1096.633158
2     148.413159
3       2.718282
dtype: float64

## 辞書としてのSeries

In [28]:
# `in` tests membership against the index labels (like dict keys), not the values.
# A notebook cell only echoes its last expression, so a bare `1 in obj` would be
# evaluated and silently thrown away; asserting it makes the check effective.
assert 1 in obj
# Label lookup, analogous to dict[key].
obj[1]

7

# Pandas DataFrame Cheat Sheet

In [29]:
import numpy as np
import pandas as pd

## データの準備(宣言)

In [30]:
# Column-oriented input: each dict key becomes a column and each list supplies
# that column's values, one entry per row.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

In [48]:
# Display the frame; no index was given, so rows are labelled 0..5.
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [51]:
# `columns` fixes the column order; 'debt' is not a key of `data`, so that
# column is created with missing values. `index` supplies the row labels.
frame2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

## 列単位の参照

In [37]:
# Attribute-style column access; the result is a Series.
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [63]:
# Bracket access returns the same column as a Series. Unlike attribute access it
# works for every column name: e.g. `frame.pop` resolves to the DataFrame.pop
# method, not to the 'pop' column.
frame['year']

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

## 行単位の参照

In [64]:
# Select rows by label with .loc; pass a list to pick several rows at once.
frame2.loc[['one', 'two']]

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,


In [None]:
# To select rows by integer position rather than by label, use iloc.

## 列単位の挿入

In [69]:
# Assigning a scalar to a column fills every row with that value.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [71]:
# Assigning an array fills the column element by element; its length has to
# match the number of rows (6 here).
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


# Pandas の統計処理

## データの準備

In [72]:
import pandas as pd
import numpy as np

# Small frame containing missing values (NaN), used to show how the statistics
# in the following cells treat them.
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


## 各種計算

### 合計

In [78]:
# Default axis: one total per column; NaN entries are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [86]:
# One total per row: sum across the columns.
# axis=1 and axis='columns' are equivalent. A row that is entirely NaN sums to 0.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

### 累積和

In [88]:
# Running total down each column. NaN positions stay NaN in the result, but the
# running total continues past them.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [90]:
# Running total across the columns of each row.
df.cumsum(axis='columns')

Unnamed: 0,one,two
a,1.4,
b,7.1,2.6
c,,
d,0.75,-0.55


In [None]:
###

## Tips

### NAの扱いについて

In [84]:
# NaN values are skipped by default. Pass skipna=False to disable that, so any
# row containing a NaN yields NaN.
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

### 要約統計量をまとめて表示

In [92]:
# Case 1: numeric data.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [94]:
# For numeric columns describe() reports count, mean, std, min, quartiles and
# max; the count row shows that NaN entries are excluded.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [95]:
# Case 2: non-numeric data — the letters a..d repeated four times (16 values).
# Note that this rebinds `obj`, which earlier cells used for a numeric Series.
obj = pd.Series(list('abcd') * 4)
obj

0     a
1     b
2     c
3     d
4     a
5     b
6     c
7     d
8     a
9     b
10    c
11    d
12    a
13    b
14    c
15    d
dtype: object

In [96]:
# For non-numeric data describe() reports count, number of unique values, the
# most frequent value (top) and its frequency. Every value occurs 4 times here,
# so `top` is just one of the tied values.
obj.describe()

count     16
unique     4
top        c
freq       4
dtype: object

# データの集約とGroup演算

## データの準備

In [99]:
# Two key columns to group on plus two columns of random data.
# NOTE(review): no random seed is set, so data1/data2 — and every result derived
# from them — differ on each run; consider seeding NumPy before this cell.
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.325221,-0.198044
1,a,two,-3.104718,0.329738
2,b,one,-1.73282,0.699017
3,b,two,0.12721,0.240838
4,a,one,-0.918795,2.044021


In [108]:
# Group the data1 values by the key1 column. Nothing is aggregated yet; this
# only builds the grouped object that later cells call mean() on.
# NOTE(review): an assignment produces no output, so any output displayed under
# this cell presumably comes from an earlier version of it — re-run to refresh.
groupd = df.groupby('key1')['data1']


key1
a   -1.782911
b   -0.802805
Name: data1, dtype: float64


In [109]:
# Mean of data1 within each key1 group; the result is a Series indexed by key1.
print(groupd.mean())

key1
a   -1.782911
b   -0.802805
Name: data1, dtype: float64


In [110]:
# Pick the mean of a single group by its key1 label.
print(groupd.mean().loc['a'])

-1.7829113530498735


In [112]:
# Per-group mean of data1 as a Series indexed by key1. The aggregation method
# is `mean` (the misspelling `mead` raises AttributeError); its result is
# already a Series, so no extra pd.Series(...) wrapper is needed.
ns = groupd.mean()