<a href="https://colab.research.google.com/github/devmatsuko/python-practice/blob/main/section4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lecture14:Series（1次元のデータ列）


In [1]:
import pandas as pd
from pandas import Series

In [2]:
# アレイとシリーズの違いはシリーズにはindexがついている。
obj = Series([3,6,9,12])

In [3]:
obj

0     3
1     6
2     9
3    12
dtype: int64

In [4]:
# シリーズ内の値のみを取得
obj.values

array([ 3,  6,  9, 12])

In [5]:
# シリーズ内のindexのみ取得
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# indexに文字列を指定することもできる
ww2_cas = Series([8700000,4300000,3000000,2100000,4000000], index=['USSR','Germany','China','Japan','USA'])

In [7]:
ww2_cas

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA        4000000
dtype: int64

In [9]:
# 参照時に個別にindexを指定し、値を取得
ww2_cas['USA']

4000000

In [10]:
# []内に条件を指定することもできる。
ww2_cas[ww2_cas>4000000]

USSR       8700000
Germany    4300000
dtype: int64

In [11]:
ww2_cas>4000000

USSR        True
Germany     True
China      False
Japan      False
USA        False
dtype: bool

In [13]:
# USSRが含まれているかを判定する
'USSR' in ww2_cas

True

In [16]:
# Series配列を辞書型に変換
ww2_dict = ww2_cas.to_dict()

In [17]:
ww2_dict

{'China': 3000000,
 'Germany': 4300000,
 'Japan': 2100000,
 'USA': 4000000,
 'USSR': 8700000}

In [18]:
# 辞書型の配列をSeries配列に変換
ww2_Series = Series(ww2_dict)

In [19]:
ww2_Series

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA        4000000
dtype: int64

In [20]:
countries = ['China','Germany','Japan','USA','USSR','Argentina']

In [21]:
# 上述で作成した辞書型配列のindexをcountriesにする
obj2 = Series(ww2_dict, index=countries)

In [22]:
# アルゼンチンのデータはないのでNaNになる
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA          4000000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [24]:
# nullがTrue
pd.isnull(obj2)

China        False
Germany      False
Japan        False
USA          False
USSR         False
Argentina     True
dtype: bool

In [25]:
# null以外がTrue
pd.notnull(obj2)

China         True
Germany       True
Japan         True
USA           True
USSR          True
Argentina    False
dtype: bool

In [26]:
ww2_Series

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA        4000000
dtype: int64

In [27]:
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA          4000000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [28]:
# indexが揃っている部分は足算される
ww2_Series + obj2

Argentina           NaN
China         6000000.0
Germany       8600000.0
Japan         4200000.0
USA           8000000.0
USSR         17400000.0
dtype: float64

In [29]:
# シリーズ配列に名前をつける
obj2.name = "第二次世界大戦の死傷者"

In [30]:
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA          4000000.0
USSR         8700000.0
Argentina          NaN
Name: 第二次世界大戦の死傷者, dtype: float64

In [31]:
# indexに名前をつける
obj2.index.name = 'Countries'

In [32]:
obj2

Countries
China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA          4000000.0
USSR         8700000.0
Argentina          NaN
Name: 第二次世界大戦の死傷者, dtype: float64

# Lecture15:DataFrame

In [7]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [38]:
# クリップボードの中身をデータフレームとして格納
# GoogleColabでは実行不可
nfl_frame = pd.read_clipboard()

PyperclipException: ignored

In [1]:
data = {'City' : ['SF','LA','NWC'], 'Population':[837000,3880000,8400000]}

In [5]:
data

{'City': ['SF', 'LA', 'NWC'], 'Population': [837000, 3880000, 8400000]}

In [8]:
# 辞書型の配列をデータフレーム型に変更する
city_frame = DataFrame(data)

In [9]:
city_frame

Unnamed: 0,City,Population
0,SF,837000
1,LA,3880000
2,NWC,8400000


# Lecture16:indexの基本

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
my_ser = Series([1,2,3,4], index=['A','B','C','D'])

In [3]:
my_ser

A    1
B    2
C    3
D    4
dtype: int64

In [4]:
# indexだけ取り出して取得
my_index = my_ser.index

In [5]:
my_index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [6]:
my_index[0]

'A'

In [7]:
my_index[2:]

Index(['C', 'D'], dtype='object')

In [8]:
# pandasのindexは個別に変更することはできない
my_index[0] = 'Z'

TypeError: ignored

# Lecture17:indexを変える

In [9]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import randn

In [10]:
ser1 = Series([1,2,3,4], index = ['A','B','C','D'])

In [11]:
ser1

A    1
B    2
C    3
D    4
dtype: int64

In [12]:
# インデックスを再割り当てする。indexが増えたら値にNULLが自動的に入る
ser2 = ser1.reindex(['A','B','C','D','E','F'])

In [13]:
ser2

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
dtype: float64

In [14]:
# 新しく追加されたindexの値にNULLではなく0を代入する
ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
G    0.0
dtype: float64

In [15]:
ser3 = Series(['USA','Mexico','Canada'], index=[0,5,10])

In [16]:
ser3

0        USA
5     Mexico
10    Canada
dtype: object

In [18]:
# indexを１５まで増やし、ForwordFillにより、下記のように値が代入される
ser3.reindex(range(15), method='ffill')

0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [19]:
dframe = DataFrame(randn(25).reshape((5,5)), index=['A','B','D','E','F'], columns=['col1','col2','col3','col4','col5'])

In [21]:
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,1.798281,-0.853011,-1.994342,-0.456764,0.817262
B,-0.956073,-0.200918,-0.967874,-0.698836,1.127923
D,0.975924,0.813505,-0.360992,0.567961,0.956236
E,-1.542524,-0.768814,-0.354886,0.289908,-1.418927
F,1.590327,0.288436,-0.349002,-3.503006,-0.618537


In [25]:
# 新しいindexを作成し、データフレームに割り当てる
new_index=['A','B','C','D','E','F']
dframe2 = dframe.reindex(new_index)

In [23]:
dframe2

Unnamed: 0,col1,col2,col3,col4,col5
A,1.798281,-0.853011,-1.994342,-0.456764,0.817262
B,-0.956073,-0.200918,-0.967874,-0.698836,1.127923
C,,,,,
D,0.975924,0.813505,-0.360992,0.567961,0.956236
E,-1.542524,-0.768814,-0.354886,0.289908,-1.418927
F,1.590327,0.288436,-0.349002,-3.503006,-0.618537


In [26]:
# 新しい列を作成し、データフレームに割り当てる
new_columns = ['col1','col2','col3','col4','col5','col6']
dframe2.reindex(columns=new_columns)

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,1.798281,-0.853011,-1.994342,-0.456764,0.817262,
B,-0.956073,-0.200918,-0.967874,-0.698836,1.127923,
C,,,,,,
D,0.975924,0.813505,-0.360992,0.567961,0.956236,
E,-1.542524,-0.768814,-0.354886,0.289908,-1.418927,
F,1.590327,0.288436,-0.349002,-3.503006,-0.618537,


In [32]:
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,1.798281,-0.853011,-1.994342,-0.456764,0.817262
B,-0.956073,-0.200918,-0.967874,-0.698836,1.127923
D,0.975924,0.813505,-0.360992,0.567961,0.956236
E,-1.542524,-0.768814,-0.354886,0.289908,-1.418927
F,1.590327,0.288436,-0.349002,-3.503006,-0.618537


# Lecture18:行や列を削除する

In [41]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [36]:
ser1 = Series(np.arange(3), index=['a','b','c'])

In [37]:
ser1

a    0
b    1
c    2
dtype: int64

In [38]:
# 行の削除
ser1.drop('b')

a    0
c    2
dtype: int64

In [43]:
dframe1 = DataFrame(np.arange(9).reshape((3,3)), index=['SF','LA','NY'], columns=['pop', 'size', 'year'])

In [44]:
dframe1

Unnamed: 0,pop,size,year
SF,0,1,2
LA,3,4,5
NY,6,7,8


In [46]:
# 行の削除
dframe1.drop('LA')

Unnamed: 0,pop,size,year
SF,0,1,2
NY,6,7,8


In [47]:
# 列の削除、axis=1は列の削除、axis=0は行の削除
dframe1.drop('year',axis=1)

Unnamed: 0,pop,size
SF,0,1
LA,3,4
NY,6,7


# Lecture19:データを取り出す

In [48]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [49]:
ser1 = Series(np.arange(3), index=['A','B','C'])

In [50]:
ser1 = 2*ser1

In [51]:
ser1

A    0
B    2
C    4
dtype: int64

In [52]:
ser1['B']

2

In [53]:
ser1[1]

2

In [54]:
ser1[0:2]

A    0
B    2
dtype: int64

In [55]:
ser1[['A','B']]

A    0
B    2
dtype: int64

In [56]:
ser1[ser1>3]

C    4
dtype: int64

In [57]:
# 条件に当てはまる値のみを変更する
ser1[ser1>3] = 10

In [58]:
ser1

A     0
B     2
C    10
dtype: int64

In [59]:
dframe = DataFrame(np.arange(25).reshape((5,5)), index=['NYC','LA','SF','DC','Chi'], columns=['A','B','C','D','E'])

In [60]:
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [61]:
dframe['B']

NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int64

In [62]:
dframe[['B','D']]

Unnamed: 0,B,D
NYC,1,3
LA,6,8
SF,11,13
DC,16,18
Chi,21,23


In [65]:
dframe[dframe['C']>8]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [66]:
dframe > 10

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
Chi,True,True,True,True,True


# Lecture20:形の違うデータの計算

In [68]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [69]:
ser1 = Series([0,1,2], index=['A','B','C'])

In [70]:
ser1

A    0
B    1
C    2
dtype: int64

In [71]:
ser2 = Series([3,4,5,6], index=['A','B','C','D'])

In [72]:
ser2

A    3
B    4
C    5
D    6
dtype: int64

In [73]:
# ser1にはD行がないのでNaNになる
ser1 + ser2

A    3.0
B    5.0
C    7.0
D    NaN
dtype: float64

In [74]:
dframe1 = DataFrame(np.arange(4).reshape((2,2)), columns=list('AB'), index=['NYC','LA'])

In [75]:
dframe1

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [80]:
dframe2 = DataFrame(np.arange(9).reshape((3,3)),columns=list('ADC'), index=['NYC','SF','LA'])

In [81]:
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


In [82]:
dframe1 + dframe2

Unnamed: 0,A,B,C,D
LA,8.0,,,
NYC,0.0,,,
SF,,,,


In [83]:
# dframe2のうち、dframe1と　共通するところ意外は0を穴埋めして計算する
dframe1.add(dframe2, fill_value=0)

Unnamed: 0,A,B,C,D
LA,8.0,3.0,8.0,7.0
NYC,0.0,1.0,2.0,1.0
SF,3.0,,5.0,4.0


In [84]:
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


# Lecture21:データの並び替えと順番

In [85]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [86]:
ser1 = Series(range(3), index=['C', 'A', 'B'])

In [87]:
ser1

C    0
A    1
B    2
dtype: int64

In [89]:
# indexでソートする
ser1.sort_index()

A    1
B    2
C    0
dtype: int64

In [92]:
ser1

C    0
A    1
B    2
dtype: int64

In [94]:
# 値でソートする
ser1.sort_values()

C    0
A    1
B    2
dtype: int64

In [95]:
from numpy.random import randn

In [97]:
ser2 = Series(randn(10))

In [98]:
ser2

0   -1.053477
1    0.162073
2    1.433453
3   -0.313250
4    1.161542
5    1.229998
6   -1.720683
7   -0.396985
8   -0.072989
9   -0.027500
dtype: float64

In [100]:
# 値でソートした場合に何番目に来るかを表示する
ser2.rank()

0     2.0
1     7.0
2    10.0
3     4.0
4     8.0
5     9.0
6     1.0
7     3.0
8     5.0
9     6.0
dtype: float64

In [104]:
# ソートされた結果を元のデータに反映させる
ser2.sort_values(inplace=True)

In [105]:
ser2

6   -1.720683
0   -1.053477
7   -0.396985
3   -0.313250
8   -0.072989
9   -0.027500
1    0.162073
4    1.161542
5    1.229998
2    1.433453
dtype: float64

In [106]:
ser2.rank()

6     1.0
0     2.0
7     3.0
3     4.0
8     5.0
9     6.0
1     7.0
4     8.0
5     9.0
2    10.0
dtype: float64

# Lecture22:データの統計量

In [107]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [108]:
arr = np.array([[1,2,np.nan], [np.nan,3,4]])

In [109]:
arr

array([[ 1.,  2., nan],
       [nan,  3.,  4.]])

In [110]:
dframe1 = DataFrame(arr, index=['A','B'], columns=['One','Two','Three'])

In [111]:
dframe1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [112]:
# 列方向の和を算出
dframe1.sum()

One      1.0
Two      5.0
Three    4.0
dtype: float64

In [113]:
# 行方向の和を算出
dframe1.sum(axis=1)

A    3.0
B    7.0
dtype: float64

In [114]:
# 列ごとの最小値を算出
dframe1.min()

One      1.0
Two      2.0
Three    4.0
dtype: float64

In [116]:
# 最小値がどのindexか表示する
dframe1.idxmin()

One      A
Two      A
Three    B
dtype: object

In [117]:
dframe1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [119]:
# 累積を計算する
dframe1.cumsum()

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,5.0,4.0


In [120]:
# 各種統計値を表示する
dframe1.describe()

Unnamed: 0,One,Two,Three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0
