# 10分で分かるpandas
## はじめに
この記事はpandas公式チュートリアル「10 minutes to pandas」の写経及び解説です

以下のURLを参考にしています
https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html


## 環境
- Python3.8
- Jupyter Lab


In [None]:
## とりあえずインポート


In [None]:
import numpy as np
import pandas as pd

In [47]:
np

<module 'numpy' from '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/numpy/__init__.py'>

In [48]:
pd

<module 'pandas' from '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/__init__.py'>

## [1. オブジェクトを作る](https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html#object-creation)

[Series](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series)クラスにリストを入れることで簡単にデータを作ることが出来ます。


In [55]:
# 簡単に一列作る
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### data_range()
[date_range()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html)を使うことで、特定の期間の日付の行を作成出来ます。

In [50]:
# 2020年1月１日から6日間のデータ
dates = pd.date_range("20200101", periods=6)
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [51]:
# 行インデックスに2020年1月1日からのデータを指定
# 各値にはランダムな数値を入れる
df = pd.DataFrame(np.random.randn(6, 4), index=dates)
df

Unnamed: 0,0,1,2,3
2020-01-01,-0.450454,0.160998,-0.271346,0.553023
2020-01-02,0.517373,0.56849,0.301283,-0.671569
2020-01-03,-1.071441,-1.21572,0.467567,-0.632119
2020-01-04,0.816366,0.79287,-1.173716,0.427666
2020-01-05,-2.067578,0.144527,-0.056876,0.522818
2020-01-06,-0.721893,-0.510241,-0.837046,-1.950327


In [53]:
# 列名ABCDを設定
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2020-01-01,1.531985,-1.069089,-0.309808,1.875026
2020-01-02,-0.01794,1.815473,-0.348716,0.360714
2020-01-03,1.538583,-0.198397,-0.806677,-1.890812
2020-01-04,0.287619,1.142648,0.158049,0.504292
2020-01-05,0.553192,0.86355,-0.690845,0.075127
2020-01-06,1.678275,-0.933814,0.353085,-0.896058


In [54]:
df2 = pd.DataFrame(
    {
        "A": 1.,
        "B": pd.Timestamp("20200101"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2020-01-01,1.0,3,test,foo
1,1.0,2020-01-01,1.0,3,train,foo
2,1.0,2020-01-01,1.0,3,test,foo
3,1.0,2020-01-01,1.0,3,train,foo


In [10]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
df.head(2)

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706


In [12]:
df.tail(2)

Unnamed: 0,A,B,C,D
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [13]:
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [15]:
df.to_numpy()

array([[-0.86670266, -0.5246742 , -0.78022815, -0.21186347],
       [ 0.3849976 ,  1.14981718, -1.02168934,  0.00870609],
       [ 1.56125162, -0.65902832,  2.59689358,  0.90701959],
       [-0.91577704,  0.89686942,  0.07969123,  0.39382864],
       [-0.3431028 ,  0.98457256,  0.19622196,  2.23772846],
       [ 0.5617101 , -0.5701328 ,  2.13899808,  0.70354844]])

In [16]:
df2.to_numpy()

array([[1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2020-01-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
df

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [18]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.063729,0.212904,0.534981,0.673161
std,0.957541,0.878467,1.50329,0.8725
min,-0.915777,-0.659028,-1.021689,-0.211863
25%,-0.735803,-0.558768,-0.565248,0.104987
50%,0.020947,0.186098,0.137957,0.548689
75%,0.517532,0.962647,1.653304,0.856152
max,1.561252,1.149817,2.596894,2.237728


In [19]:
df.T

Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06
A,-0.866703,0.384998,1.561252,-0.915777,-0.343103,0.56171
B,-0.524674,1.149817,-0.659028,0.896869,0.984573,-0.570133
C,-0.780228,-1.021689,2.596894,0.079691,0.196222,2.138998
D,-0.211863,0.008706,0.90702,0.393829,2.237728,0.703548


In [20]:
df.transpose()

Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06
A,-0.866703,0.384998,1.561252,-0.915777,-0.343103,0.56171
B,-0.524674,1.149817,-0.659028,0.896869,0.984573,-0.570133
C,-0.780228,-1.021689,2.596894,0.079691,0.196222,2.138998
D,-0.211863,0.008706,0.90702,0.393829,2.237728,0.703548


In [21]:
df.sort_index()

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [22]:
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2020-01-01,-0.211863,-0.780228,-0.524674,-0.866703
2020-01-02,0.008706,-1.021689,1.149817,0.384998
2020-01-03,0.90702,2.596894,-0.659028,1.561252
2020-01-04,0.393829,0.079691,0.896869,-0.915777
2020-01-05,2.237728,0.196222,0.984573,-0.343103
2020-01-06,0.703548,2.138998,-0.570133,0.56171


In [23]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2020-01-01,-0.211863,-0.780228,-0.524674,-0.866703
2020-01-02,0.008706,-1.021689,1.149817,0.384998
2020-01-03,0.90702,2.596894,-0.659028,1.561252
2020-01-04,0.393829,0.079691,0.896869,-0.915777
2020-01-05,2.237728,0.196222,0.984573,-0.343103
2020-01-06,0.703548,2.138998,-0.570133,0.56171


In [24]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-06,0.56171,-0.570133,2.138998,0.703548
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-02,0.384998,1.149817,-1.021689,0.008706


In [25]:
df.sort_values(by="2020-01-01", axis=1)

Unnamed: 0,A,C,B,D
2020-01-01,-0.866703,-0.780228,-0.524674,-0.211863
2020-01-02,0.384998,-1.021689,1.149817,0.008706
2020-01-03,1.561252,2.596894,-0.659028,0.90702
2020-01-04,-0.915777,0.079691,0.896869,0.393829
2020-01-05,-0.343103,0.196222,0.984573,2.237728
2020-01-06,0.56171,2.138998,-0.570133,0.703548


In [26]:
df["A"]

2020-01-01   -0.866703
2020-01-02    0.384998
2020-01-03    1.561252
2020-01-04   -0.915777
2020-01-05   -0.343103
2020-01-06    0.561710
Freq: D, Name: A, dtype: float64

In [27]:
df.A

2020-01-01   -0.866703
2020-01-02    0.384998
2020-01-03    1.561252
2020-01-04   -0.915777
2020-01-05   -0.343103
2020-01-06    0.561710
Freq: D, Name: A, dtype: float64

In [28]:
# 先頭4列表示
df[0:3]

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702


In [29]:
# 2020年1月2日から2020年1月4日まで表示
df['20200102':'20200104']

Unnamed: 0,A,B,C,D
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829


In [30]:
df.loc[dates]

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [31]:
df.loc[dates[0]]

A   -0.866703
B   -0.524674
C   -0.780228
D   -0.211863
Name: 2020-01-01 00:00:00, dtype: float64

In [32]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2020-01-01,-0.866703,-0.524674
2020-01-02,0.384998,1.149817
2020-01-03,1.561252,-0.659028
2020-01-04,-0.915777,0.896869
2020-01-05,-0.343103,0.984573
2020-01-06,0.56171,-0.570133


In [33]:
df.loc['20200102':'20200104', ['A', 'B']]

Unnamed: 0,A,B
2020-01-02,0.384998,1.149817
2020-01-03,1.561252,-0.659028
2020-01-04,-0.915777,0.896869


In [34]:
df.loc[dates[0], 'A']

-0.8667026620111296

In [35]:
df.at[dates[0], 'A']

-0.8667026620111296

In [36]:
df

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [37]:
df.iloc[3] # 4行目を1列として選択

A   -0.915777
B    0.896869
C    0.079691
D    0.393829
Name: 2020-01-04 00:00:00, dtype: float64

In [38]:
df.iloc[3:5, 0:2] # 4行目から5行目まで、1列目から2列目まで選択

Unnamed: 0,A,B
2020-01-04,-0.915777,0.896869
2020-01-05,-0.343103,0.984573


In [39]:
df.iloc[[1, 2, 4], [0, 2]] # 2行目、3行目、5行目、1列目、3列目を選択

Unnamed: 0,A,C
2020-01-02,0.384998,-1.021689
2020-01-03,1.561252,2.596894
2020-01-05,-0.343103,0.196222


In [40]:
df.iloc[1:3, :] # 2行目から3行目を全列選択


Unnamed: 0,A,B,C,D
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702


In [41]:
df.iloc[:, 1:3] # 2列目から3列目を善行選択

Unnamed: 0,B,C
2020-01-01,-0.524674,-0.780228
2020-01-02,1.149817,-1.021689
2020-01-03,-0.659028,2.596894
2020-01-04,0.896869,0.079691
2020-01-05,0.984573,0.196222
2020-01-06,-0.570133,2.138998


In [42]:
df.iloc[1, 1]

1.1498171779772

In [43]:
df.iat[1, 1]

1.1498171779772

In [44]:
df

Unnamed: 0,A,B,C,D
2020-01-01,-0.866703,-0.524674,-0.780228,-0.211863
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-04,-0.915777,0.896869,0.079691,0.393829
2020-01-05,-0.343103,0.984573,0.196222,2.237728
2020-01-06,0.56171,-0.570133,2.138998,0.703548


In [45]:
df[df["A"] > 0] # A列のデータが0を超えている行を選択する

Unnamed: 0,A,B,C,D
2020-01-02,0.384998,1.149817,-1.021689,0.008706
2020-01-03,1.561252,-0.659028,2.596894,0.90702
2020-01-06,0.56171,-0.570133,2.138998,0.703548
