# Series & DataFrame

In [2]:
# Create a Series; np.nan marks a missing entry, and the result is stored as float64
import numpy as np
import pandas as pd

s = pd.Series(data=[1, 3, 6, 33, np.nan, 7])
print(s)

0     1.0
1     3.0
2     6.0
3    33.0
4     NaN
5     7.0
dtype: float64


In [8]:
# Create a DataFrame
import numpy as np
import pandas as pd

# Six consecutive daily timestamps starting 2023-03-21; used as the row index
dates = pd.date_range("20230321", periods=6)
print(dates)  # was print(date): an undefined name that raised NameError

# 6x4 frame of standard-normal draws (unseeded, so the values differ on every run)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=["A", "B", "C", "D"])
print(df)

DatetimeIndex(['2023-03-21', '2023-03-22', '2023-03-23', '2023-03-24',
               '2023-03-25', '2023-03-26'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2023-03-21 -1.378532  0.627565 -0.422249 -0.401346
2023-03-22 -0.231084 -0.251911  0.341865  0.944841
2023-03-23  0.278093  0.669009 -0.109846 -0.870076
2023-03-24  0.743197 -1.809101  1.280765 -1.616019
2023-03-25  1.054189  0.068271 -0.431593  0.068166
2023-03-26  1.168547  1.417183  0.070009  0.515533


In [24]:
# DataFrame features: construction, metadata, summary statistics, transpose and sorting
import numpy as np
import pandas as pd

# 3x4 frame holding the integers 0..11 with default integer row/column labels
df0 = pd.DataFrame(np.arange(12).reshape((3, 4)))

# Each keyword becomes a column; scalar values are repeated for every row of the
# four-row index supplied by column "C"
df1 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

# One print call with a newline separator emits the same text as printing each
# item on its own
print(
    df0,
    df1.dtypes,  # per-column dtype
    df1.index,  # row labels
    df1.columns,  # column labels
    df1.values,  # underlying data as a NumPy array
    df1.describe(),  # summary statistics of the numeric columns
    df1.T,  # transpose
    df1.sort_index(axis=1, ascending=False),  # columns in descending label order
    df1.sort_values(by="E"),  # rows ordered by column E, so equal values sit together
    sep="\n",
)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
                     0                    1                    2  \
A                  1.0                  1.0                  1.0   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                  1.0                  1.0                  1.0   
D       

# DataFrame選擇

In [2]:
import numpy as np
import pandas as pd

dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

# loc: select by label
print(
    df.loc["20130102"],  # one row, looked up by its date label
    df.loc[:, ["A", "B"]],  # every row, columns A and B only
    df.loc["20130102", ["A", "B"]],  # one row restricted to columns A and B
    sep="\n",
)

# iloc: select by integer position
print(
    df.iloc[3],  # fourth row
    df.iloc[3, 1],  # single cell: fourth row, second column
    df.iloc[3:5, :2],  # rows 3-4 and columns 0-1 (slice end is exclusive)
    df.iloc[[1, 2, 4], [0, 2]],  # explicit lists of row and column positions
    sep="\n",
)

# Boolean mask: keep only the rows where column A is positive
print(df[df["A"] > 0])

A    1.635638
B    1.706587
C    1.772408
D    1.286888
Name: 2013-01-02 00:00:00, dtype: float64
                   A         B
2013-01-01 -0.125622  0.363997
2013-01-02  1.635638  1.706587
2013-01-03  0.124642 -0.754653
2013-01-04  1.012108  0.108630
2013-01-05  0.718474  0.310616
2013-01-06  0.289073  0.401007
A    1.635638
B    1.706587
Name: 2013-01-02 00:00:00, dtype: float64
A    1.012108
B    0.108630
C    1.948809
D    1.094012
Name: 2013-01-04 00:00:00, dtype: float64
0.10863017413572437
                   A         B
2013-01-04  1.012108  0.108630
2013-01-05  0.718474  0.310616
                   A         C
2013-01-02  1.635638  1.772408
2013-01-03  0.124642 -0.105290
2013-01-05  0.718474 -0.897945
                   A         B         C         D
2013-01-02  1.635638  1.706587  1.772408  1.286888
2013-01-03  0.124642 -0.754653 -0.105290  0.327883
2013-01-04  1.012108  0.108630  1.948809  1.094012
2013-01-05  0.718474  0.310616 -0.897945  0.721696
2013-01-06  0.289073  0.4

# DataFrame值更改

In [9]:
import pandas as pd
import numpy as np

dates = pd.date_range('20230101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

df.iloc[2, 2] = 1111  # assign by position (third row, third column)
df.loc['2023-01-03', 'D'] = 2222  # assign by label
# Conditional assignment through one .loc call so the write lands on df itself.
# The chained form df.A[df.A > 0] = 0 assigns into an intermediate Series: it
# triggers SettingWithCopyWarning and leaves df unchanged under copy-on-write.
df.loc[df.A > 0, 'A'] = 0
df['F'] = np.nan  # assigning to a new key adds a column
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)  # values are aligned on the index
print(df)

                   A         B            C            D   F  G
2023-01-01 -2.092285 -0.704764     0.419119     0.485042 NaN  1
2023-01-02  0.000000 -0.402873     1.296465     0.590669 NaN  2
2023-01-03 -1.191727  0.654061  1111.000000  2222.000000 NaN  3
2023-01-04 -0.921407  0.686227     0.146706     1.027121 NaN  4
2023-01-05 -1.303773 -1.494584    -0.879177     0.716583 NaN  5
2023-01-06 -1.480565  0.940286     0.085198     0.283437 NaN  6
