In [1]:
import pandas as pd
import numpy as np

## Series

In [2]:
obj = pd.Series([4,7,-5,3])
print("obj: ", obj)
print("index: ", obj.index)
print("index: ", obj.values)

obj:  0    4
1    7
2   -5
3    3
dtype: int64
index:  RangeIndex(start=0, stop=4, step=1)
index:  [ 4  7 -5  3]


### 创建Series

In [3]:
# 创建List
pd.Series(range(5))

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [4]:
# 通过ndarray
pd.Series(np.arange(5))

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [5]:
# 通过dict
pd.Series({1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e'})

1    a
2    b
3    c
4    d
5    e
dtype: object

In [6]:
# 构建索引
se = pd.Series(data=np.arange(5), index=['a','b','c','d','e'])
se

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [7]:
# ndarray
se.values

array([0, 1, 2, 3, 4])

In [8]:
# index
se.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

### 索引

In [9]:
# 自定义索引
obj2 = pd.Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])
print("\n obj2: ", obj2)

# 修改索引
obj2.index=['d', 'b', 'a', 'cc']
print("\n obj2: ", obj2)

# 可通过索引定位数据
print("\n obj2['a']: ", obj2['a'])

# 索引的数据是视图
obj2['a']=100
print("\n obj2['a']=100: ", obj2['a'])
print("obj2: ", obj2)



 obj2:  d    4
b    7
a   -5
c    3
dtype: int64

 obj2:  d     4
b     7
a    -5
cc    3
dtype: int64

 obj2['a']:  -5

 obj2['a']=100:  100
obj2:  d       4
b       7
a     100
cc      3
dtype: int64


In [10]:
# 一层字典初始化Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
print('\n obj3: ', obj3)

# 也可指定index, 数据无相应值为NaN
obj3 = pd.Series(sdata, index=['Ohio', 'Texas', 'Oregon', 'key1', 'Utah'])
print('\n obj3: ', obj3)



 obj3:  Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

 obj3:  Ohio      35000.0
Texas     71000.0
Oregon    16000.0
key1          NaN
Utah       5000.0
dtype: float64


## DataFrame

### 增

#### 创建DataFrame

In [33]:
# 通过list创建
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [34]:
# 通过ndarray创建
df = pd.DataFrame(np.arange(9).reshape(3,3))
df

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [2]:
# 通过dict创建
df = pd.DataFrame([
    {
        'A': 'a1',
        'B': 'b1',    
        'C': 'c1',        
        'D': 'd1'            
    },
    {
        'A': 'a2',
        'B': 'b2',    
        'C': 'c2',        
        'D': 'd2'            
    }    
])
df

Unnamed: 0,A,B,C,D
0,a1,b1,c1,d1
1,a2,b2,c2,d2


In [20]:
# 通过dict创建
df = pd.DataFrame({
    'A': 1,
    'B': np.arange(5),
    'C': ['r0','r1','r2','r3','r4']
})
df

Unnamed: 0,A,B,C
0,1,0,r0
1,1,1,r1
2,1,2,r2
3,1,3,r3
4,1,4,r4


In [15]:
# head
df.head(1)

Unnamed: 0,A,B,C
0,1,0,r0


In [16]:
# tail
df.tail(1)

Unnamed: 0,A,B,C
4,1,4,r4


### 删

In [61]:
d_df = df.copy()

#### 删除列

In [52]:
tmp_df = d_df[['A', 'B']]
# 删除列
d_df.drop(['A', 'B'], axis=1, inplace=True)
d_df.columns

Index(['C'], dtype='object')

In [57]:
# 新增删除的的列
d_df[['A','B']] = tmp_df
d_df.columns

Index(['A', 'B', 'C'], dtype='object')

#### 删除行数据

In [60]:
# 法一：基于条件过滤覆盖

In [63]:
# 法二：直接按照索引过滤
d_df.drop([0,1,2])

Unnamed: 0,A,B,C
3,1,3,r3
4,1,4,r4


### 改

In [8]:
u_df = df.copy()

#### 数据类型修改

In [4]:
# 法一
# 无效数据设置为空
u_df['A'] = pd.to_numeric(df['A'], downcast='float' ,errors='coerce')
u_df.dtypes

A    float32
B      int64
C     object
dtype: object

In [5]:
# 法二
u_df['A'] = df['A'].astype(int)
u_df.dtypes

A     int64
B     int64
C    object
dtype: object

#### 列名修改

In [11]:
u_df.rename(columns={'A': 'a', 'B': 'b', 'C': 'c'}, inplace=True)
print('columns: ', df.columns)
u_df.rename(columns={'a': 'A', 'b': 'B', 'c': 'C'}, inplace=True)

columns:  Index(['A', 'B', 'C'], dtype='object')


#### 数据修改

In [18]:
# 基于条件修改数据 法一
u_df.loc[u_df['B']==2, 'B'] = None

In [24]:
# 填充空值 法一
u_df['B'].fillna(2, inplace=True)

In [31]:
# 基于条件修改数据 法二
u_df['B'] = u_df['B'].apply(lambda x: None if x==2 else x)

In [34]:
# 填充空值 法二
u_df.loc[u_df['B'].isnull(), 'B'] = 2

### 查

#### 索引

In [17]:
# 构建DataFrame
country1 = pd.Series({'Name': '中国',
                    'Language': 'Chinese',
                    'Area': '9.597M km2',
                     'Happiness Rank': 79})

country2 = pd.Series({'Name': '美国',
                    'Language': 'English (US)',
                    'Area': '9.834M km2',
                     'Happiness Rank': 14})

country3 = pd.Series({'Name': '澳大利亚',
                    'Language': 'English (AU)',
                    'Area': '7.692M km2',
                     'Happiness Rank': 9})

df = pd.DataFrame([country1, country2, country3], index=['CH', 'US', 'AU'])
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,中国,Chinese,9.597M km2,79
US,美国,English (US),9.834M km2,14
AU,澳大利亚,English (AU),7.692M km2,9


In [18]:
# 定位'Name','Language'列
df[['Name','Language']]

Unnamed: 0,Name,Language
CH,中国,Chinese
US,美国,English (US)
AU,澳大利亚,English (AU)


#### loc与iloc

loc按着索引检索，iloc按照行号检索

In [19]:
# 定位索引为'CH'的行
df.loc['CH']

Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

In [30]:
# 定位第0行
df.iloc[0]

Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

In [27]:
# 定位前两列
df.iloc[:,0:2]

Unnamed: 0,Name,Language
CH,中国,Chinese
US,美国,English (US)
AU,澳大利亚,English (AU)


In [28]:
# 定位索引为‘Name’,'Language'的两列
df.loc[:, 'Name':'Language']

Unnamed: 0,Name,Language
CH,中国,Chinese
US,美国,English (US)
AU,澳大利亚,English (AU)


#### bool逻辑查询

In [32]:
df[(df['Name']=='中国') & (df['Language']=='Chinese')]

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,中国,Chinese,9.597M km2,79


## 参考

- [5 ways to apply an IF condition in pandas DataFrame](https://datatofish.com/if-condition-in-pandas-dataframe/)