# Pandas中的13个技巧 

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#计算数据缺失量" data-toc-modified-id="计算数据缺失量-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>计算数据缺失量</a></span></li><li><span><a href="#获取分组最大值所在行" data-toc-modified-id="获取分组最大值所在行-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>获取分组最大值所在行</a></span></li><li><span><a href="#多列合并为一行" data-toc-modified-id="多列合并为一行-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>多列合并为一行</a></span></li><li><span><a href="#删除包含特定字符串所在的行" data-toc-modified-id="删除包含特定字符串所在的行-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>删除包含特定字符串所在的行</a></span></li><li><span><a href="#组内排序" data-toc-modified-id="组内排序-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>组内排序</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd

## 计算数据缺失量

In [5]:
def missing_cal(df: pd.DataFrame) -> pd.DataFrame:
    """Return the fraction of missing values in each column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Data set to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with two columns: ``col`` (the column
        label) and ``missing_pct`` (null count divided by the number of
        rows, i.e. a fraction, not a percentage).  Rows are sorted by
        ``missing_pct`` in descending order and re-indexed from 0.
    """
    missing_series = df.isnull().sum() / df.shape[0]
    # Build the two output columns explicitly.  Going through
    # reset_index() + rename({'index': 'col'}) only works while the column
    # axis is unnamed: when ``df.columns.name`` is set, reset_index() uses
    # that name instead of 'index' and the rename silently does nothing.
    missing_df = pd.DataFrame({
        'col': missing_series.index,
        'missing_pct': missing_series.to_numpy(),
    })
    missing_df = missing_df.sort_values('missing_pct', ascending=False)
    return missing_df.reset_index(drop=True)

In [6]:
# Read the CSV at ../data/titanic_train.csv into ``df`` and display the
# per-column missing-value ratio computed by missing_cal().
# NOTE(review): the relative path presumably resolves against the notebook's
# working directory -- confirm the data folder layout before running.
file = r'../data/titanic_train.csv'
df = pd.read_csv(file)
missing_cal(df)

Unnamed: 0,col,missing_pct
0,Cabin,0.771044
1,Age,0.198653
2,Embarked,0.002245
3,PassengerId,0.0
4,Survived,0.0
5,Pclass,0.0
6,Name,0.0
7,Sex,0.0
8,SibSp,0.0
9,Parch,0.0


## 获取分组最大值所在行 

In [15]:
# Sample data: 'Mt' is the grouping key, 'Count' is the value to maximise.
df = pd.DataFrame({'Sp':['a','b','c','d','e','f'], 
                   'Mt':['s1', 's1', 's2','s2','s2','s3'], 
                   'Value':[1,2,3,4,5,6], 
                   'Count':[3,2,5,10,10,6]})
# For every 'Mt' group, pick the row whose 'Count' is largest (the first one
# on ties).  idxmax() returns index *labels*, so the rows must be selected
# with label-based .loc; positional .iloc only gives the right rows while the
# index happens to be the default 0..n-1 range.
df.loc[df.groupby('Mt')['Count'].idxmax()]

Unnamed: 0,Sp,Mt,Value,Count
0,a,s1,1,3
3,d,s2,4,10
5,f,s3,6,6


## 多列合并为一行

In [17]:
# Sample frame written one record per line; the first and last records
# share the same v_id ('d1').
df = pd.DataFrame(
    [
        ('a', 0.1, 'women', 'd1'),
        ('b', 0.2, 'man', 'd2'),
        ('c', 0.3, 'cat', 'd3'),
        ('d', 0.4, 'dog', 'd1'),
    ],
    columns=['id_part', 'pred', 'pred_class', 'v_id'],
)
df

Unnamed: 0,id_part,pred,pred_class,v_id
0,a,0.1,women,d1
1,b,0.2,man,d2
2,c,0.3,cat,d3
3,d,0.4,dog,d1


In [18]:
# Collapse all rows that share a v_id into a single row:
#   pred_class -> the group's values joined with ', '
#   pred       -> the group's values collected into a list
#   id_part    -> the first value in the group
# The aggregation spec is kept exactly as is (list-wrapped join, lambda) so
# the resulting two-level column labels stay the same.
(
    df.groupby(['v_id'])
    .agg({
        'pred_class': [', '.join],
        'pred': lambda x: list(x),
        'id_part': 'first',
    })
    .reset_index()
)

Unnamed: 0_level_0,v_id,pred_class,pred,id_part
Unnamed: 0_level_1,Unnamed: 1_level_1,join,<lambda>,first
0,d1,"women, dog","[0.1, 0.4]",a
1,d2,man,[0.2],b
2,d3,cat,[0.3],c


## 删除包含特定字符串所在的行

In [19]:
# Sample frame written one record per line; column 'b' holds the strings
# that the filter is applied to.
df = pd.DataFrame(
    [
        (1, 's1', 5, 3),
        (2, 'exp_s2', 6, 2),
        (3, 's3', 7, 5),
        (4, 'exps4', 8, 10),
    ],
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
0,1,s1,5,3
1,2,exp_s2,6,2
2,3,s3,7,5
3,4,exps4,8,10


In [23]:
# Keep only the rows whose 'b' value does NOT contain the substring 'exp'.
# regex=False: treat the pattern as literal text (by default it is compiled
#   as a regular expression, so characters like '.' or '(' would not match
#   themselves).
# na=False: count missing values as "no match" so the mask stays boolean and
#   can be inverted with ~.
df[~df['b'].str.contains('exp', regex=False, na=False)]

Unnamed: 0,a,b,c,d
0,1,s1,5,3
2,3,s3,7,5


## 组内排序