In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
# Build a 5x3 table of random integer scores in [0, 100):
# one row per student, one column per subject.
score = DataFrame(
    np.random.randint(0, 100, size=(5, 3)),
    index=['tom', 'jack', 'mery', 'lucy', 'lili'],
    columns=['python', 'java', 'php'],
)
score

Unnamed: 0,python,java,php
tom,60,53,59
jack,28,7,67
mery,95,30,24
lucy,48,21,66
lili,7,76,26


- score.drop 删除一行或一列  
    - labels 指定删除的行或列标签(也可以是列表)
    - axis 改变删除的方向
- score.drop_duplicates 删除重复元素
    - keep 'first','last',保留第一个或者最后一个
- score.dropna 删除空值
    - axis 轴向
    - how 'any','all',有一个就删除或者全部为nan才删除

In [4]:
# Drop a column: returns a new frame without 'python' (score is not reassigned here)
score.drop(columns='python')

Unnamed: 0,java,php
tom,53,59
jack,7,67
mery,30,24
lucy,21,66
lili,76,26


In [5]:
# Drop a row: returns a new frame without the 'tom' row (score is not reassigned here)
score.drop(index='tom')

Unnamed: 0,python,java,php
jack,28,7,67
mery,95,30,24
lucy,48,21,66
lili,7,76,26


In [7]:
# Overwrite the 'lucy' and 'lili' rows with tom's values so the frame
# contains duplicated rows for the de-duplication examples.
score.loc['lucy'] = score.loc['tom']
score.loc['lili'] = score.loc['tom']

In [12]:
# Flag duplicated rows; with keep='last' the final occurrence is left unflagged
index = score.duplicated(keep='last')

In [15]:
# Row labels of the rows flagged as duplicates by the boolean mask
drop_index = score[index].index
# Remove those rows by label; the result is a new frame
score.drop(index=drop_index)

Unnamed: 0,python,java,php
jack,28,7,67
mery,95,30,24
lili,60,53,59


In [17]:
# Drop duplicated rows in a single call, keeping the last occurrence of each
score.drop_duplicates(keep='last')

Unnamed: 0,python,java,php
jack,28,7,67
mery,95,30,24
lili,60,53,59


In [18]:
# Drop rows that contain missing values; the displayed frame has none, so all rows remain
score.dropna()

Unnamed: 0,python,java,php
tom,60,53,59
jack,28,7,67
mery,95,30,24
lucy,60,53,59
lili,60,53,59


In [20]:
# Introduce a single missing value (jack's python score) for the dropna examples
score.loc['jack','python'] = None

In [24]:
# Drop only columns whose values are ALL missing; 'python' has just one NaN, so it is kept
score.dropna(axis=1,how='all')

Unnamed: 0,python,java,php
tom,60.0,53,59
jack,,7,67
mery,95.0,30,24
lucy,60.0,53,59
lili,60.0,53,59


In [25]:
# Add a new subject column filled with random integer scores in [0, 100)
score['c++'] = np.random.randint(100, size=5)
score

Unnamed: 0,python,java,php,c++
tom,60.0,53,59,29
jack,,7,67,9
mery,95.0,30,24,62
lucy,60.0,53,59,69
lili,60.0,53,59,27


In [29]:
# map is called on a Series (a single column) here; the original note says
# DataFrame does not provide it -- TODO confirm for the pandas version in use.
score['c++'].map(lambda x:x+10)

tom     39
jack    19
mery    72
lucy    79
lili    37
Name: c++, dtype: int64

In [None]:
# 映射关系 字典\lambda\function

In [31]:
# Add a categorical label column (one entry per student, in row order),
# then inspect the column dtypes: the new column is stored as object (strings).
# The label is spelled 'girl' (previously misspelled 'gril') so it matches the
# spelling used elsewhere in this notebook.
score['sex'] = ['boy','boy','girl','girl','boy']
score.dtypes

python    float64
java        int32
php         int32
c++         int32
sex        object
dtype: object

In [32]:
def sex_transform(x):
    """Encode a sex label as an integer.

    Returns 1 when ``x`` equals ``'boy'`` and 0 for any other value.
    """
    return 1 if x == 'boy' else 0

In [33]:
# Replace each label with its numeric code, element by element.
# NOTE: not idempotent -- sex_transform returns 0 for anything other than 'boy',
# so re-running this cell on the already-encoded column turns every value into 0.
score['sex'] = score['sex'].map(sex_transform)

In [35]:
# Inspect the column dtypes again after encoding the 'sex' column
score.dtypes

python    float64
java        int32
php         int32
c++         int32
sex         int64
dtype: object

In [None]:
# 异常值检测和过滤
1. 确定过滤条件 bool_list
2. 使用过滤条件筛选符合条件的数据
3. 获取符合条件的索引标签
4. 使用drop函数,通过索引标签把异常值删除

In [39]:
# 1000 rows x 3 columns of samples drawn from the standard normal distribution
data = DataFrame(np.random.randn(1000, 3))
data.head()

Unnamed: 0,0,1,2
0,1.326924,-0.609605,-2.279438
1,0.252979,-0.105621,-1.357682
2,1.14578,0.190798,1.637246
3,1.481688,-0.683615,1.791169
4,-0.191842,0.199332,0.195905


In [40]:
# Standard deviation of each column
data.std()

0    0.955316
1    0.975027
2    1.007340
dtype: float64

In [52]:
# Treat any value whose magnitude exceeds 3x its column's standard deviation as an outlier
condition = np.abs(data) > 3*data.std()
# Select the rows that contain at least one outlier
drop_datas = data[condition.any(axis=1)]

# Index labels of those rows
drop_index = drop_datas.index

# Remove the outlier rows by label.
# NOTE: this mutates data in place, so re-running the cell recomputes the
# threshold on the already-filtered data and may drop further rows.
data.drop(drop_index,inplace=True)

In [53]:
# Shape of data after the outlier rows were removed
data.shape

(993, 3)

# 排序

In [55]:
# A numpy feature: indexing with a list of positions selects (and may repeat)
# elements in the given order, which can be used to reorder data.
a = np.array(['boy','girl'])
b = [0,1,0,0,0,1,1,0]
a[b]

array(['boy', 'girl', 'boy', 'boy', 'boy', 'girl', 'girl', 'boy'],
      dtype='<U4')

In [57]:
# 5x5 frame of random integers in [0, 100) with labelled rows and columns
df = DataFrame(
    np.random.randint(0, 100, size=(5, 5)),
    index=list('甲乙丙丁戊'),
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
甲,96,37,53,13,82
乙,9,17,26,31,60
丙,17,94,22,64,23
丁,10,58,34,25,15
戊,52,95,0,42,40


In [60]:
# take selects columns by integer position (axis=1); positions may be repeated
df.take([1,1,1,3,4],axis=1)

Unnamed: 0,B,B.1,B.2,D,E
甲,37,37,37,13,82
乙,17,17,17,31,60
丙,94,94,94,64,23
丁,58,58,58,25,15
戊,95,95,95,42,40


In [72]:
# Combine take with a random permutation of the positions to shuffle the column order
random_index = np.random.permutation(5)
df.take(random_index,axis=1)

Unnamed: 0,A,D,B,E,C
甲,96,13,37,82,53
乙,9,31,17,60,26
丙,17,64,94,23,22
丁,10,25,58,15,34
戊,52,42,95,40,0


In [78]:
# take can also be combined with random.randint for random sampling.
# randint may return the same position more than once, so a column can be
# selected repeatedly (sampling with replacement).
random_index = np.random.randint(0,5,size=2)
df.take(random_index,axis=1)

Unnamed: 0,E,E.1
甲,82,82
乙,60,60
丙,23,23
丁,15,15
戊,40,40


# 数据分类处理

In [None]:
# 汇总 交叉表 透视表
# 分类--聚合--合并
# 一般 非数字类型作为分类标签,数值类型不作为分类标签
# 连续型\数值型不能作为分类标签的,(1.2,2.4,5,6,6.7,...)
# 离散型\标称型才可以作为分类标签(red,blue,orange)

In [82]:
# Small product table: two categorical columns and two random float columns
df = DataFrame({
    'color': ['white', 'red', 'green', 'red'],
    'item': ['ball', 'mug', 'pen', 'pencil'],
    'price': np.random.rand(4),
    'weight': np.random.rand(4),
})
df.dtypes

color      object
item       object
price     float64
weight    float64
dtype: object

In [84]:
# Total weight per color.
# Select the 'weight' column BEFORE aggregating so only that column is summed;
# summing the whole frame first would also aggregate 'item' and 'price' and
# then discard them.
df.groupby('color')['weight'].sum()

color
green    0.938751
red      1.946972
white    0.829003
Name: weight, dtype: float64

In [87]:
# Keep the GroupBy object (grouped by color) so it can be reused in later cells
groups = df.groupby('color')

In [101]:
# Mapping from each group key (color) to the row labels that belong to it
groups.groups

{'green': Int64Index([2], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [91]:
# Sum only the 'weight' column within each color group
groups['weight'].sum()

color
green    0.938751
red      1.946972
white    0.829003
Name: weight, dtype: float64

In [93]:
# Group by several keys at once: total weight per (color, item) pair.
# The 'weight' column is selected before aggregating so only it is summed.
df.groupby(['color','item'])['weight'].sum()

color  item  
green  pen       0.938751
red    mug       0.991736
       pencil    0.955236
white  ball      0.829003
Name: weight, dtype: float64

In [95]:
# groupby(...).apply accepts a function that works on a whole collection of
# values: here np.sum reduces each color group's weights to a single total.
df.groupby('color')['weight'].apply(np.sum)

color
green    0.938751
red      1.946972
white    0.829003
Name: weight, dtype: float64

In [100]:
# Series.map, by contrast, calls the function once per element rather than on
# the whole collection: each color string gets '10' appended individually.
df['color'].map(lambda x:x+'10')

0    white10
1      red10
2    green10
3      red10
Name: color, dtype: object