In [1]:
import pandas as pd
import numpy as np

### 分类数据

In [3]:
# Eight fruit labels built from only two distinct strings.
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)
# Call the method (a bare `values.unique` only references the bound method and
# computes nothing) to list the distinct labels.
values.unique()

array(['apple', 'orange'], dtype=object)

In [4]:
# Frequency of each distinct label; the Series method replaces the deprecated
# top-level pd.value_counts helper.
values.value_counts()

apple     6
orange    2
dtype: int64

#### 映射

In [5]:
# Lookup table: position -> category label.
dim = pd.Series(['apple', 'orange'])
# Integer codes that point into `dim`.
values = pd.Series([0, 1, 0, 0] * 2)
# A positional take maps every code back to its label.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [7]:
# Sample frame: a repeated label column plus an id and two random numeric columns.
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
n = len(fruits)
df = pd.DataFrame(
    dict(
        fruit=fruits,
        id=np.arange(n),
        count=np.random.randint(3, 15, size=n),    # integers drawn from [3, 15)
        weight=np.random.uniform(0, 4, size=n),    # floats drawn from [0, 4)
    )
)
df

Unnamed: 0,fruit,id,count,weight
0,apple,0,7,2.118518
1,orange,1,8,0.179039
2,apple,2,11,0.944281
3,apple,3,3,1.183788
4,apple,4,7,2.31243
5,orange,5,11,3.079262
6,apple,6,3,0.499955
7,apple,7,13,2.795565


In [8]:
# Convert the string column to pandas' categorical dtype.
fruit_cats = df.loc[:, 'fruit'].astype('category')
fruit_cats

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

#### 用分类进行计算

In [9]:
# Fix NumPy's global random state so the draws that follow are repeatable.
np.random.seed(seed=12345)

In [None]:
# 1000 samples from the standard normal distribution.
draws = np.random.randn(1000)


### 分类方法

In [2]:
# Ten single-letter labels: 'a'..'e' repeated twice.
s = pd.Series(list('abcde') * 2)
# Categorical view of the same data.
cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    e
5    a
6    b
7    c
8    d
9    e
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [5]:
# Integer code of each element, i.e. its position within the category list.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    4
5    0
6    1
7    2
8    3
9    4
dtype: int8

### one-hot编码

In [6]:
# One indicator column per category. dtype=int pins the 0/1 output: newer
# pandas versions default to boolean indicator columns instead.
pd.get_dummies(cat_s, dtype=int)

Unnamed: 0,a,b,c,d,e
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1
5,1,0,0,0,0
6,0,1,0,0,0
7,0,0,1,0,0
8,0,0,0,1,0
9,0,0,0,0,1


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
# One-dimensional array holding the five distinct labels.
s1 = np.array(list('abcde'))
s1.shape

(5,)

In [22]:
# scikit-learn encoders expect a 2-D input, hence the reshape to a single column.
one = OneHotEncoder()
data = one.fit_transform(s1.reshape(-1, 1))
# The encoder output is sparse; toarray() yields a plain ndarray for printing
# (todense() would return the discouraged np.matrix type).
print(data.toarray())

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [10]:
# Shape of the 1-D Series `s`, for comparison with the 2-D input the encoder needs.
s.shape

(10,)

### pandas与模型代码的接口

In [23]:
# Small modelling frame: two predictors (x0, x1) and a response (y).
data = pd.DataFrame(
    {
        'x0': [1, 2, 3, 4, 5],
        'x1': [0.01, -0.01, 0.25, -4.1, 0.0],
        'y': [-1.5, 0.0, 3.6, 1.3, -2.0],
    }
)

In [24]:
# Whole frame as a NumPy array; to_numpy() is the accessor pandas recommends
# over the older .values attribute.
data.to_numpy()

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [32]:
# Label-based .loc slices include both endpoints, so the explicit label list
# and the 1:2 slice pick the same rows.
data.loc[[1, 2]] == data.loc[1:2]

Unnamed: 0,x0,x1,y
1,True,True,True
2,True,True,True


In [28]:
# Positional .iloc slices exclude the end position, so only one row comes back.
data.iloc[1:2]

Unnamed: 0,x0,x1,y
1,2,-0.01,0.0


In [37]:
# First two columns, selected by position, as a NumPy array
# (to_numpy() is preferred over the older .values attribute).
data.iloc[:, :2].to_numpy()

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [35]:
# The same two columns, selected by label, as a NumPy array
# (to_numpy() is preferred over the older .values attribute).
data.loc[:, ['x0', 'x1']].to_numpy()

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [38]:
import patsy

In [40]:
# Build design matrices from the formula: `y` is the response matrix and `x`
# holds the intercept plus the x0 and x1 columns.
y, x = patsy.dmatrices('y ~ x0 + x1', data)
# Only the last expression of a cell is rendered, so a bare `y` before this
# line would display nothing; show the predictor matrix.
x

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)