# 1 model
---
经验数据 `-->` 规律发现 `-->` 验证数据 `-->` 实践规律 `-->` 完善提高

`|<--         model           -->|`
- build
- train
- validation
- test
- application

# 2 Application
---
1. regression (prediction)
2. classification
    - unsupervised (clustering)
    - supervised
3. recommendation engines
4. natural language processing
5. video and image recognition
6. face or fingerprint recognition
7. neural networks


# 3 pre-process of data(for column value)
---
## 3.1 mean removal (standardization) `import sklearn.preprocessing as sp`
- standard deviation of a column: $s = \sqrt{\frac{(x_1-\bar{x})^2+\dots+(x_n-\bar{x})^2}{n}}$
- `col = col - mean`
- `col /= col_std`


## 3.2 range scaling (mmx.py)
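- find the line $y = kx + b$ that maps each column's range onto the target range: $k \cdot x_{min} + b = b_{min}$, $k \cdot x_{max} + b = b_{max}$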
- `A x = b`
- `np.linalg.lstsq(A, b)`

In [15]:
## mmx.py
import numpy as np

def mmx_scale(raw_samples, bmin, bmax):
    """Linearly rescale every column of a 2-D sample matrix to [bmin, bmax].

    For each column, the line ``y = k * x + b`` through the two points
    ``(col_min, bmin)`` and ``(col_max, bmax)`` is obtained by solving the
    2x2 linear system with ``np.linalg.lstsq``; the column is then mapped
    through that line.

    Args:
        raw_samples: 2-D array-like with one sample per row and one feature
            per column. It is not modified.
        bmin: Value the minimum of each column is mapped to.
        bmax: Value the maximum of each column is mapped to.

    Returns:
        A new float ndarray with the same shape as ``raw_samples``.
    """
    # Work on a float copy: the in-place ``*=`` / ``+=`` below cannot store
    # float results in an integer array, and the input must stay untouched.
    nor_samples = np.array(raw_samples, dtype=float)
    cols = nor_samples.shape[1]
    for col in range(cols):
        # Column slice is a view, so in-place edits land in nor_samples.
        col_samples = nor_samples[:, col]
        col_min = col_samples.min()
        col_max = col_samples.max()
        # Solve [[col_min, 1], [col_max, 1]] @ [k, b] = [bmin, bmax].
        # rcond=None selects the machine-precision cutoff explicitly.
        k, b = np.linalg.lstsq(
            np.array([[col_min, 1], [col_max, 1]]),
            np.array([bmin, bmax]),
            rcond=None,
        )[0]
        col_samples *= k
        col_samples += b
    return nor_samples

def main():
    """Demo: min-max scale a small sample matrix by hand and with sklearn."""
    # Imported locally so this script also runs on its own: its top-level
    # imports only bring in numpy, yet ``sp`` is used below.
    import sklearn.preprocessing as sp

    raw_samples = np.array([
        [3, -1.5, 2, -5.4],
        [0, 4, -0.3, 2.1],
        [1, 3.3, -1.9, -4.3]
    ])
    print(raw_samples)
    mmx_samples = mmx_scale(raw_samples, 0, 1)
    print(mmx_samples)

    # Same scaling through scikit-learn's MinMaxScaler, for comparison.
    mmx = sp.MinMaxScaler(feature_range=(0, 1))
    mmx_samples = mmx.fit_transform(raw_samples)
    print(mmx_samples)


if __name__ == '__main__':
    main()

[[ 3.  -1.5  2.  -5.4]
 [ 0.   4.  -0.3  2.1]
 [ 1.   3.3 -1.9 -4.3]]
[[  1.00000000e+00  -2.77555756e-16   1.00000000e+00  -2.22044605e-16]
 [  1.85037171e-17   1.00000000e+00   4.10256410e-01   1.00000000e+00]
 [  3.33333333e-01   8.72727273e-01  -2.22044605e-16   1.46666667e-01]]
[[ 1.          0.          1.          0.        ]
 [ 0.          1.          0.41025641  1.        ]
 [ 0.33333333  0.87272727  0.          0.14666667]]


## 3.3 normalization (for row value of sample)(norm.py)
- Scale the feature values of every sample (row) so that their absolute values sum to 1 (L1 normalization); signs are kept, so each value ends up in the range `[-1, 1]`.


In [12]:
## norm.py
import numpy as np
import sklearn.preprocessing as sp

def normalize(raw_samples):
    """L1-normalize each row so that its absolute values sum to 1.

    Args:
        raw_samples: 2-D array-like with one sample per row. It is not
            modified.

    Returns:
        A new float ndarray of the same shape in which every row has been
        divided by the sum of its absolute values. Rows whose absolute sum
        is zero are returned unchanged instead of being divided by zero.
    """
    # Float copy: in-place true division cannot write into an integer array,
    # and the input must stay untouched.
    nor_samples = np.array(raw_samples, dtype=float)
    for row_samples in nor_samples:
        row_abs_sum = np.abs(row_samples).sum()
        # Skip all-zero rows; dividing would only produce NaN.
        if row_abs_sum != 0:
            # row_samples is a view, so this updates nor_samples in place.
            row_samples /= row_abs_sum
    return nor_samples

def main():
    """Demo: L1-normalize a sample matrix by hand and with sklearn.

    After each variant the per-row sum of absolute values is printed.
    """
    raw_samples = np.array([
        [3, -1.5, 2, -5.4],
        [0, 4, -0.3, 2.1],
        [1, 3.3, -1.9, -4.3],
    ])
    print(raw_samples)

    # Hand-written implementation.
    manual = normalize(raw_samples)
    print(manual)
    for sample in manual:
        print(abs(sample).sum())

    # scikit-learn equivalent using the L1 norm.
    library = sp.normalize(raw_samples, norm='l1')
    print(library)
    for sample in library:
        print(abs(sample).sum())


if __name__ == '__main__':
    main()

[[ 3.  -1.5  2.  -5.4]
 [ 0.   4.  -0.3  2.1]
 [ 1.   3.3 -1.9 -4.3]]
[[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]
1.0
1.0
1.0
[[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]
1.0
1.0
1.0


## 3.4 二值化(bin.py)
- 根据一个预先设定的阈值，小于等于阈值的为0，大于的为1.

In [14]:
## bin.py
import numpy as np
import sklearn.preprocessing as sp

def binarize(raw_samples, threshold):
    """Map values to 0/1 by comparing them against a threshold.

    Args:
        raw_samples: Array-like of numbers. It is not modified.
        threshold: Values less than or equal to this become 0; values
            greater than it become 1.

    Returns:
        A new ndarray with the same shape and dtype as the input.
    """
    bin_samples = np.array(raw_samples)
    # Build both masks before writing anything: with a negative threshold,
    # the freshly written zeros would otherwise count as "> threshold" and
    # be flipped to 1.
    below = bin_samples <= threshold
    above = bin_samples > threshold
    bin_samples[below] = 0
    bin_samples[above] = 1
    return bin_samples

def main():
    """Demo: binarize a sample matrix at 1.4 by hand and with sklearn."""
    raw_samples = np.array([
        [3, -1.5, 2, -5.4],
        [0, 4, -0.3, 2.1],
        [1, 3.3, -1.9, -4.3],
    ])
    print(raw_samples)

    # Hand-written implementation.
    manual = binarize(raw_samples, 1.4)
    print(manual)

    # scikit-learn equivalent; the local name avoids shadowing builtin bin().
    binarizer = sp.Binarizer(threshold=1.4)
    library = binarizer.transform(raw_samples)
    print(library)


if __name__ == '__main__':
    main()

[[ 3.  -1.5  2.  -5.4]
 [ 0.   4.  -0.3  2.1]
 [ 1.   3.3 -1.9 -4.3]]
[[ 1.  0.  1.  0.]
 [ 0.  1.  0.  1.]
 [ 0.  1.  0.  0.]]
[[ 1.  0.  1.  0.]
 [ 0.  1.  0.  1.]
 [ 0.  1.  0.  0.]]


## 3.5 独热编码
---
```
    0       0        3
    1       1        0
    0       2        1
    1       0        2
    ------------  -----
    0 -10   0-100    0- 1000
    1 -01   1-010    1- 0100
            2-001    2- 0010
                     3- 0001

    --------------------------
    101000001
    010101000
    100010100
    011000010
    
```    
    

In [17]:
'0\t0'

'0\t0'