In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os

# 涉及知识
* one-hot 编码
* pandas axis 知识
* 协方差矩阵 https://www.cnblogs.com/terencezhou/p/6235974.html
* 归一化处理 Z-score标准化，符合正态分布 

# 数据预处理

我们可以看一下，一开始的数据是多维度，且一个维度下有多个类别的数据

In [45]:
train_data = pd.read_csv('data/train.csv')
#train_data['sex'] = train_data['sex'].str.strip() # 清除一下 sex 的值存在的空格
#(train_data.native_country == 'Taiwan').index
train_data.rename(columns={'native_country':'native_country&area'}, inplace = True)
train_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country&area,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


对于值为纯数字的维度，我们可以不用做什么处理。而非纯数字的维度，例如 ```education``` 有 ```HS-grad```、```11th``` 等类别。

当然我们可以对于每个维度，都利用$$1 - n$$来表明各个类型。但是这样每一个维度都要有专门逻辑判断，十分不方便。

所以我们希望将这些多类型的维度转换为 **二进制** 来唯一表示。于是我们针对这些多类型的维度使用 ```one-hot``` 编码咯。

当然，```sex``` 和 ```income``` 完全可以直接 0 1 标识，都是二分类。

In [47]:


def X_data_pretreatment(raw_data_frame, is_train):
    data_after_treat = None
    if is_train:
        data_after_treat = raw_data_frame.drop(['sex', 'income'], axis = 1)
    else:
        data_after_treat = raw_data_frame.drop(['sex'], axis = 1)
    
    # 非数字的多类别维度名称
    obj_col_list     = [ col for col in data_after_treat.columns if data_after_treat[col].dtype == 'object' ]
    value_col_list   = [ col for col in data_after_treat.columns if col not in obj_col_list ]
    obj_frame        = data_after_treat[obj_col_list] # 非数字的多类别 data frame
    value_frame      = data_after_treat[value_col_list]
    
    #obj_frame 加入 sex. sex = 1 为 female
    obj_frame.insert(0, 'sex', (raw_data_frame.sex == ' Female').astype('int64'))
    # one-hot 编码
    obj_frame = pd.get_dummies(obj_frame)
    
    #obj_frame 与 value_frame 合并 获得训练输入数据
    X_train = pd.concat([obj_frame, value_frame], axis = 1)
    
    X_train = (X_train - X_train.mean()) / X_train.std()
    return X_train, obj_frame, value_frame
    #print((raw_data_frame.sex == 'Female').astype('int64'))
    

X_train, obj_frame, value_frame = X_data_pretreatment(train_data, True)
X_train = X_train.drop(['native_country&area_ Holand-Netherlands'], axis = 1)
X_train

Unnamed: 0,sex,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native_country&area_ Trinadad&Tobago,native_country&area_ United-States,native_country&area_ Vietnam,native_country&area_ Yugoslavia,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,-1.516769,-0.188386,-0.290931,4.907624,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,0.030670,-1.063594,1.134721,0.148451,-0.216656,-0.035429
1,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,-1.516769,-0.188386,3.437133,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119
2,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429
3,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,1.057031,0.425795,-1.197440,-0.145918,-0.216656,-0.035429
4,1.422309,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,-2.932903,-0.045408,-0.022172,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1.422309,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,-0.849067,0.639731,0.746028,-0.145918,-0.216656,-0.197406
32557,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,0.103982,-0.335428,-0.420053,-0.145918,-0.216656,-0.035429
32558,1.422309,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,1.423588,-0.358772,-0.420053,-0.145918,-0.216656,-0.035429
32559,-0.703061,-0.244446,-0.174292,-0.262093,-0.014664,0.659276,-0.188386,-0.290931,-0.203758,-0.02074,...,-0.024163,0.340949,-0.045408,-0.022172,-1.215625,0.110958,-0.420053,-0.145918,-0.216656,-1.655199


In [None]:
obj_frame

In [None]:
value_frame

## 开始吧

In [59]:
train_data = pd.read_csv('data/train.csv')
test_data  = pd.read_csv('data/test.csv')
#train_data['sex'] = train_data['sex'].str.strip() # 清除一下 sex 的值存在的空格
#(train_data.native_country == 'Taiwan').index
train_data.rename(columns={'native_country':'native_country&area'}, inplace = True)
test_data.rename(columns={'native_country':'native_country&area'}, inplace = True)
X_train = X_data_pretreatment(train_data, True)[0]
# 不得已而为之，不这么处理，算出的 w 的 shape 是 (107, )
# 而 X_test shape is (106,) 会无法点乘
X_train = X_train.drop(['native_country&area_ Holand-Netherlands'], axis = 1)
X_test  = X_data_pretreatment(test_data,  False)[0]

In [None]:
X_train.values

In [None]:
X_test

In [60]:
def Y_data_pretreatment(raw_data_frame):
    data_after_treat = (raw_data_frame.income == ' >50K').astype('int64')
    return data_after_treat

Y_train = Y_data_pretreatment(train_data)
Y_train.values

array([0, 0, 0, ..., 0, 0, 1])

然后我们需要分隔测试集，有效集

In [61]:
def split_valid_set(X_train, Y_train):
    randomize = np.arange(X_train.shape[0])
    np.random.shuffle(randomize)
    vaild_set_percetange = 0.1
    all_size = X_train.shape[0]
    valid_size = int(all_size * vaild_set_percetange)
    X = X_train.values[randomize]
    Y = Y_train.values[randomize]
    X_valid_set, Y_valid_set = X[ : valid_size], Y[ : valid_size]
    X_train_set, Y_train_set = X[valid_size : ], Y[valid_size : ]
    return X_valid_set, Y_valid_set, X_train_set, Y_train_set

X_valid_set, Y_valid_set,X_train_set, Y_train_set = split_valid_set(X_train, Y_train)


# 使用 generate model 其实是贝叶斯公式
开始训练，明确一下，是二分类问题。

In [62]:
def train(X_train, Y_train):
    train_data_size = X_train.shape[0]
    feature_size    = X_train.shape[1]
    
    # 计算 N1,N2，其实就是平均值
    cnt1 = 0
    cnt2 = 0
    # μ1 μ2
    # 收入 > 50k
    mu1  = np.zeros((feature_size))
    # <= 50k 两种类别
    mu2  = np.zeros((feature_size))

    for i in range(train_data_size):
        if Y_train[i] == 1: # >50k
            mu1  += X_train[i] # 不是 +1 啊，加 feature 的值
            cnt1 += 1
        else:
            mu2  += X_train[i]
            cnt2 += 1
    mu1 /= cnt1
    mu2 /= cnt2
    
    # 协方差矩阵 二维，协方差矩阵是对多维随机变量的各个维度之间的线性关系
    # 区别是各自的 μ
    sigma1 = np.zeros((feature_size, feature_size))
    sigma2 = np.zeros((feature_size, feature_size))
    
    for i in range(train_data_size):
        if Y_train[i] == 1:
            sigma1 += np.dot(np.transpose([X_train[i] - mu1]), [X_train[i] - mu1])
        else:
            sigma2 += np.dot(np.transpose([X_train[i] - mu2]), [X_train[i] - mu2])
    sigma1         /= cnt1
    sigma2         /= cnt2
    N1              = cnt1
    N2              = cnt2
    share_sigma     = (N1 / train_data_size) * sigma1 + (N2 / train_data_size) * sigma2
    
    return mu1, mu2, share_sigma, N1, N2
    

mu1, mu2, share_sigma, N1, N2 = train(X_train_set, Y_train_set)
share_sigma

array([[ 9.53985802e-01,  4.61236460e-02,  1.22054427e-02, ...,
        -4.30117007e-05, -1.43566031e-02, -1.79415220e-01],
       [ 4.61236460e-02,  9.88448042e-01, -3.80112772e-02, ...,
         5.59442353e-04, -5.00652614e-03, -1.49502050e-01],
       [ 1.22054427e-02, -3.80112772e-02,  1.00215052e+00, ...,
        -1.96784593e-02,  1.71372776e-03, -1.55801793e-04],
       ...,
       [-4.30117007e-05,  5.59442353e-04, -1.96784593e-02, ...,
         9.49833245e-01, -6.43366919e-02,  2.90248226e-02],
       [-1.43566031e-02, -5.00652614e-03,  1.71372776e-03, ...,
        -6.43366919e-02,  9.71148481e-01,  1.63862881e-02],
       [-1.79415220e-01, -1.49502050e-01, -1.55801793e-04, ...,
         2.90248226e-02,  1.63862881e-02,  9.45881639e-01]])

来，看看公式
![sigmoid](http://img.multiparam.com/dapao/code/20200619061122.png)

![](http://img.multiparam.com/dapao/code/20200619034241.png)

![](http://img.multiparam.com/dapao/code/20200619034302.png)

## 求 w，b。这样模型已经训练出来，可以用测试集来预测，在通过和正确项比较正确率。

In [63]:
def sigmoid(z):
    r = 1.0 / (1.0 + np.exp(-z))
    return np.clip(r, 1e-8, (1-(1e-8)))



share_sigma_inv = inv(share_sigma)
w = np.dot((mu1 - mu2), share_sigma_inv)
b = -0.5 * np.dot(np.dot(mu1.T, share_sigma_inv), mu1) + np.log(float(N1) / N2)

#y = sigmoid(np.dot(w, X_train_set.T) + b)
w.shape

(106,)

In [92]:
y = sigmoid(np.dot(w, X_test.values.T) + b)
y # 分类结果
y.shape

(16281,)

In [68]:
ans = pd.read_csv("data/correct_answer.csv")
ans

Unnamed: 0,id,label
0,1,0
1,2,0
2,3,1
3,4,1
4,5,0
...,...,...
16276,16277,0
16277,16278,0
16278,16279,0
16279,16280,0


In [91]:
y_ = np.around(y).astype(np.int)
y_.shape[0]

16281

In [95]:
y_df = pd.DataFrame({"id": np.arange(y_.shape[0]), "label" : y_})
y_df

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
16276,16276,0
16277,16277,0
16278,16278,1
16279,16279,0


In [98]:
r = (np.squeeze(ans.label.values) == y_)
r
#np.squeeze(ans.values)
# np.sum() 可以计算为 true 的个数
r.sum() / r.shape[0]

0.8430071862907684