In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os

In [8]:
output_dir = "output/"

def dataProcess_X(rawData):

    #sex 只有两个属性 先drop之后处理
    if "income" in rawData.columns:
        Data = rawData.drop(["sex", 'income'], axis=1)
    else:
        Data = rawData.drop(["sex"], axis=1)
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"] #读取非数字的column
    listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn] #数字的column

    ObjectData = Data[listObjectColumn]
    NonObjectData = Data[listNonObjedtColumn]
    #insert set into nonobject data with male = 0 and female = 1
    NonObjectData.insert(0 ,"sex", (rawData["sex"] == " Female").astype(np.int))
    #set every element in object rows as an attribute
    ObjectData = pd.get_dummies(ObjectData)

    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")
    # Data_y = (rawData["income"] == " <=50K").astype(np.int)

    #normalize
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()

    return Data_x

def dataProcess_Y(rawData):
    df_y = rawData['income']
    Data_y = pd.DataFrame((df_y==' >50K').astype("int64"), columns=["income"])
    return Data_y


def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1-(1e-8)))

def _shuffle(X, Y):                                 #X and Y are np.array
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[ : valid_size], Y[ : valid_size]
    X_train, Y_train = X[valid_size:], Y[valid_size:]

    return X_train, Y_train, X_valid, Y_valid



def valid(X, Y, w):
    a = np.dot(w,X.T)
    y = sigmoid(a)
    y_ = np.around(y)
    result = (np.squeeze(Y) == y_)
    print('Valid acc = %f' % (float(result.sum()) / result.shape[0]))
    return y_

def train(X_train, Y_train):
    # valid_set_percentage = 0.1
    # X_train, Y_train, X_valid, Y_valid = split_valid_set(X, Y, valid_set_percentage)

    w = np.zeros(len(X_train[0]))

    l_rate = 0.001
    batch_size = 32
    train_dataz_size = len(X_train)
    step_num = int(floor(train_dataz_size / batch_size))
    epoch_num = 300
    list_cost = []

    total_loss = 0.0
    for epoch in range(1, 4):
        total_loss = 0.0
        X_train, Y_train = _shuffle(X_train, Y_train)

        for idx in range(1, 4):
            X = X_train[idx*batch_size:(idx+1)*batch_size]
            Y = Y_train[idx*batch_size:(idx+1)*batch_size]
            #print(0, X)
            s_grad = np.zeros(len(X[0]))


            z = np.dot(X, w)
            y = sigmoid(z)
            print(1, y)
            loss = y - np.squeeze(Y)

            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y)) + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y)))/ len(Y)
            #cross_entropy1 = -1 * (np.dot(np.squeeze(Y), np.log(y)) + np.dot(1 - np.squeeze(Y), np.log(1 - y))) / Y.shape[0]
#             print("cross_entropy", cross_entropy)
#             print("cross_entropy1", cross_entropy1)
            total_loss += cross_entropy
            #print(2, (np.squeeze(Y) - y))
            grad = -1 * np.dot(np.squeeze(Y) - y, X)
            grad1 = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)
            print(2, grad)
            print(3, grad1)
            # grad = np.dot(X.T, loss)
            w = w - l_rate * grad1

            # s_grad += grad ** 2
            # ada = np.sqrt(s_grad)
            # w = w - l_rate * grad / ada

#         list_cost.append(total_loss)

    # valid(X_valid, Y_valid, w)
#     plt.plot(np.arange(len(list_cost)), list_cost)
#     plt.title("Train Process")
#     plt.xlabel("epoch_num")
#     plt.ylabel("Cost Function (Cross Entropy)")
#     plt.savefig(os.path.join(os.path.dirname(output_dir), "TrainProcess"))
#     plt.show()

    return w

In [12]:
if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")
    ans = pd.read_csv("data/correct_answer.csv")

    # here is one more attribute in trainData
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values
    
    print(x_train, x_train.shape)


    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)),x_train), axis=1)

    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)

    w_train = train(X_train, Y_train)
    w_train
#     valid(X_train, Y_train, w_train)

#     w = train(x_train, y_train)

#     y_ = valid(x_test, y_ans, w)
#     y_

[[-0.70306055  0.03067009 -1.06359441 ...  0.34094868 -0.04540766
  -0.02217232]
 [-0.70306055  0.83709613 -1.00869151 ...  0.34094868 -0.04540766
  -0.02217232]
 [-0.70306055 -0.04264137  0.24507474 ...  0.34094868 -0.04540766
  -0.02217232]
 ...
 [ 1.42230892  1.42358779 -0.3587719  ...  0.34094868 -0.04540766
  -0.02217232]
 [-0.70306055 -1.2156247   0.11095818 ...  0.34094868 -0.04540766
  -0.02217232]
 [ 1.42230892  0.98371904  0.9298783  ...  0.34094868 -0.04540766
  -0.02217232]] (32561, 106)
1 [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
2 [  9.           4.2993024   -3.02298484  -1.98396346   0.68949816
  -1.31326419  -4.30994026  -0.84528552   2.13517844  -1.5686319
   1.7185709   -0.13197227  -0.59465151  -1.69547792   1.10968332
  -1.83382488  -0.18665704  -1.54575549   3.62036732  -1.0448115
  -0.64813414  -0.91483186  -1.28042656  -1.13978703  -4.46498713
  -1.89477991   2.75117     -1.0