In [1]:
# import package
import csv
import numpy as np
import pandas as pd

In [2]:
# Number of feature columns per sample.
# NOTE(review): presumably equals the column count of X_train / X_test — confirm against the data files.
dim = 106

In [3]:
# load data from file.
def load_data():
    """Read the training features, training labels and test features.

    The files 'X_train' and 'X_test' are read from the current working
    directory with pandas' default header handling; 'Y_train' is read with
    ``header=None`` so every line of it is treated as data.

    Returns:
        tuple: ``(x_train, y_train, x_test)`` where ``x_train`` and ``x_test``
        are the raw value arrays of the two feature files and ``y_train`` is
        the label file flattened to one dimension.
    """
    # Feature files are read in the same order as before (train, then test).
    train_features, test_features = (
        pd.read_csv(path).values for path in ('X_train', 'X_test')
    )

    # Flatten the single label column into a 1-D vector.
    labels = pd.read_csv('Y_train', header=None).values.reshape(-1)

    return train_features, labels, test_features

def sigmoid(z):
    """Logistic sigmoid of ``z``, clipped to ``[1e-6, 1 - 1e-6]``.

    The value is computed from ``exp(-|z|)``, which can never overflow, so
    large negative inputs no longer trigger a floating-point overflow
    warning/error the way ``exp(-z)`` does.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``z`` with the same shape as ``z``; every value lies
        in ``[1e-6, 1 - 1e-6]``.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer / boolean inputs are promoted so the arithmetic is floating point.
        z = z.astype(float)

    # exp(-|z|) is always in (0, 1]; pick the algebraically equivalent form
    # for each sign of z: 1 / (1 + e^-z) for z >= 0, e^z / (1 + e^z) for z < 0.
    decay = np.exp(-np.abs(z))
    prob = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # limits output values between [1e-6, 1 - 1e-6]
    return np.clip(prob, 1e-6, 1 - 1e-6)

In [4]:
def train(x_train, y_train):
    """Fit a two-class Gaussian generative model with a shared covariance.

    Rows whose label equals 1 form class 1 and rows whose label equals 0 form
    class 2. The number of features is taken from ``x_train`` itself, so the
    function works for any feature count.

    Args:
        x_train: 2-D array-like of shape ``(n_samples, n_features)``.
        y_train: Array-like of ``n_samples`` labels (values 1 and 0).

    Returns:
        tuple: ``(mu1, mu2, share_sigma, cnt1, cnt2)`` where ``mu1`` / ``mu2``
        are the per-class feature means, ``share_sigma`` is the sum of the two
        per-class covariance matrices weighted by ``cnt / n_samples``, and
        ``cnt1`` / ``cnt2`` are the class sizes as Python ints.

    Raises:
        ValueError: If either class has no samples (its mean and covariance
            would be undefined).
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train).reshape(-1)

    # Select the rows belonging to each class with boolean masks.
    class1 = x_train[y_train == 1]
    class2 = x_train[y_train == 0]

    cnt1, cnt2 = len(class1), len(class2)
    if cnt1 == 0 or cnt2 == 0:
        raise ValueError(
            'train() needs at least one sample of each class '
            '(got %d with label 1 and %d with label 0)' % (cnt1, cnt2)
        )

    mu1, mu2 = class1.mean(axis=0), class2.mean(axis=0)

    # Per-class covariance: (X - mu)^T (X - mu) / count, i.e. the sum of the
    # outer products of the centered rows divided by the class size.
    centered1, centered2 = class1 - mu1, class2 - mu2
    sigma1 = np.dot(centered1.T, centered1) / cnt1
    sigma2 = np.dot(centered2.T, centered2) / cnt2

    # Weight each class covariance by its share of all training rows.
    total = x_train.shape[0]
    share_sigma = (cnt1 / total) * sigma1 + (cnt2 / total) * sigma2

    return mu1, mu2, share_sigma, cnt1, cnt2

In [5]:
def predict(x_test, mu1, mu2, share_sigma, N1, N2):
    """Classify samples with the shared-covariance Gaussian generative model.

    The posterior of class 1 is ``sigmoid(w . x + b)`` with

        w = (mu1 - mu2)^T Sigma^-1
        b = -1/2 mu1^T Sigma^-1 mu1 + 1/2 mu2^T Sigma^-1 mu2 + ln(N1 / N2)

    Args:
        x_test: 2-D array of shape ``(n_samples, n_features)``.
        mu1: 1-D mean vector of class 1.
        mu2: 1-D mean vector of class 2.
        share_sigma: Shared covariance matrix, ``(n_features, n_features)``.
        N1: Number of training samples in class 1.
        N2: Number of training samples in class 2.

    Returns:
        1-D array with one rounded posterior per sample (1.0 for class 1,
        0.0 for class 2).
    """
    try:
        sigma_inverse = np.linalg.inv(share_sigma)
    except np.linalg.LinAlgError:
        # An exactly singular covariance (e.g. a constant feature column) has
        # no inverse; use the Moore-Penrose pseudo-inverse instead of failing.
        sigma_inverse = np.linalg.pinv(share_sigma)

    w = np.dot((mu1 - mu2), sigma_inverse)
    b = (
        (-0.5) * np.dot(np.dot(mu1.T, sigma_inverse), mu1)
        + (0.5) * np.dot(np.dot(mu2.T, sigma_inverse), mu2)
        + np.log(float(N1) / N2)
    )

    z = np.dot(w, x_test.T) + b
    pred = sigmoid(z)

    # Round the posterior to the nearest class label.
    return np.around(pred)

In [6]:
# Load the training features, training labels and test features via load_data().
x_train, y_train, x_test = load_data()

In [7]:
# Train the model: train() returns the two class means, the shared covariance
# matrix and the number of samples in each class.
mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)

In [8]:
# Report how often the model reproduces the labels of its own training data.
y = predict(x_train, mu1, mu2, shared_sigma, N1, N2)

# Boolean hit mask: True wherever the prediction matches the true label.
result = (y == y_train)
print('Train acc = %f' % (float(np.count_nonzero(result)) / len(result)))

Train acc = 0.841405


In [9]:
# Predict labels for the test data using the parameters fitted above.
y_pred = predict(x_test, mu1, mu2, shared_sigma, N1, N2)

In [10]:
with open('generative.csv', 'w', newline='') as csvf:
    # Create the CSV writer.
    writer = csv.writer(csvf)
    writer.writerow(['id', 'label'])
    # One row per prediction; ids are 1-based and labels are written as ints.
    writer.writerows(
        [row_id, int(label)] for row_id, label in enumerate(y_pred, start=1)
    )