In [1]:
import numpy as np
import math
import pandas as pd

In [2]:
learning_rate = 0.0001
epoch_num = 10000

In [3]:
# the logistic function
def sigmoid(theta, input):
    
    h = np.dot(theta, input)
    return 1 / (1 + math.exp(-h))

def logistic_regression_fit(train_input, train_labels):
    
    misclassification_error = 0
    print("Training")
    theta = np.random.uniform(low=-0.1, high=0.1, size=(train_input.shape[1]))

    for epoch_i in range(epoch_num):

        index = np.random.randint(0, train_input.shape[0])
        y_hat = sigmoid(theta, train_input[index])
        y = 1 if train_labels[index] == ' M ' else 0

        if y_hat <= 0.5 and y == 1 or y_hat > 0.5 and y == 0:
            misclassification_error += 1

        theta += learning_rate * (y - y_hat) * train_input[index]

        if (epoch_i + 1) % 100 == 0:
            print("Avg Misclassification Error: {} on epoch {}".format(misclassification_error / 100, epoch_i + 1))
            if misclassification_error / 1000 <= 0.01:
                break
            misclassification_error = 0
    print("End of Training")
    return theta



In [4]:
df = pd.read_csv("train.csv")
df.columns = ['h', 'w', 'a', 'g']
heights = df.h
weights = df.w
age = df.a
gender = df.g

In [5]:
train_input = np.asarray([heights, weights, age]).T
train_labels = np.asarray(gender)
theta = logistic_regression_fit(train_input, train_labels)

Training
Avg Misclassification Error: 0.4 on epoch 100
Avg Misclassification Error: 0.34 on epoch 200
Avg Misclassification Error: 0.37 on epoch 300
Avg Misclassification Error: 0.28 on epoch 400
Avg Misclassification Error: 0.32 on epoch 500
Avg Misclassification Error: 0.26 on epoch 600
Avg Misclassification Error: 0.22 on epoch 700
Avg Misclassification Error: 0.32 on epoch 800
Avg Misclassification Error: 0.24 on epoch 900
Avg Misclassification Error: 0.22 on epoch 1000
Avg Misclassification Error: 0.23 on epoch 1100
Avg Misclassification Error: 0.29 on epoch 1200
Avg Misclassification Error: 0.22 on epoch 1300
Avg Misclassification Error: 0.16 on epoch 1400
Avg Misclassification Error: 0.37 on epoch 1500
Avg Misclassification Error: 0.24 on epoch 1600
Avg Misclassification Error: 0.27 on epoch 1700
Avg Misclassification Error: 0.23 on epoch 1800
Avg Misclassification Error: 0.22 on epoch 1900
Avg Misclassification Error: 0.27 on epoch 2000
Avg Misclassification Error: 0.25 on epoc

In [6]:
df_test = pd.read_csv("test.csv")
df_list = df_test.values.tolist()

In [7]:
def classification(sample):
    y_hat = sigmoid(theta, sample)
    pred_class = " M " if y_hat >= 0.5 else " W "
    return pred_class

In [8]:
print("Testing")
for sample in df_list:
    y_hat = sigmoid(theta, sample)
    pred_class = " M " if y_hat >= 0.5 else " W "
    print("Predicted class: {}".format(pred_class))

Testing
Predicted class:  W 
Predicted class:  W 
Predicted class:  W 
Predicted class:  W 


# Leave one out

In [9]:
df_loo = pd.read_csv('leave_one_out.csv')
df_loo.columns = ['h', 'w', 'a', 'g']
heights = df_loo.h
weights = df_loo.w
age = df_loo.a
gender = df_loo.g

In [10]:
count = 0
    
for index, test_sample in df_loo.iterrows():
        
    sample = test_sample.values[:3]
    target = test_sample.values[3]
    prediction = classification(sample)
    df_loo.drop(index)
    
    if target == prediction:
        count = count + 1
        
print("{}/{} correct predictions using all features".format(count, df_loo.shape[0]))

61/120 correct predictions using all features


In [11]:
# count = 0
    
# for index, test_sample in df_loo.iterrows():
        
#     sample = test_sample.values[:2]
#     target = test_sample.values[3]
#     prediction = classification(sample)
#     df_loo.drop(index)
    
#     if target == prediction:
#         count = count + 1
        
# print("{}/{} correct predictions using all features except age".format(count, df_loo.shape[0]))