In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [2]:
def sigmoid(z):
	return 1.0 / (1 + np.exp(-z))

def threshold(z):
	z = np.array([1 if z_ > 0.5 else 0 for z_ in z])
	return z

class LogisticRegression():
	"""
    Parameters:
    -----------
    n_iterations: The number of training iterations the algorithm will tune the weights for.
    learning_rate: The step length that will be used when updating the weights.
    """
	def __init__(self, n_iterations = 1000, learning_rate = 0.01):
		self.n_iterations = n_iterations
		self.learning_rate = learning_rate
		self.w = None
		self.log_loss = 0
		
	def init_weight(self, n_features):
		# Init weights all zero values
		self.w = np.ones(n_features).reshape(-1, 1)
		
	def fit(self, X, y):
	
		# Insert one more column value 1 for bias
		X = np.insert(X, 0, 1, axis=1)
		
		n_samples, n_features = X.shape
		
		self.init_weight(n_features=X.shape[1])
		
		# Do gradient descent for n_iterations
		for i in range(self.n_iterations+1):
		
			#Calculate y prediction
			y_pred = sigmoid(np.dot(X, self.w))
			
			# Calculate Gradient Descent for Log Loss Error
			self.log_loss = np.mean(np.dot(-y.T, np.log(y_pred)) - np.dot((1-y).T, np.log(1-y_pred)))
				
			if i % 100 == 0:
 				print("Cost: ", self.log_loss)
			grad = (1/n_samples) * np.dot(X.T, (y_pred-y))
				
			#Update weights
			self.w -= self.learning_rate * grad
				
		
	def predict(self, X):
		X = np.insert(X, 0, 1, axis=1)
		y_pred = X.dot(self.w)
		return threshold(sigmoid(y_pred))

In [3]:
np.random.seed(3)

In [4]:
df = pd.read_csv("diabetes2.csv")

In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
data = df.values

In [7]:
N, d = data.shape

In [8]:
X = data[:, 0:d-1].reshape(-1, d-1)

In [9]:
y = data[:, -1].reshape(-1, 1).astype(int)

In [10]:
# Normalize grades to values between -1 and 1 for more efficient computation
normalized_range = preprocessing.MinMaxScaler(feature_range=(-1, 1))

X = normalized_range.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [12]:
# Our model
model = LogisticRegression(n_iterations=1000, learning_rate = 1.5)
model.fit(X_train, y_train)

Cost:  272.9274922465252
Cost:  211.783113914797
Cost:  210.03142886749458
Cost:  209.71680345289082
Cost:  209.64515836009537
Cost:  209.6269750439307
Cost:  209.6220789283167
Cost:  209.62071783369385
Cost:  209.62033297113672
Cost:  209.62022315371803
Cost:  209.62019166170876


In [13]:
# Predict
y_pred = model.predict(X_test)

In [14]:
print("Our model acurracy: %.2f %%" %(100*accuracy_score(y_test, y_pred))) 

Our model acurracy: 76.30 %


In [15]:
# Sklearn model
scikit_log_reg = linear_model.LogisticRegression()
scikit_log_reg.fit(X_train,y_train.flatten())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
y_pred1 = scikit_log_reg.predict(X_test)

In [17]:
# Compare actual output, our output and Sklearn output
print("Actual output:        ", y_test[:15].flatten())
print("Our model output:     ", y_pred[:15].flatten())
print("Sklearn model output: ", y_pred1[:15])

Actual output:         [0 1 0 1 0 0 0 0 0 0 1 0 0 1 1]
Our model output:      [0 1 0 1 1 0 0 0 0 0 0 0 0 1 1]
Sklearn model output:  [0 1 0 1 0 0 0 0 0 0 0 0 0 1 1]


In [18]:
print("Sklearn model acurracy: %.2f %%" %(100*accuracy_score(y_test.flatten(), y_pred1))) 

Sklearn model acurracy: 75.65 %


In [19]:
# Compare our intercept and Sklearn intercept
print("Our model intercept: ", model.w[0])
print("Sklearn model intercept: ", scikit_log_reg.intercept_)

Our model intercept:  [0.29699287]
Sklearn model intercept:  [0.25183269]


In [22]:
# Compare our coefficient and Sklearn coefficient
print("Our model coefficient: ", model.w[1:].flatten())
print("Sklearn model coefficient: ", scikit_log_reg.coef_)

Our model coefficient:  [ 1.0638862   3.227905   -0.62264114  0.29780999 -0.32481277  2.96448464
  1.48149703  0.48861271]
Sklearn model coefficient:  [[ 0.87371133  2.70445614 -0.42760852  0.27567786 -0.10645218  2.12436832
   1.14279462  0.51103529]]
