In [184]:
import numpy as np
import pandas as pd

In [190]:
class LogisticRegression:
    def __init__(self, learning_rate, num_iterations, threshold):
        """
        Initialize Logistic Regression model
        
        Parameters:
        learning_rate (float): Step size for gradient descent
        num_iterations (int): Number of training iterations
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
        threshold = threshold
        
    def sigmoid(self, z):
        """
        Compute sigmoid function
        
        Parameters:
        z (ndarray): Input values
        
        Returns:
        ndarray: Sigmoid of input values
        """
        # Clip z to avoid overflow
        #z = np.clip(z, -500, 500)       lets keep it out first
        return 1 / (1 + np.exp(-z))
    
    def initialize_parameters(self, num_features):
        """
        Initialize weights and bias
        
        Parameters:
        num_features (int): Number of input features
        """
        self.weights = np.ones(num_features)
        self.bias = 0
        
    def compute_cost(self, X, y, y_pred):
        """
        Compute binary cross-entropy loss
        
        Parameters:
        X (ndarray): Feature matrix (m samples × n features)
        y (ndarray): True labels
        y_pred (ndarray): Predicted probabilities
        
        Returns:
        float: Average loss
        """
        m = len(y)
        small_const = 1e-15  # Small constant to avoid log(0)
        y_pred = np.clip(y_pred, small_const, 1 - small_const)  # Clip predictions
        loss = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
        return loss
    
    def fit(self, X, y):
        """
        Train the model using gradient descent
        
        Parameters:
        X (ndarray): Feature matrix (m samples × n features)
        y (ndarray): Target labels (0 or 1)
        """
        m, n = X.shape
        self.initialize_parameters(n)
        
        # Store cost history for debugging
        cost_history = []
          # Early stopping parameters
        epsilon = 1e-3  # Minimum cost change threshold
        patience = 10  # Number of iterations to wait for improvement

        best_cost = np.inf
        no_improvement_count = 0
        
        for iteration in range(self.num_iterations):
            # Forward propagation
            z = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(z)
            
            # Compute cost
            cost = self.compute_cost(X, y, y_pred)
            #cost_history.append(cost)
            if cost < best_cost - epsilon:
                best_cost = cost
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            if no_improvement_count >= patience:
                print(f"Early stopping at iteration {iteration}")
                break
            
            # Compute gradients
            dz = y_pred - y
            dw = (1/m) * np.dot(X.T, dz)
            db = (1/m) * np.sum(dz)
            
            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            # Print cost every 100 iterations
            # if iteration % 100 == 0:
            #     print(f"Iteration {iteration}: Cost = {cost} and z = {z}")
                
        return cost_history
    
    def predict_proba(self, X):
        """
        Predict probability of class 1
        
        Parameters:
        X (ndarray): Feature matrix
        
        Returns:
        ndarray: Predicted probabilities
        """
        z = np.dot(X, self.weights) + self.bias
        return self.sigmoid(z)
    
    def predict(self, X, threshold=0.5):
        """
        Predict class labels
        
        Parameters:
        X (ndarray): Feature matrix
        threshold (float): Classification threshold
        
        Returns:
        ndarray: Predicted class labels (0 or 1)
        """
        probas = self.predict_proba(X)
        return (probas >= threshold ).astype(int)
    
    def score(self, X, y):
        """
        Calculate accuracy score
        
        Parameters:
        X (ndarray): Feature matrix
        y (ndarray): True labels
        
        Returns:
        float: Accuracy score
        """
        predictions = self.predict(X)
        return np.mean(predictions == y)

    def predict_y(self, X_test):
        """
        Predict y values for a new dataset

        Parameters:
        X_test (ndarray): Feature matrix of the test dataset

        Returns:
        ndarray: Predicted y values for the test dataset
        """
    def get_weights(self):  # Add self parameter
        return np.array(self.weights)

    def get_bias(self):  # Add self parameter
        return np.array(self.bias)


In [191]:
if __name__ == "__main__":
    # Generate synthetic data
    df = pd.read_csv("binary_classification_train.csv")
    
    # Assuming the last column is the target (y), and the rest are features (X)
    # X = df.iloc[:, 1:-1].values  # All columns except the last one
    X = df.iloc[:, 1:-1].values
    y = df.iloc[:, -1].values

    
    # Create and train model
    #hyperparaneters
    threshold = 0.5
    learning_rate=0.3
    num_iterations=1000
    model = LogisticRegression(learning_rate, num_iterations , threshold)
    cost_history = model.fit(X, y)
    
    # Make predictions
    y_pred = model.predict(X)
    weights = model.get_weights()
    bias = model.get_bias()
    accuracy = model.score(X, y)
    print(f"\nAccuracy: {accuracy:.4f}")

  return 1 / (1 + np.exp(-z))


Early stopping at iteration 37

Accuracy: 0.9264


In [200]:
    df1 = pd.read_csv("binary_classification_test.csv")


    # Assuming the last column is the target (y), and the rest are features (X)
    # X = df.iloc[:, 1:-1].values  # All columns except the last one
    X_test= df1.iloc[:, 1:].values
    

In [201]:
bias_reshaped = np.repeat(bias, X_test.shape[1])  # Repeat bias for each element in weights
z = np.dot(X_test, weights) + bias

In [202]:
X_test.shape

(12000, 20)

In [203]:
weights.shape

(20,)

In [204]:
bias_reshaped.shape

(20,)

In [205]:
weights

array([ 2.29196937e+00,  1.76401862e+00, -2.09784901e+00, -2.96023379e+00,
       -2.30530950e+00,  3.15727759e+01,  1.80215753e+00,  2.52035515e+00,
        6.30996377e+00,  2.29439420e-02,  1.61275995e-02,  1.39166321e+00,
        5.38947690e-01,  5.93123940e+00,  5.48675081e+00, -1.06660500e+01,
        1.50216997e+01, -1.41436203e+01, -2.06082187e+00,  4.14285001e+00])

In [206]:
y_test

array([-4430.68588658, -1098.23096919, -5262.91560224, ...,
       -1867.45362315,  3759.09519205, -4999.12374546])