In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import auc, roc_curve

In [2]:
train_data = pd.read_csv('cleaned_train_data.csv')
test_data = pd.read_csv('cleaned_test_data.csv')
X_train = train_data.drop(['income'], axis=1, inplace=False)
y_train = train_data['income']
X_test = test_data.drop(['income'], axis=1, inplace=False)
y_test = test_data['income']

In [7]:
def preprocess_data(X_train, X_test):
    # Todo: encode categorical features using OneHotEncoder class
    # Todo: standardize numerical features using StandardScaler()class
    # Todo: return processed training and test sets
    train_processed = pd.DataFrame(data = {"age": StandardScaler().fit_transform(X_train.iloc[:, 0:1]).T[0]})
    test_processed = pd.DataFrame(data = {"age": StandardScaler().fit_transform(X_test.iloc[:, 0:1]).T[0]})
    col = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
    ohe = OneHotEncoder()
    for i in col:
        train_col = np.unique(X_train[i].values)
        test_col = np.unique(X_test[i].values)
        train = pd.DataFrame(data=ohe.fit_transform(X_train[i].values.reshape(-1, 1)).toarray(), columns=train_col)
        test = pd.DataFrame(data=ohe.fit_transform(X_test[i].values.reshape(-1, 1)).toarray(), columns=test_col)
        
        for f in train_col:
            title = i + "." + f
            train_processed[title] = train[f]
            
        for t in test_col:
            title = i + "." + t
            test_processed[title] = test[t]
            
    stnd_col = ["capital.gain", "capital.loss", "hours.per.week"]
    stnd_train = StandardScaler().fit_transform(X_train.iloc[:, 8:11]).T 
    stnd_test = StandardScaler().fit_transform(X_test.iloc[:, 8:11]).T
    for z in range(3):
        title = "standardized." + stnd_col[z]
        train_processed[title] = stnd_train[z]
        test_processed[title] = stnd_test[z]
    
    # might have to add columns of zeros for test data that doesnt contain all outcomes as train
    # columns would be {'native.country.Holand-Netherlands'(col 63),'workclass.Never-worked'(col 3),'workclass.Without-pay'(col 8)}
    test_processed.insert(3, "workclass.Never-worked", np.zeros((len(test_processed))))
    test_processed.insert(8, "workclass.Without-pay", np.zeros((len(test_processed))))
    test_processed.insert(63, "native.country.Holand-Netherlands", np.zeros((len(test_processed))))
    return train_processed, test_processed

train_processed, test_processed = preprocess_data(X_train, X_test)

In [17]:
X_train = train_data.drop(['income'], axis=1, inplace=False)
X = np.hstack((np.ones((X_train.shape[0], 1)), train_processed))
X

array([[ 1.        ,  0.6899456 ,  0.        , ..., -0.2325727 ,
        -0.21558026, -0.03580076],
       [ 1.        ,  1.1296067 ,  0.        , ..., -0.2325727 ,
         3.94933449,  0.77598017],
       [ 1.        ,  0.32356136,  0.        , ..., -0.2325727 ,
        -0.21558026, -0.03580076],
       ...,
       [ 1.        ,  0.39683821,  1.        , ..., -0.2325727 ,
        -0.21558026,  1.18187063],
       [ 1.        , -1.14197563,  0.        , ..., -0.2325727 ,
        -0.21558026, -0.03580076],
       [ 1.        , -0.55576084,  0.        , ..., -0.2325727 ,
        -0.21558026, -0.03580076]])

In [159]:
y_train = train_data['income']
y = y_train.values
y

array([0, 0, 1, ..., 1, 0, 0])

In [169]:
def predict_prob(samples, w):
        # Todo: Compute and return the predicted probability of each sample having an income >= 50K
        z = np.dot(samples, w)
        return 1 / (1 + np.exp(-z))

In [165]:
def predict(samples, threshold=0.5):
        return predict_prob(samples) >= threshold

In [171]:
def compute_gradient(y, X, w):
        # Todo: compute and return gradient of the average log likelihood on the training set
        # y is a vector
        # X is feature matrix
        #predict_prob(X, w) is the probability of every row given the weights and a row of data
        diff = y - predict_prob(X, w)
        gradient = np.dot(X.T, diff)
        return gradient

In [167]:
    def gradient_ascent(X, y, lr, its):
        # Todo: Run gradient ascent to learn model weights
        # Todo: Store the average log likelihood and the prediction accuracy 
        #       on the training and test sets after every gradient ascent iteration
        
        # need y vector and X array(featuresxexamples)
        w = np.zeros((X.shape[1]))
        
        for i in range(its):
            gradient = compute_gradient(y, X, w)
            w = w + (lr * gradient)
            y_hat = predict_prob(X, w)
            
            ll = (1/len(y_hat)) * sum(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))
            
        return w

In [172]:
gradient_ascent(X, y, 0.1, 100)

ValueError: operands could not be broadcast together with shapes (94,) (94,26049) 

In [163]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_regression(X, y, learning_rate=0.01, num_iterations=1000):
    m, n = X.shape  

    weights = np.zeros((n, 1))  # Initialize weights to zeros

    for iteration in range(num_iterations):
        scores = np.dot(X, weights)
        predictions = sigmoid(scores)

        errors = y - predictions

        gradient = np.dot(X.T, errors)
        weights += learning_rate * gradient

        # Calculate log-likelihood
        log_likelihood = np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))

        print(f"Iteration {iteration + 1}/{num_iterations}, Log-Likelihood: {log_likelihood}")

    return weights

# Example usage:
# X_train is your feature matrix, where each row is a sample and each column is a feature
# y_train is the corresponding binary label (0 or 1)
# Make sure X_train and y_train are NumPy arrays
y_train = train_data['income']
y = y_train.values
y = y.reshape(-1, 1)

X_train = train_data.drop(['income'], axis=1, inplace=False)
X = np.hstack((np.ones((X_train.shape[0], 1)), train_processed))
X

# Train logistic regression model
logistic_regression(X, y, learning_rate=0.01, num_iterations=100)

Iteration 1/100, Log-Likelihood: -18055.790906406015
Iteration 2/100, Log-Likelihood: nan
Iteration 3/100, Log-Likelihood: nan
Iteration 4/100, Log-Likelihood: nan
Iteration 5/100, Log-Likelihood: nan


  log_likelihood = np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
  log_likelihood = np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))


Iteration 6/100, Log-Likelihood: nan
Iteration 7/100, Log-Likelihood: nan
Iteration 8/100, Log-Likelihood: nan
Iteration 9/100, Log-Likelihood: nan
Iteration 10/100, Log-Likelihood: nan
Iteration 11/100, Log-Likelihood: nan
Iteration 12/100, Log-Likelihood: nan
Iteration 13/100, Log-Likelihood: nan
Iteration 14/100, Log-Likelihood: nan
Iteration 15/100, Log-Likelihood: nan
Iteration 16/100, Log-Likelihood: nan
Iteration 17/100, Log-Likelihood: nan
Iteration 18/100, Log-Likelihood: nan
Iteration 19/100, Log-Likelihood: nan
Iteration 20/100, Log-Likelihood: nan
Iteration 21/100, Log-Likelihood: nan
Iteration 22/100, Log-Likelihood: nan
Iteration 23/100, Log-Likelihood: nan
Iteration 24/100, Log-Likelihood: nan
Iteration 25/100, Log-Likelihood: nan


  return 1 / (1 + np.exp(-x))


Iteration 26/100, Log-Likelihood: nan
Iteration 27/100, Log-Likelihood: nan
Iteration 28/100, Log-Likelihood: nan
Iteration 29/100, Log-Likelihood: nan
Iteration 30/100, Log-Likelihood: nan
Iteration 31/100, Log-Likelihood: nan
Iteration 32/100, Log-Likelihood: nan
Iteration 33/100, Log-Likelihood: nan
Iteration 34/100, Log-Likelihood: nan
Iteration 35/100, Log-Likelihood: nan
Iteration 36/100, Log-Likelihood: nan
Iteration 37/100, Log-Likelihood: nan
Iteration 38/100, Log-Likelihood: nan
Iteration 39/100, Log-Likelihood: nan
Iteration 40/100, Log-Likelihood: nan
Iteration 41/100, Log-Likelihood: nan
Iteration 42/100, Log-Likelihood: nan
Iteration 43/100, Log-Likelihood: nan
Iteration 44/100, Log-Likelihood: nan
Iteration 45/100, Log-Likelihood: nan
Iteration 46/100, Log-Likelihood: nan
Iteration 47/100, Log-Likelihood: nan
Iteration 48/100, Log-Likelihood: nan
Iteration 49/100, Log-Likelihood: nan
Iteration 50/100, Log-Likelihood: nan
Iteration 51/100, Log-Likelihood: nan
Iteration 52

array([[-6.71171783e+01],
       [ 4.06820189e+00],
       [ 2.40885453e+01],
       [-1.52647584e+01],
       [-4.24810835e-01],
       [-3.49286168e+01],
       [ 2.20816694e+01],
       [-4.74258704e+01],
       [-1.27324119e+01],
       [-2.51092454e+00],
       [-9.14671155e+00],
       [-2.01656450e+01],
       [ 4.10900578e+01],
       [ 5.79691332e+01],
       [-9.56491505e+01],
       [ 6.46990603e+01],
       [ 7.28893785e+01],
       [-1.19905523e+02],
       [-5.88977778e+01],
       [-2.52492525e+01],
       [ 2.42230891e+00],
       [ 4.66001865e+01],
       [-6.79993633e+00],
       [-5.93432054e+01],
       [-1.26481004e+01],
       [-1.20991791e+01],
       [-6.17261189e+00],
       [-2.30246529e-01],
       [-2.63610802e+01],
       [ 8.35475733e+01],
       [-6.60397993e+01],
       [-3.86321776e+01],
       [-3.28467286e+01],
       [-4.33186545e+01],
       [-5.24774082e+00],
       [ 3.33527845e+01],
       [ 1.18034776e+01],
       [ 1.86859341e+01],
       [ 3.5