In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import auc, roc_curve

In [37]:
def sig(z):
    """Logistic sigmoid 1 / (1 + e^(-z)), evaluated element-wise through NumPy.

    Args:
        z : scalar or NumPy array
    Returns:
        Value(s) in the interval (0, 1) with the same shape as ``z``.
    """
    denominator = np.exp(-z) + 1
    return 1 / denominator

In [12]:
def preprocess_data(X_train, X_test):
    """Encode categorical features and standardize numeric ones for both splits.

    Every transformer is fitted on the training split only and then applied to
    the test split, so both outputs share the same scaling statistics, the same
    one-hot columns and the same column order. A category that occurs only in
    the test split is encoded as all zeros for its feature; a category absent
    from the test split simply yields an all-zero column there.

    Args:
        X_train : DataFrame of training features. Column 0 and columns 8-10 are
                  read by position as the numeric features; the categorical
                  columns listed below are read by name.
        X_test  : DataFrame of test features with the same layout as X_train.
    Returns:
        (train_processed, test_processed) : two DataFrames with a fresh
        RangeIndex and identical columns, in the order
        "age", "<feature>.<category>" ..., "standardized.<name>" ...
    """
    categorical_cols = ['workclass', 'education', 'marital.status', 'occupation',
                        'relationship', 'race', 'sex', 'native.country']
    scaled_names = ["capital.gain", "capital.loss", "hours.per.week"]

    train_parts = []
    test_parts = []

    # Age (first column): learn mean/std from the training split, reuse them on test.
    age_scaler = StandardScaler().fit(X_train.iloc[:, 0:1].to_numpy())
    train_parts.append(pd.DataFrame(age_scaler.transform(X_train.iloc[:, 0:1].to_numpy()),
                                    columns=["age"]))
    test_parts.append(pd.DataFrame(age_scaler.transform(X_test.iloc[:, 0:1].to_numpy()),
                                   columns=["age"]))

    # One-hot encode each categorical feature with the category set seen in training.
    # handle_unknown="ignore" keeps transform() from failing on test-only categories.
    for feature in categorical_cols:
        encoder = OneHotEncoder(handle_unknown="ignore")
        train_encoded = encoder.fit_transform(X_train[feature].to_numpy().reshape(-1, 1))
        test_encoded = encoder.transform(X_test[feature].to_numpy().reshape(-1, 1))
        titles = [feature + "." + category for category in encoder.categories_[0]]
        train_parts.append(pd.DataFrame(train_encoded.toarray(), columns=titles))
        test_parts.append(pd.DataFrame(test_encoded.toarray(), columns=titles))

    # Remaining numeric features (columns 8-10), again scaled with training statistics.
    numeric_scaler = StandardScaler().fit(X_train.iloc[:, 8:11].to_numpy())
    scaled_titles = ["standardized." + name for name in scaled_names]
    train_parts.append(pd.DataFrame(numeric_scaler.transform(X_train.iloc[:, 8:11].to_numpy()),
                                    columns=scaled_titles))
    test_parts.append(pd.DataFrame(numeric_scaler.transform(X_test.iloc[:, 8:11].to_numpy()),
                                   columns=scaled_titles))

    # A single concat avoids the column-by-column insertion that fragments the frame.
    train_processed = pd.concat(train_parts, axis=1)
    test_processed = pd.concat(test_parts, axis=1)
    return train_processed, test_processed


In [64]:
class MyLogisticRegression:
    '''Binary logistic regression trained with batch gradient ascent
    on the mean Bernoulli log-likelihood.
    '''

    def __init__(self, learning_rate=0.01, max_iterations=1000):
        '''Initialize variables
        Args:
            learning_rate  : Learning Rate
            max_iterations : Max iterations for training weights
        '''
        # Initialising all the parameters
        self.learning_rate  = learning_rate
        self.max_iterations = max_iterations
        self.likelihoods    = []

        # None marks the model as unfitted, so predict_proba can raise its
        # intended error instead of an AttributeError.
        self.weights = None

        # Define epsilon because log(0) is not defined
        self.eps = 1e-7

    def sigmoid(self, z):
        '''Sigmoid function: f:R->(0,1)
        Args:
            z : A numpy array (num_samples,)
        Returns:
            A numpy array where sigmoid function applied to every element
        '''
        z = np.asarray(z, dtype=float)

        # exp() is only ever taken of a non-positive number, so it cannot
        # overflow: for z >= 0 use 1/(1+e^-z), for z < 0 use e^z/(1+e^z).
        exp_neg_abs = np.exp(-np.abs(z))
        sig_z = np.where(z >= 0,
                         1.0 / (1.0 + exp_neg_abs),
                         exp_neg_abs / (1.0 + exp_neg_abs))

        assert (z.shape==sig_z.shape), 'Error in sigmoid implementation. Check carefully'
        return sig_z

    def log_likelihood(self, y_true, y_pred):
        '''Calculates the mean Bernoulli log-likelihood of the predictions
        Remember: y * log(yh) + (1-y) * log(1-yh)
        Note: Likelihood is defined for multiple classes as well, but for this dataset
        we only need to worry about binary/bernoulli likelihood function
        Args:
            y_true : Numpy array of actual truth values (num_samples,)
            y_pred : Numpy array of predicted values (num_samples,)
        Returns:
            Mean log-likelihood over the samples, scalar value
        '''
        y_true = np.asarray(y_true, dtype=float)

        # Keep predictions inside [eps, 1-eps] so that log is never taken of 0
        y_pred = np.clip(np.asarray(y_pred, dtype=float), self.eps, 1 - self.eps)

        per_sample = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
        return np.mean(per_sample)

    def fit(self, X, y):
        '''Trains logistic regression model using gradient ascent
        to gain maximum likelihood on the training data.
        Weights and the likelihood history are reset on every call, so
        refitting starts from scratch.
        Args:
            X : Numpy array (num_examples, num_features)
            y : Numpy array (num_examples, )
        Returns: VOID
        '''
        X = np.asarray(X, dtype=float)
        # ravel() guards against a column-shaped y silently broadcasting to (n, n)
        y = np.asarray(y, dtype=float).ravel()

        num_examples, num_features = X.shape

        # Start every fit from zero weights and an empty history
        self.weights = np.zeros(num_features)
        self.likelihoods = []

        # Perform gradient ascent
        for _ in range(self.max_iterations):
            # Linear hypothesis followed by the sigmoid gives the probabilities
            y_pred = self.sigmoid(np.dot(X, self.weights))

            # Mean gradient of the log-likelihood, as one matrix-vector product
            gradient = np.dot(X.T, y - y_pred) / num_examples

            # Caution: It is gradient ASCENT not descent
            self.weights += self.learning_rate * gradient

            # Likelihood of the predictions made before this update
            self.likelihoods.append(self.log_likelihood(y, y_pred))

    def predict_proba(self,X):
        '''Predict probabilities for given X.
        Remember sigmoid returns value between 0 and 1.
        Args:
            X : Numpy array (num_samples, num_features)
        Returns:
            probabilities: Numpy array (num_samples,)
        Raises:
            Exception: if the model has not been fitted yet
        '''
        if self.weights is None:
            raise Exception("Fit the model before prediction")

        z = np.dot(np.asarray(X, dtype=float), self.weights)
        return self.sigmoid(z)

    def predict(self, X, threshold=0.5):
        '''Predict/Classify X in classes
        Args:
            X         : Numpy array (num_samples, num_features)
            threshold : scalar value above which prediction is 1 else 0
        Returns:
            binary_predictions : Numpy array (num_samples,)
        '''
        # Thresholding probability to predict binary values (strictly above -> 1)
        binary_predictions = (self.predict_proba(X) > threshold).astype(int)

        return binary_predictions

In [65]:
# Read the cleaned splits, then separate the 'income' target from the feature columns.
train_data = pd.read_csv('cleaned_train_data.csv')
test_data = pd.read_csv('cleaned_test_data.csv')
X_train, y_train = train_data.drop(columns=['income']), train_data['income']
X_test, y_test = test_data.drop(columns=['income']), test_data['income']

In [66]:
# Build the encoded and standardized feature tables for both splits.
train_processed, test_processed = preprocess_data(X_train, X_test)

In [67]:
# Prepend a column of ones to the processed features so the first weight acts as the intercept.
X = np.concatenate((np.ones((len(train_processed), 1)), train_processed), axis=1)
y = y_train.to_numpy()

In [71]:
# Train on the design matrix with step size 0.75 for 250 gradient-ascent iterations.
model = MyLogisticRegression(learning_rate=0.75, max_iterations=250)
model.fit(X, y)

In [72]:
# Learned coefficients; index 0 pairs with the column of ones prepended to X (the intercept).
model.weights

array([-7.29364656e-01,  2.98929989e-01,  2.46610047e-01, -1.47097409e-01,
       -3.14363938e-03, -2.10291716e-01,  8.81068606e-02, -5.54069124e-01,
       -1.29215468e-01, -2.02642068e-02, -6.18641721e-02, -1.14673045e-01,
        4.06122571e-01,  4.09199646e-01, -7.11330627e-01,  5.66435098e-01,
        5.12257622e-01, -1.37072111e+00, -3.64790636e-01, -3.45115641e-01,
        1.69789053e-02,  9.61287795e-01, -9.95405174e-02, -8.71998133e-01,
       -1.89706878e-01, -2.01270188e-01, -1.16700518e-01, -2.55957705e-03,
       -4.24527488e-02,  7.27357144e-01, -5.99665621e-01, -3.90533134e-01,
       -3.22289869e-01, -6.33616193e-01, -6.03310740e-02,  3.47251613e-01,
        1.25292188e-01,  1.63803220e-01,  2.86488153e-01, -2.11408239e-01,
        2.21095024e-01, -2.75653638e-01, -2.60033050e-01, -7.30305013e-01,
       -4.88120523e-01,  8.03652544e-01, -4.99326296e-01, -2.30038360e-01,
       -5.83149122e-01, -1.46215534e-01,  1.02777766e-02, -7.54098508e-05,
       -2.06652899e-02, -

In [73]:
# Mean log-likelihood recorded at every gradient-ascent iteration of fit().
model.likelihoods

[-0.6931471805599453,
 -0.5182731876053012,
 -0.4796228585645618,
 -0.4556134912768147,
 -0.43886383278695484,
 -0.42653690352731677,
 -0.41708838431260487,
 -0.409603459022619,
 -0.4035114884154514,
 -0.3984407741512817,
 -0.3941400249277944,
 -0.39043384210621335,
 -0.3871963891116437,
 -0.38433521058125947,
 -0.381780947513023,
 -0.37948060277941487,
 -0.3773930141895813,
 -0.37548573988013006,
 -0.37373287055140775,
 -0.3721134639542405,
 -0.3706104057699039,
 -0.3692095681224713,
 -0.3678991793562068,
 -0.3666693460786654,
 -0.3655116864882112,
 -0.36441904608193243,
 -0.36338527506971574,
 -0.36240505251405686,
 -0.3614737462101086,
 -0.3605873001601037,
 -0.359742143539213,
 -0.3589351165363182,
 -0.3581634095460259,
 -0.3574245129997418,
 -0.35671617573167586,
 -0.3560363702352787,
 -0.3553832635158741,
 -0.3547551925141357,
 -0.3541506432831256,
 -0.3535682332635244,
 -0.3530066961286633,
 -0.35246486877105937,
 -0.3519416800815186,
 -0.3514361412352291,
 -0.35094733724997074,