In [None]:
import pandas as pd
import numpy as np
import os 
from math import e
from sklearn.linear_model import LogisticRegression

In [None]:
class LogisticRegressionScratch(object):
    def __init__(self, tolerance = 10**-8, max_iterations = 20):
        
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.weights_array = None #current weights + intercept 
        self.prior_w = None # previous weights + intercept 
        
        # final values for the weights and intercept
        self.weights = None
        self.intercept = None 

        
    def predict_proba(self, X):
       
        XW = np.dot(X, self.weights_array)
        P = 1 / (1 + np.exp(-XW))
        return P

    
    
    def compute_gradient(self, X, y, P):
        
        G = -np.dot((y-P).T, X)      
        return G
        
    def compute_hessian(self, X, P):
    
        Q = P * (1 - P) 
        XQ = X.T * Q
        H = np.dot(XQ, X)
        return H


    def update_weights(self, X, y):
      
        P = self.predict_proba(X)
        G = self.compute_gradient(X,y,P)
        H = self.compute_hessian(X,P)
        self.prior_w = self.weights_array.copy() 
        self.weights_array -= np.dot(np.linalg.inv(H), G)
        
        
        
           
    def check_stop(self):
     
        w_old_norm = self.prior_w / np.linalg.norm(self.prior_w)
        w_new_norm = self.weights_array / np.linalg.norm(self.weights_array)
        diff = w_old_norm - w_new_norm
        distance = np.sqrt(np.dot(diff,diff))
        if distance < self.tolerance:
            stop = True
        if distance > self.tolerance:
            stop = False 
        return stop
        
        
    def fit(self, X, y):
     
        #setting initial weights +  extra dimension for the intercept
        self.weights_array = np.zeros(X.shape[1] + 1)
        
        #Initializing the slope parameter to log(base rate/(1-base rate))
        self.weights_array[-1] = np.log(y.mean() / (1-y.mean()))
        
        #creating a new X matrix that includes a column of ones for the intercept
        X_int = np.hstack((X, np.ones((X.shape[0],1))))

        for i in range(self.max_iterations):
            self.update_weights(X_int, y)
            
            stop = self.check_stop()
            if stop:
                self.set_final_weights()
                self.set_final_intercept()
                break
                
    
    def set_final_weights(self):
        self.weights = self.weights_array[0:-1]
        
    def set_final_intercept(self):
        self.intercept = self.weights_array[-1]  
        
    def get_weights(self):
        return self.weights
    
    def get_intercept(self):
        return self.intercept
        

In [None]:
# Build the path to the training CSV under ./data (relative to the current
# working directory) and read it, treating the first row as the header.
filename = os.path.join(
    os.getcwd(),
    "data",
    "airbnbData_train.csv",
)
df = pd.read_csv(filename, header=0)

In [None]:
# Columns used as model inputs when predicting the superhost label
feature_list = [
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_value',
    'host_response_rate',
    'host_acceptance_rate',
]
feature_list

In [None]:
# Split the frame into the feature matrix (columns named in feature_list)
# and the label column to be predicted.
X, y = df[feature_list], df['host_is_superhost']

In [None]:
# Create the from-scratch logistic regression model with its default
# tolerance and iteration cap, then fit it on the selected features and labels.
lr = LogisticRegressionScratch()
lr.fit(X, y)

In [None]:
# Show the parameters learned by the from-scratch model
print('The fitted weights and intercept are:')
print(
    lr.get_weights(),
    lr.get_intercept(),
)

In [None]:
# Reference model: scikit-learn's LogisticRegression. C is the inverse
# regularization strength, so a very large C makes the penalty negligible.
lr_sk = LogisticRegression(
    C=10**10,
)
lr_sk.fit(X, y)

In [None]:
# Show the parameters learned by the scikit-learn model for comparison
print('The fitted weights and intercept with sklearn are:')
print(
    lr_sk.coef_,
    lr_sk.intercept_,
)

In [None]:
# Fitting the logistic regression model lr on the training data
