In [1]:
import pandas as pd

df = pd.read_csv('melb_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [2]:
# removing irrelevant columns 
df = df.drop(['Address','Method','SellerG', 'Date','Bedroom2', 'Propertycount','BuildingArea', 'YearBuilt','CouncilArea','Lattitude','Longtitude'], axis=1)

In [3]:
df.Car.describe()

count    13518.000000
mean         1.610075
std          0.962634
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max         10.000000
Name: Car, dtype: float64

In [4]:
#fill in missing numeric values with median for Car
df.Car = df.Car.fillna(df.Car.median())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Suburb      13580 non-null  object 
 1   Rooms       13580 non-null  int64  
 2   Type        13580 non-null  object 
 3   Price       13580 non-null  float64
 4   Distance    13580 non-null  float64
 5   Postcode    13580 non-null  float64
 6   Bathroom    13580 non-null  float64
 7   Car         13580 non-null  float64
 8   Landsize    13580 non-null  float64
 9   Regionname  13580 non-null  object 
dtypes: float64(6), int64(1), object(3)
memory usage: 1.0+ MB


In [5]:
from sklearn.preprocessing import MinMaxScaler

cols_to_norm = ['Rooms','Distance','Bathroom','Car','Landsize']
df[cols_to_norm] = MinMaxScaler().fit_transform(df[cols_to_norm])

In [6]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,Regionname
0,Abbotsford,0.111111,h,1480000.0,0.051975,3067.0,0.125,0.1,0.000466,Northern Metropolitan
1,Abbotsford,0.111111,h,1035000.0,0.051975,3067.0,0.125,0.0,0.00036,Northern Metropolitan
2,Abbotsford,0.222222,h,1465000.0,0.051975,3067.0,0.25,0.0,0.000309,Northern Metropolitan
3,Abbotsford,0.222222,h,850000.0,0.051975,3067.0,0.25,0.1,0.000217,Northern Metropolitan
4,Abbotsford,0.333333,h,1600000.0,0.051975,3067.0,0.125,0.2,0.000277,Northern Metropolitan


In [7]:
df = pd.concat([df,pd.get_dummies(df['Type'], prefix='Type')],axis=1)
df.drop(['Type'],axis=1, inplace=True)

df = pd.concat([df,pd.get_dummies(df['Regionname'], prefix='Regionname')],axis=1)
df.drop(['Regionname'],axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Suburb                                 13580 non-null  object 
 1   Rooms                                  13580 non-null  float64
 2   Price                                  13580 non-null  float64
 3   Distance                               13580 non-null  float64
 4   Postcode                               13580 non-null  float64
 5   Bathroom                               13580 non-null  float64
 6   Car                                    13580 non-null  float64
 7   Landsize                               13580 non-null  float64
 8   Type_h                                 13580 non-null  uint8  
 9   Type_t                                 13580 non-null  uint8  
 10  Type_u                                 13580 non-null  uint8  
 11  Re

In [8]:
df = pd.concat([df,pd.get_dummies(df['Postcode'], prefix='Postcode')],axis=1)
df.drop(['Postcode'],axis=1, inplace=True)

df = pd.concat([df,pd.get_dummies(df['Suburb'], prefix='Suburb')],axis=1)
df.drop(['Suburb'],axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Columns: 529 entries, Rooms to Suburb_Yarraville
dtypes: float64(6), uint8(523)
memory usage: 7.4 MB


In [9]:
bins = [0, 500000, 1000000, 10000000]
category = [0, 1, 2]
df['Price_Category'] = pd.cut(df['Price'], bins, labels=category)

df.head()

Unnamed: 0,Rooms,Price,Distance,Bathroom,Car,Landsize,Type_h,Type_t,Type_u,Regionname_Eastern Metropolitan,...,Suburb_Williamstown,Suburb_Williamstown North,Suburb_Windsor,Suburb_Wollert,Suburb_Wonga Park,Suburb_Wyndham Vale,Suburb_Yallambie,Suburb_Yarra Glen,Suburb_Yarraville,Price_Category
0,0.111111,1480000.0,0.051975,0.125,0.1,0.000466,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0.111111,1035000.0,0.051975,0.125,0.0,0.00036,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0.222222,1465000.0,0.051975,0.25,0.0,0.000309,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0.222222,850000.0,0.051975,0.25,0.1,0.000217,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.333333,1600000.0,0.051975,0.125,0.2,0.000277,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [10]:
totals = df['Price_Category'].value_counts()
y = df.Price_Category
df.drop(['Price'],axis=1, inplace=True)
df.drop(['Price_Category'],axis=1, inplace=True)
totals

1    6240
2    5743
0    1597
Name: Price_Category, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, train_size=0.8, shuffle=True)
y_train = y_train.values.tolist()
y_test = y_test.values.tolist()

In [12]:
total_0 = y_train.count(0)
total_1 = y_train.count(1)
total_2 = y_train.count(2)

print(total_0)
print(total_1)
print(total_2)

1283
4967
4614


# 2. Modeling

In [13]:
import numpy as np
from scipy.special import expit
from numpy.linalg import pinv
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
from scipy.optimize import fmin_bfgs # maybe the most common bfgs algorithm in the world
from numpy import ma

In [14]:
class BinaryLogisticRegression:
    def __init__(self, eta, iterations=20, C=0.001, regularization='none'):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.regularization = regularization
        # internally we will store the weights as self.w_ to keep with sklearn conventions
        
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    # convenience, private:
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # vectorized gradient calculation with regularization using L2 Norm
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        gradient = gradient.reshape(self.w_.shape)
        
        if (self.regularization == 'L2'):
            gradient[1:] += -2 * self.w_[1:] * self.C
        elif(self.regularization == 'L1'):
            gradient[1:] += (np.sign(self.w_[1:])) * self.C
        elif(self.regularization == 'both'):
            gradient[1:] += (np.sign(self.w_[1:])) * self.C + (-2 * self.w_[1:] * self.C)
        
        return gradient
    
    # public:
    def predict_proba(self,X,add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    
    
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 
            # add bacause maximizing 


In [15]:
class BFGSBinaryLogisticRegression(BinaryLogisticRegression):
    
    @staticmethod
    def objective_gradient(w,X,y,C,regularization):
        print(f"W2: {w}")
        print(f"X2: {X.shape}")
        print(f"y2: {y.shape}")
        g = expit(X @ w)
        ydiff = y-g # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        
        if (regularization == 'L2'):
            gradient[1:] += -2 * w[1:] * C
        elif(regularization == 'L1'):
            gradient[1:] += (np.sign(w[1:])) * C
        elif(regularization == 'both'):
            gradient[1:] += ((np.sign(w[1:])) * C) + (-2 * w[1:] * C)
        return -gradient
    
    @staticmethod
    def objective_function(w,X,y,C,regularization):
        g = expit(X @ w)
        # invert this because scipy minimizes, but we derived all formulas for maximzing
        return -np.sum(ma.log(g[y==1]))-np.sum(ma.log(1-g[y==0])) + C*sum(w**2) 
        #-np.sum(y*np.log(g)+(1-y)*np.log(1-g))
    
    # just overwrite fit function
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = fmin_bfgs(self.objective_function, # what to optimize
                            np.zeros((num_features,1)), # starting point
                            fprime=self.objective_gradient, # gradient function
                            args=(Xb,y,self.C,self.regularization), # extra args for gradient and objective function
                            gtol=1e-03, # stopping criteria for gradient, |v_k|
                            maxiter=self.iters, # stopping criteria iterations
                            disp=False)
        
        self.w_ = self.w_.reshape((num_features,1))

In [16]:
class MultiClassLogisticRegression:
    def __init__(self, eta, iterations=20, 
                 C=0.0001, 
                 solver=BFGSBinaryLogisticRegression,
                regularization='none'):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.solver = solver
        self.classifiers_ = []
        self.regularization = regularization
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.sort(np.unique(y)) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = []
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = np.array(y==yval).astype(int) # create a binary problem
            # train the binary classifier for this class
            
            hblr = self.solver(eta=self.eta,iterations=self.iters,C=self.C, regularization = self.regularization)
            hblr.fit(X,y_binary)

            # add the trained classifier to the list
            self.classifiers_.append(hblr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for hblr in self.classifiers_:
            probs.append(hblr.predict_proba(X).reshape((len(X),1))) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1) # take argmax along row

In [17]:
def run_lr(C,solver,regularization):
    lr = MultiClassLogisticRegression(eta=1, iterations=20, C=C, solver=solver, regularization=regularization)
    lr.fit(X_train, y_train)
    yhat = lr.predict(X_test)
    acc = accuracy_score(y_test,yhat)
    return acc

# 4. Exceptional Work

In [29]:
class BFGSBinaryLogisticRegression(BinaryLogisticRegression):
    
    @staticmethod
    def objective_gradient(w,X,y,C,regularization):
        print(f"W2: {w.shape}")
        print(f"X2: {X.shape}")
        print(f"y2: {y.shape}")
        g = expit(X @ w)
        ydiff = y-g # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        
        if (regularization == 'L2'):
            gradient[1:] += -2 * w[1:] * C
        elif(regularization == 'L1'):
            gradient[1:] += (np.sign(w[1:])) * C
        elif(regularization == 'both'):
            gradient[1:] += ((np.sign(w[1:])) * C) + (-2 * w[1:] * C)
        return -gradient
    
    @staticmethod
    def objective_function(w,X,y,C,regularization):
        g = expit(X @ w)
        # invert this because scipy minimizes, but we derived all formulas for maximzing
        return -np.sum(ma.log(g[y==1]))-np.sum(ma.log(1-g[y==0])) + C*sum(w**2) 
        #-np.sum(y*np.log(g)+(1-y)*np.log(1-g))
    
    # just overwrite fit function
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = fmin_bfgs(self.objective_function, # what to optimize
                            np.zeros((num_features,1)), # starting point
                            fprime=self.objective_gradient, # gradient function
                            args=(Xb,y,self.C,self.regularization), # extra args for gradient and objective function
                            gtol=1e-03, # stopping criteria for gradient, |v_k|
                            maxiter=self.iters, # stopping criteria iterations
                            disp=False)
        
        self.w_ = self.w_.reshape((num_features,1))

In [33]:
class CustomBFGSBinaryLogisticRegression(BinaryLogisticRegression):
    
    @staticmethod
    def objective_gradient(w,X,y,C,regularization):
        print(f"W1: {w.shape}")
        print(f"X1: {X.shape}")
        print(f"y1: {y.shape}")

        g = expit(X @ w)
        ydiff = y-g # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        
        if (regularization == 'L2'):
            gradient[1:] += -2 * w[1:] * C
        elif(regularization == 'L1'):
            gradient[1:] += (np.sign(w[1:])) * C
        elif(regularization == 'both'):
            gradient[1:] += ((np.sign(w[1:])) * C) + (-2 * w[1:] * C)
        return -gradient
    
    @staticmethod
    def objective_function(w,X,y,C,regularization):
        g = expit(X @ w)
        # invert this because scipy minimizes, but we derived all formulas for maximzing
        return -np.sum(ma.log(g[y==1]))-np.sum(ma.log(1-g[y==0])) + C*sum(w**2) 
        #-np.sum(y*np.log(g)+(1-y)*np.log(1-g))
    
    @staticmethod
    def BFGS(f, x0, fprime, args, gtol, maxiter):
        eta =.001
        h = np.identity(len(x0))
        hinverse = h
        print("Hello")
        x0 = np.arange(x0.shape[0])
        g = fprime(x0, args[0], args[1], args[2], args[3])
        i = 0
        w = x0
        while np.linalg.norm(g) > gtol:
            if i > maxiter:
                break
            i += 1
            p = -hinverse @ g
            w += p * eta
            s = p * eta
            v = fprime(w, args[0], args[1], args[2], args[3]) - g
            u = v - (H @ s)
            hnew = h + (v@v.T)/(v.T@s) - (H@s@s.T@H)/(s.T@H@s)
            hinverse = hinverse + (((s.T@v + hinverse)@(s@s.T))/(s.T@v)**2)-((hinverse@v@s.T + s@v.T@hinverrse)/(s.T@v))
            h = hnew
        return w

    # just overwrite fit function
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = self.BFGS(self.objective_function, # what to optimize
                            np.zeros((num_features,1)), # starting point
                            fprime=self.objective_gradient, # gradient function
                            args=(Xb,y,self.C,self.regularization), # extra args for gradient and objective function
                            gtol=1e-03, # stopping criteria for gradient, |v_k|
                            maxiter=self.iters, # stopping criteria iterations
                      )
        
        self.w_ = self.w_.reshape((num_features,1))

In [34]:
lr = MultiClassLogisticRegression(eta=1, iterations=1, C=.001, solver=CustomBFGSBinaryLogisticRegression, regularization='L2')
lr1 = MultiClassLogisticRegression(eta=1, iterations=1, C=.001, solver=BFGSBinaryLogisticRegression, regularization='L2')


In [35]:
lr1.fit(X_train, y_train)
lr.fit(X_train, y_train)
yhat = lr.predict(X_test)
acc = accuracy_score(y_test,yhat)

W2: (529,)
X2: (10864, 529)
y2: (10864,)
W2: (529,)
X2: (10864, 529)
y2: (10864,)
W2: (529,)
X2: (10864, 529)
y2: (10864,)
W2: (529,)
X2: (10864, 529)
y2: (10864,)
W2: (529,)
X2: (10864, 529)
y2: (10864,)
W2: (529,)
X2: (10864, 529)
y2: (10864,)
Hello
W1: (529,)
X1: (10864, 529)
y1: (10864,)


NameError: name 'eta' is not defined