# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's "darkgrid" style for the plots drawn in this notebook
sns.set_style("darkgrid")

In [2]:
# Read the 'ML3 - Banana' sheet of the workbook into a DataFrame and preview the first rows
data = pd.read_excel(
    io='ML1_dataset.xlsx',
    sheet_name='ML3 - Banana',
)
data.head(n=5)

Unnamed: 0,Image,R,G,B,Ripeness
0,1,247.277,235.203,203.795,1
1,2,242.005,227.006,162.201,1
2,3,176.264,181.173,158.842,0
3,4,187.179,194.187,170.547,0
4,5,237.265,225.627,197.295,1


In [3]:
# Count how many rows carry each Ripeness label
data.loc[:, 'Ripeness'].value_counts()

1    21
0    21
Name: Ripeness, dtype: int64

In [4]:
# Normalize features: divide each colour channel by 255.
# NOTE: the columns are overwritten in place, so re-running this cell divides them again.
for channel in ('R','G','B'):
    data[channel] = data[channel]/255

In [5]:
# Show the first ten rows after rescaling
data.iloc[:10]

Unnamed: 0,Image,R,G,B,Ripeness
0,1,0.969714,0.922365,0.799196,1
1,2,0.949039,0.89022,0.636082,1
2,3,0.691231,0.710482,0.62291,0
3,4,0.734035,0.761518,0.668812,0
4,5,0.930451,0.884812,0.773706,1
5,6,0.697047,0.614486,0.394533,1
6,7,0.773522,0.839506,0.671106,0
7,8,0.631867,0.593824,0.357459,1
8,9,0.893443,0.895733,0.704047,0
9,10,0.947004,0.896431,0.669286,1


### Split into training and test datasets

In [6]:
# Fixed seed so the 30-row training sample (and therefore every number computed
# from it below) is the same on every Restart & Run All.
SPLIT_SEED = 42

# Draw 30 random rows for training; labels become an (n, 1) column vector.
train_set = data.sample(n=30,random_state=SPLIT_SEED)
train_expected = np.array(train_set['Ripeness']).reshape(-1,1)

# Every row whose index label was not sampled goes to the test set.
test_set = data.drop(train_set.index,axis=0)
test_expected = np.array(test_set['Ripeness']).reshape(-1,1)

# The two sets must partition the data (no dropped or duplicated rows).
assert len(train_set) + len(test_set) == len(data)

print('Train dataset:',train_set.shape)
print('Test dataset:',test_set.shape)

Train dataset: (30, 5)
Test dataset: (12, 5)


### Train Set Feature Vector

In [7]:
# Build the training feature array: a leading column of ones followed by R, G, B
num_features = 4
feature = np.hstack([
    np.ones((len(train_set),1)),
    train_set[['R','G','B']].to_numpy(dtype=float),
])

print(feature.shape)
print(train_expected.shape)

(30, 4)
(30, 1)


In [8]:
def sigmoid(z,beta):
    """Return the logistic function 1 / (1 + exp(-beta*z)).

    z    : scalar or NumPy array of pre-activation values.
    beta : scalar multiplier applied to z inside the exponential.
    """
    return 1 / (1 + np.exp(-beta*z))

In [9]:
def logistic_regression(X,expected,beta,num_iterations,learning_constant,seed=None):
    """Fit logistic-regression weights with full-batch iterative updates.

    Each iteration computes a = sigmoid(X @ w.T, beta) and then applies
    w += learning_constant * (expected - a).T @ X.
    The squared-error value 0.5 * sum((a - expected)**2) is recorded every
    iteration for monitoring only; it is not used in the update itself.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature matrix.
    expected : ndarray, shape (n_samples, 1)
        Target values, one row per sample.
    beta : float
        Multiplier forwarded to ``sigmoid``.
    num_iterations : int
        Number of weight updates to perform.
    learning_constant : float
        Step size applied to each weight change.
    seed : int or None, optional
        When given, the initial weights are drawn from a generator seeded with
        this value so the fit is reproducible. When None (default) the weights
        come from ``np.random.randn``, i.e. NumPy's global random state.

    Returns
    -------
    a : ndarray, shape (n_samples, 1)
        Activations of X under the returned (final) weights.
    w : ndarray, shape (1, n_features)
        Final weights.
    cost : list of float
        Squared-error value recorded before each update.

    Side effects
    ------------
    Prints the cost recorded in the last iteration.
    """
    #initialize weight
    if seed is None:
        w = np.random.randn(1,X.shape[1])
    else:
        w = np.random.default_rng(seed).standard_normal((1,X.shape[1]))

    cost = []
    for i in range(num_iterations):
        #apply activation function to the current linear response
        a = sigmoid(np.dot(X,w.T),beta)

        #track the squared error of the current weights
        cost_func = 0.5*np.sum((a-expected)**2)
        cost.append(cost_func)

        if i == num_iterations-1:
            print(cost_func)

        #update weight
        w += learning_constant * np.dot((expected-a).T,X)

    # Recompute the activations with the final weights: inside the loop `a` is
    # evaluated before the update, so it would otherwise lag `w` by one step
    # (and would be undefined when num_iterations == 0).
    a = sigmoid(np.dot(X,w.T),beta)

    return a, w, cost

In [10]:
# Fit on the training features; keep the activations, weights and cost history
a_train, w, cost = logistic_regression(
    X=feature,
    expected=train_expected,
    beta=0.2,
    num_iterations=1000,
    learning_constant=0.1,
)
print("w shape:",w.shape)

2.7476320616276526
w shape: (1, 4)


In [11]:
# Put predicted activations and expected labels side by side, then show 10 random rows
output_train = pd.DataFrame(
    np.column_stack((a_train.flatten(), train_expected.flatten())),
    columns=['predicted','expected'],
)
output_train.sample(n=10)

Unnamed: 0,predicted,expected
15,0.459466,0.0
8,0.718896,1.0
6,0.699989,1.0
1,0.241731,0.0
29,0.540884,1.0
23,0.42083,0.0
11,0.71017,1.0
22,0.736208,1.0
20,0.627307,1.0
10,0.645134,0.0


In [12]:
def accuracy(X,y,w,beta,threshold):
    """Return the percentage of samples classified correctly at a threshold.

    The model output is sigmoid(X @ w.T, beta). A sample counts as correct when
    its output is >= threshold and its label is 1, or its output is < threshold
    and its label is 0. An output exactly equal to the threshold is treated as
    a positive prediction, so it is no longer counted as wrong for both classes.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : array-like
        Expected labels (0 or 1), one per sample; flattened before comparison.
    w : ndarray, shape (1, n_features)
        Weights.
    beta : float
        Multiplier forwarded to ``sigmoid``.
    threshold : float
        Decision threshold applied to the sigmoid output.

    Returns
    -------
    float
        Accuracy in percent (0-100).

    Raises
    ------
    ZeroDivisionError
        If X contains no samples.
    """
    predicted = sigmoid(np.dot(X,w.T),beta).flatten()
    expected = np.asarray(y).flatten()

    true_pos = (predicted >= threshold) & (expected == 1)
    true_neg = (predicted < threshold) & (expected == 0)
    correct = int(np.count_nonzero(true_pos)) + int(np.count_nonzero(true_neg))

    return (correct/len(predicted))*100

In [16]:
# Report training accuracy for thresholds 0.1 ... 0.9 (beta=0.2, the value passed when fitting w).
# The threshold is formatted with "{:.1f}"; the previous "{:1f}" only set a minimum
# width of 1 and therefore printed six decimals (0.100000).
for i in range(1,10):
    threshold = 0.1*i
    train_accuracy = accuracy(feature,train_expected,w,0.2,threshold)
    print("For threshold of {:.1f}, training accuracy is {}".format(threshold,train_accuracy))

For threshold of 0.100000, training accuracy is 60.0
For threshold of 0.200000, training accuracy is 60.0
For threshold of 0.300000, training accuracy is 66.66666666666666
For threshold of 0.400000, training accuracy is 66.66666666666666
For threshold of 0.500000, training accuracy is 73.33333333333333
For threshold of 0.600000, training accuracy is 73.33333333333333
For threshold of 0.700000, training accuracy is 76.66666666666667
For threshold of 0.800000, training accuracy is 40.0
For threshold of 0.900000, training accuracy is 40.0


In [None]:
# One x position per recorded cost value
x = np.arange(len(cost))

# Draw the cost curve, then label the axes it was drawn on
plt.plot(x,cost)
plt.gca().set(
    title='Logistic Regression Cost Function',
    xlabel='Number of iterations',
    ylabel='Cost Function',
)
# plt.savefig('lr_costfunction_10000.png',bbox_inches='tight',dpi=300)
plt.show()

### Evaluate the logistic regression model on the held-out test set

In [18]:
# Build the test feature array the same way as for training:
# a leading column of ones followed by the R, G, B columns.
test = np.ones((len(test_set),num_features))
test[:,1:] = test_set[['R','G','B']].to_numpy(dtype=float)

# Report test accuracy for thresholds 0.1 ... 0.9 (beta=0.2, the value passed when fitting w).
# The threshold is formatted with "{:.1f}"; the previous "{:1f}" only set a minimum
# width of 1 and therefore printed six decimals (0.100000).
for i in range(1,10):
    threshold = 0.1*i
    test_accuracy = accuracy(test,test_expected,w,0.2,threshold)
    print("For threshold of {:.1f}, test accuracy is {}".format(threshold,test_accuracy))

For threshold of 0.100000, test accuracy is 25.0
For threshold of 0.200000, test accuracy is 33.33333333333333
For threshold of 0.300000, test accuracy is 33.33333333333333
For threshold of 0.400000, test accuracy is 33.33333333333333
For threshold of 0.500000, test accuracy is 33.33333333333333
For threshold of 0.600000, test accuracy is 66.66666666666666
For threshold of 0.700000, test accuracy is 91.66666666666666
For threshold of 0.800000, test accuracy is 75.0
For threshold of 0.900000, test accuracy is 75.0
