# Assignment 3.

In [1]:
import numpy as np 
import os
import sys
import cv2
import matplotlib.pyplot as plt
import skimage
import random
from sklearn import model_selection

Function that returns the index of the closest cluster centroid for each sample in the data

In [2]:
def findClosetCentroids(data,centroids):
    """Return the index of the closest centroid for each data point.

    Closeness is the squared Euclidean distance. When two centroids are
    equally close, the one with the lowest index wins (``np.argmin``).

    Parameters
    ----------
    data : ndarray, shape (nSample, nFeature)
        One sample per row.
    centroids : ndarray, shape (K, nFeature)
        One centroid per row.

    Returns
    -------
    index : float ndarray, shape (nSample, 1)
        ``index[i, 0]`` is the row of ``centroids`` closest to ``data[i]``.
        All zeros when there are no samples or no centroids.
    """
    nSample=data.shape[0]
    #number of clusters
    K=centroids.shape[0]
    index=np.zeros((nSample,1))
    if nSample==0 or K==0:
        # argmin is undefined on an empty axis; keep the all-zero result
        return index
    # One vectorised pass per centroid instead of a Python loop per sample;
    # the (nSample, K) table is small because K is small.
    distances=np.empty((nSample,K))
    for j in range(K):
        distances[:,j]=np.sum((data-centroids[j,:])**2,axis=1)
    index[:,0]=np.argmin(distances,axis=1)
    return index

Function to update each cluster's centroid based on the mean of the members of that cluster

In [3]:
def updateCentroids(data,index,K):
    """Recompute each centroid as the mean of the samples assigned to it.

    Parameters
    ----------
    data : ndarray, shape (nSample, nFeature)
        One sample per row.
    index : array-like, shape (nSample, 1) or (nSample,)
        Cluster label of each sample; values are truncated to integers.
    K : int
        Number of clusters.

    Returns
    -------
    centroids : float ndarray, shape (K, nFeature)
        Mean of the members of each cluster. A cluster with no members
        gets an all-zero centroid.
    count : float ndarray, shape (K, 1)
        Number of members per cluster. Empty clusters are reported as 1,
        because the count doubles as the divisor of the mean.
    """
    # Flatten once: calling int() on a 1-element array per sample is
    # deprecated in recent NumPy and slow in a Python loop.
    labels=np.asarray(index).reshape(-1).astype(int)
    temp=np.zeros((K,data.shape[1]))
    count=np.zeros((K,1))
    # np.add.at accumulates correctly when a label repeats, which plain
    # fancy-index "+=" does not.
    np.add.at(temp,labels,data[:labels.shape[0]])
    np.add.at(count,labels,1)
    # avoid dividing by zero for clusters that received no sample
    count[count==0]=1
    centroids=temp/count
    return centroids,count

Initialize the list of centroids randomly; each one is picked from the data samples

In [None]:
def randomInitCentroids(data,K):
    """Pick K pairwise-distinct rows of ``data`` at random as initial centroids.

    Rows are visited in a random order and a row is skipped when it equals
    an already chosen centroid, so the search always terminates.

    Parameters
    ----------
    data : ndarray, shape (nSample, nFeature)
        Samples to draw the centroids from.
    K : int
        Number of centroids to pick.

    Returns
    -------
    centroids : float ndarray, shape (K, nFeature)

    Raises
    ------
    ValueError
        If ``data`` contains fewer than K distinct rows.
    """
    centroids=np.zeros((K,data.shape[1]))
    nChosen=0
    for row in np.random.permutation(data.shape[0]):
        if nChosen==K:
            break
        # write the candidate into its slot first so the comparison is done
        # in the centroid array's own dtype
        centroids[nChosen]=data[row]
        if (centroids[:nChosen]==centroids[nChosen]).all(axis=1).any():
            #duplicate of an earlier centroid: try the next row
            continue
        nChosen+=1
    if nChosen<K:
        raise ValueError(
            "cannot pick %d distinct centroids: data has only %d distinct rows"
            % (K,nChosen)
        )
    return centroids

Implement the K-means clustering algorithm

In [None]:
def KmeanClustering(data,K,epoch=500):
    """K-mean clustering implementation.

    Starts from K randomly chosen samples and alternates between assigning
    every sample to its closest centroid and recomputing the centroids.

    Parameters
    ----------
    data : ndarray, shape (nSample, nFeature)
        Samples to cluster.
    K : int
        Number of clusters.
    epoch : int, default 500
        Maximum number of assign/update rounds; must be at least 1.

    Returns
    -------
    centroids, count
        The two values returned by the last ``updateCentroids`` call.

    Raises
    ------
    ValueError
        If ``epoch`` is smaller than 1 (no round would run, so there would
        be no result to return).
    """
    if epoch<1:
        raise ValueError("epoch must be at least 1")
    centroids=randomInitCentroids(data,K)
    previous=None
    for i in range(epoch):
        index=findClosetCentroids(data,centroids)
        # Unchanged assignments would reproduce the same centroids and counts
        # in every remaining round, so stopping here gives the same result.
        if previous is not None and np.array_equal(index,previous):
            break
        centroids,count=updateCentroids(data,index,centroids.shape[0])
        previous=index
    return centroids,count

Logistic regression implementation

In [None]:
class LogisticRegression:
    """Binary logistic regression trained by mini-batch gradient steps.

    The model has a single weight matrix ``W`` of shape (nFeature, 1) and no
    separate intercept; include a constant column in ``x`` if one is needed.
    ``W`` is created by ``fit``, so ``predict``, ``learn`` and ``evaluate``
    require either a prior ``fit`` call or a manually assigned ``W``.
    """

    def __init__(self, threshold=0.5):
        """Store the probability threshold above which class 1 is predicted."""
        self.threshold = threshold

    def predict(self, x):
        """Return the predicted probability of class 1 for each row of ``x``.

        Returns a 1-D array with one value per row.
        """
        z = np.sum(x@self.W, axis=1)
        # exp() overflows for |z| beyond ~709; the sigmoid is already
        # saturated far earlier, so clipping does not change the outcome.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    #fit Weight by a batch
    def learn(self, x, y, learning_rate):
        """Update ``W`` with one gradient step computed on the batch (x, y).

        Works for any batch size: predictions and labels are both shaped
        as columns before the difference is taken.
        """
        y_hat = self.predict(x).reshape((-1, 1))
        dif = np.reshape(y, (-1, 1)) - y_hat
        gradient = x.T@dif
        self.W = self.W + learning_rate * gradient

    def evaluate(self, x, y):
        """Return ``{"loss": ..., "accuracy": ...}`` of the model on (x, y).

        ``loss`` is the mean binary cross-entropy and ``accuracy`` the
        fraction of samples whose thresholded prediction equals ``y``.
        """
        y_pred = self.predict(x).reshape(y.shape)
        # Keep probabilities away from exactly 0 and 1, otherwise
        # 0 * log(0) turns the whole loss into NaN.
        eps = np.finfo(float).eps
        y_prob = np.clip(y_pred, eps, 1 - eps)
        return {
            "loss": -np.mean(y * np.log(y_prob) + (1 - y) * np.log(1 - y_prob)),
            "accuracy": np.sum((y_pred > self.threshold).astype(int) == y) / y.shape[0]
        }

    def fit(
        self, x, y, x_valid = None, y_valid = None,
        learning_rate = 0.001,
        learning_rate_decay = 1,
        batch_size = 32,
        epoch = 1,
        verbose = False
    ):
        """Train the model and plot the per-epoch loss and accuracy curves.

        Parameters
        ----------
        x, y : ndarray
            Training samples (one per row) and their 0/1 labels as a column.
        x_valid, y_valid : ndarray, optional
            Data used for the reported metrics; defaults to the training data.
        learning_rate : float
            Step size of the first epoch.
        learning_rate_decay : float
            Factor applied to the learning rate after every epoch.
        batch_size : int
            Number of rows per gradient step; the last batch may be smaller.
        epoch : int
            Number of passes over the training data.
        verbose : bool
            When true, print metrics for every step of the first six epochs
            and for the last step of every later epoch.
        """
        self.W = np.random.rand(x.shape[1],1)
        if x_valid is None:
            x_valid = x
        if y_valid is None:
            y_valid = y
        # number of batches per epoch, counting a trailing partial batch
        step = x.shape[0] // batch_size + (x.shape[0] % batch_size != 0)
        metric_graph = {
            "loss": [],
            "accuracy": []
        }
        for e in range(epoch):
            for i in range(step):
                self.learn(
                    x[batch_size * i : batch_size * (i + 1),],
                    y[batch_size * i : batch_size * (i + 1),],
                    learning_rate
                )
                # evaluate only when the result is actually printed
                if (e <= 5 or (i + 1) == step) and verbose:
                    metrics = self.evaluate(x_valid, y_valid)
                    print("Epoch %d Step %d: Loss %f, Acc %f" % (e + 1, i + 1, metrics["loss"], metrics["accuracy"]))

            metrics = self.evaluate(x_valid, y_valid)
            metric_graph["loss"].append(metrics["loss"])
            metric_graph["accuracy"].append(metrics["accuracy"])
            learning_rate *= learning_rate_decay

        plt.plot(metric_graph["loss"])
        plt.title("Loss")
        plt.show()
        plt.plot(metric_graph["accuracy"])
        plt.title("Accuracy")
        plt.show()

Feature extraction using K-means clustering:
find the most dominant colors of each image, sort them by frequency, and then flatten them into a feature vector

In [None]:
#number of dominant colors extracted per image
K=3
#number of K-mean rounds per image
nIterations=20

country_filename=[os.path.join("countryside",name) for name in os.listdir("countryside")]
metro_filename=[os.path.join("metropolitian",name) for name in os.listdir("metropolitian")]
nCountry=len(country_filename)
nImage=nCountry+len(metro_filename)
#column 0 is left at 1 and serves as the intercept feature of the logistic model
X=np.ones((nImage,3*K+1))
y=np.zeros((nImage,1))

for idx,filename in enumerate(country_filename+metro_filename):
    print(filename)
    img=skimage.io.imread(filename)
    #shrink by 16 in both directions so clustering runs on far fewer pixels
    img=skimage.transform.resize(img,(img.shape[0]//16,img.shape[1]//16,3))
    data=img.reshape(-1,3)
    centroids,count=KmeanClustering(data,K,nIterations)
    #sorted by color frequency, most frequent first; argsort keeps every
    #cluster even when two of them have the same size, whereas a dict keyed
    #by count would silently drop one
    order=np.argsort(-count[:,0],kind="stable")
    X[idx,1:]=centroids[order].reshape(-1)
    #label sample: countryside images come first in the list and get 1
    if idx<nCountry:
        y[idx]=1
#training and validation set splitting: 75% for training and 25% for validation set
x,x_valid,y,y_valid=model_selection.train_test_split(X,y,test_size=0.25)

countryside/000125.jpeg
countryside/000211.jpeg


Train the logistic regression model and report its accuracy on the validation set

In [None]:
#classifier that predicts class 1 when the probability exceeds 0.5
model=LogisticRegression(threshold=0.5)
model.fit(
    x,
    y,
    x_valid=x_valid,
    y_valid=y_valid,
    epoch=500,
    verbose=True,
)