### Importing all the required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Open Source Computer Vision Library
import cv2 

# keeps track of the objects it has already serialized, so that later references to the same object won’t be serialized again
import pickle

# provides a portable way of using operating system dependent functionality. 
import os
from os import listdir 
from os.path import isfile, join


from sklearn.model_selection import train_test_split # Split arrays or matrices into random train and test subsets
from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation

from sklearn.preprocessing import StandardScaler # Standardize features by removing the mean and scaling to unit variance
from sklearn.linear_model import SGDClassifier # Linear classifiers (SVM, logistic regression, a.o.) with SGD training.

from sklearn.metrics import accuracy_score # Accuracy classification score.
from sklearn.metrics import confusion_matrix #Compute confusion matrix to evaluate the accuracy of a classification

### Reading Images from a folder

In [2]:
mypath='C:/Users/FENNY/Desktop/SOS/Thyme-Feature_Extraction/Analystics_Achievers/Dataset/c_16'# change the path to read corresponding folders.
onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
images = np.empty(len(onlyfiles), dtype=object)
resized_image = np.empty(len(onlyfiles), dtype=object)
for n in range(0, len(onlyfiles)):
    
    images[n] = cv2.imread( join(mypath,onlyfiles[n]) ) # Reading all the images from folder and storing in numpy array.
    resized_image[n] = cv2.resize(images[n], (100,100)) # Resizing all the images into 100X100 pixels.
    

In [3]:
print(type(resized_image)) # to check the type.
print(resized_image.shape) # check the no of images present in folder.

<class 'numpy.ndarray'>
(962,)


### Segmentation by K-Mean Clustering.
K-Means is a least-squares partitioning method that divide a collection of objects into K groups. 

### cv2.kmeans() function in OpenCV for data clustering
#### Input Parameters

1. **samples** : It should be of np.float32 data type, and each feature should be put in a single column.<br>
<br>
2. **nclusters(K)** : Number of clusters required at end<br>
<br>
3. **criteria** : It is the iteration termination criteria. When this criteria is satisfied, algorithm iteration stops. Actually, it should be a tuple of 3 parameters. They are ( type, max_iter, epsilon ):<br>

3.1. type of termination criteria : It has 3 flags as below:<br>
**cv2.TERM_CRITERIA_EPS** - stop the algorithm iteration if specified accuracy, epsilon, is reached. <br>**cv2.TERM_CRITERIA_MAX_ITER** - stop the algorithm after the specified number of iterations, max_iter. <br>
**cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER** - stop the iteration when any of the above condition is met.<br>


3.2. **max_iter** - An integer specifying maximum number of iterations.<br>

3.3. **epsilon** - Required accuracy

In [4]:
# Initialize all the arrays as empty.
gray = np.empty(len(images), dtype=object)
Y = np.empty(len(images), dtype=object)
label = np.empty(len(images), dtype=object)
res = np.empty(len(images), dtype=object)
res2 = np.empty(len(images), dtype=object)


There are 3 features, say, R,G,B. So we need to reshape the image to an array of Mx3 size (M is number of pixels in image). And after the clustering, we apply centroid values (it is also R,G,B) to all pixels, such that resulting image will have specified number of colors. And again we need to reshape it back to the shape of original image.

In [5]:
for i in range(0,n):
    Y[i]=resized_image[i].reshape((-1,3))
    Y[i]= np.float32(Y[i])
    
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
K=4 # No of clusters=4

for i in range(0,n):    
    ret,label[i],center=cv2.kmeans(Y[i],K,None,criteria,10,cv2.KMEANS_RANDOM_CENTERS)# Using cv2.Kmeans() Function
    
center = np.uint8(center) #  array of centers of clusters.

for i in range(0,n):    
    res[i] = center[label[i].flatten()]
    res2[i] = res[i].reshape((resized_image[i].shape))
    gray[i] = cv2.cvtColor(res2[i], cv2.COLOR_BGR2GRAY)


In [6]:
# Flatten the segmented array.
for i in range(0,n):
        res2[i]=res2[i].flatten()

In [7]:
# Variants of numpy.stack function to stack so as to make a single array vertically.
abc = np.empty(30000) # 100*100*3
for i in range(961):# Size of folder-1
    abc = np.vstack((abc, res2[i]))
abc  = abc[1:,:]
abc.shape

(961, 30000)

In [8]:
# Convert ndarray into Dataframe.
df16=pd.DataFrame(data=abc)
df16['label']=16 # Label the images whichever class it may belong to.

Doing the above procedure for 9 folders and appending it into single Dataframe.

In [9]:
with open('dfc16-24', 'rb') as pickle_file:
    df = pickle.load(pickle_file)# list_of_sent of summary

In [10]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29991,29992,29993,29994,29995,29996,29997,29998,29999,label
0,23.0,88.0,84.0,23.0,88.0,84.0,23.0,88.0,84.0,23.0,...,137.0,135.0,144.0,137.0,135.0,144.0,137.0,135.0,144.0,16
1,45.0,142.0,137.0,45.0,142.0,137.0,45.0,142.0,137.0,45.0,...,23.0,88.0,84.0,23.0,88.0,84.0,23.0,88.0,84.0,16
2,108.0,105.0,116.0,108.0,105.0,116.0,108.0,105.0,116.0,108.0,...,23.0,88.0,84.0,23.0,88.0,84.0,45.0,142.0,137.0,16
3,45.0,142.0,137.0,45.0,142.0,137.0,45.0,142.0,137.0,45.0,...,23.0,88.0,84.0,23.0,88.0,84.0,45.0,142.0,137.0,16
4,45.0,142.0,137.0,45.0,142.0,137.0,45.0,142.0,137.0,45.0,...,108.0,105.0,116.0,108.0,105.0,116.0,108.0,105.0,116.0,16


#### Slicing the dataframe and taking 5000 points for creating model

In [45]:
X=df.iloc[:5000,:30000].values
y=df.iloc[:5000,30000].values

In [46]:
X.shape

(5000, 30000)

In [47]:
y.shape

(5000,)

In [48]:
s=StandardScaler()
X=s.fit_transform(X)# Perform column Standardization 

In [51]:
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2) # Split the dataset into train and test as 80-20 split

#### SGD(stochastic gradient descent) Classifier

**Parameters**<br>

**alpha** : float,
Constant that multiplies the regularization term. Defaults to 0.0001 Also used to compute learning_rate when set to ‘optimal’.
<br><br>
**loss** : str, default: ‘hinge’

The loss function to be used. Defaults to ‘hinge’, which gives a linear SVM.

The possible options are ‘hinge’, ‘log’, ‘modified_huber’, ‘squared_hinge’, ‘perceptron’, or a regression loss: ‘squared_loss’, ‘huber’, ‘epsilon_insensitive’, or ‘squared_epsilon_insensitive’.

### Loss=Hinge (SVM Classifier)

In [60]:
alpha=[0.000001, 0.00001, 0.0001,0.001,0.01,0.1,1]
cv_scores = []
for k in alpha:
    clf=SGDClassifier(loss='hinge',alpha=k)
    clf.fit(X_train,Y_train)
    scores = cross_val_score(clf, X_train, Y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    print(scores)
MSE = [1 - x for x in cv_scores]



[0.50742574 0.45771144 0.48877805 0.44389027 0.46882793 0.40100251
 0.46867168 0.43718593 0.46733668 0.43073048]




[0.52970297 0.51741294 0.5286783  0.53117207 0.5361596  0.4887218
 0.48370927 0.50251256 0.54522613 0.49118388]




[0.60891089 0.59452736 0.65835411 0.56857855 0.56359102 0.59649123
 0.6641604  0.62311558 0.61557789 0.58186398]




[0.67079208 0.71393035 0.6882793  0.66832918 0.73815461 0.67919799
 0.69674185 0.68341709 0.65829146 0.64231738]




[0.72772277 0.67412935 0.72817955 0.70074813 0.74563591 0.70927318
 0.71428571 0.71356784 0.70854271 0.69017632]




[0.76732673 0.73383085 0.76309227 0.75810474 0.77306733 0.73182957
 0.75689223 0.76884422 0.70100503 0.73299748]




[0.76732673 0.64676617 0.71571072 0.71321696 0.74314214 0.71177945
 0.70927318 0.70603015 0.71105528 0.70780856]


NameError: name 'neigh' is not defined

In [61]:
# Get the optimal value of alpha.
optimal_k = alpha[MSE.index(min(MSE))]
print('\nThe optimal number of neighbors is :',optimal_k)


The optimal number of neighbors is : 0.1


In [63]:
# Fit the model with optimal value of alpha
clf=SGDClassifier(loss='hinge',alpha=optimal_k)
clf.fit(X_train,Y_train)



SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [64]:
y_predict = clf.predict(X_test) # apply the model to Test dataset

In [66]:
acc=accuracy_score(Y_test,y_predict,normalize=True)*float(100)
acc

73.4

In [68]:
confusion_matrix(Y_test, y_predict).T # confusion matrix of 9 by 9

array([[202,   0,  46,   0,   0,   0,   0,   0,   0],
       [  0,   2,   0,   0,   0,   1,   0,   0,   0],
       [ 10,   0,  41,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  87,   4,   0,   6,   5,   3],
       [  0,   2,   0,   2,   8,   1,   0,   0,  16],
       [  0,   1,   0,   0,   5,  24,   0,   0,   7],
       [  0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0,  41,   0],
       [  4,  11,   0,  20,  65,  53,   0,   2, 329]], dtype=int64)

### Loss=Log (Logistic Regression Classifier)

In [69]:
alpha=[0.000001, 0.00001, 0.0001,0.001,0.01,0.1,1]
cv_scores = []
for k in alpha:
    clf=SGDClassifier(loss='log',alpha=k)
    clf.fit(X_train,Y_train)
    scores = cross_val_score(clf, X_train, Y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
    print(scores)
MSE = [1 - x for x in cv_scores]
    



[0.48267327 0.47263682 0.49376559 0.40399002 0.47381546 0.44862155
 0.47368421 0.47236181 0.36432161 0.44080605]




[0.4950495  0.460199   0.5436409  0.48379052 0.5361596  0.5037594
 0.52882206 0.47487437 0.45477387 0.51889169]




[0.61138614 0.63432836 0.64837905 0.57605985 0.63591022 0.5764411
 0.60401003 0.61809045 0.54773869 0.60201511]




[0.67574257 0.67910448 0.68329177 0.67331671 0.680798   0.66666667
 0.6641604  0.66582915 0.70603015 0.6675063 ]




[0.76732673 0.70895522 0.73815461 0.70573566 0.7680798  0.68671679
 0.70426065 0.71859296 0.71105528 0.67254408]




[0.77970297 0.75373134 0.73566085 0.71820449 0.7755611  0.71177945
 0.72431078 0.7361809  0.74623116 0.74307305]




[0.75       0.70646766 0.72817955 0.72069825 0.74064838 0.70175439
 0.6716792  0.74120603 0.73115578 0.71788413]


In [70]:
optimal_k = alpha[MSE.index(min(MSE))]
print('\nThe optimal value of alpha is :',optimal_k)    


The optimal value of alpha is : 0.1


In [71]:
clf=SGDClassifier(loss='log',alpha=optimal_k)
clf.fit(X_train,Y_train)



SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [72]:
y_predict = clf.predict(X_test)
acc=accuracy_score(Y_test,y_predict,normalize=True)*float(100)
acc

75.7

In [74]:
confusion_matrix(Y_test, y_predict).T # confusion matrix of 9 by 9

array([[210,   1,  58,   0,   0,   0,   0,   0,   0],
       [  0,   2,   0,   0,   0,   0,   0,   0,   0],
       [  6,   0,  29,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  90,   4,   0,   5,   4,   2],
       [  0,   0,   0,   2,  11,   0,   0,   0,   9],
       [  0,   2,   0,   0,   1,  32,   0,   0,   4],
       [  0,   0,   0,   3,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   1,   0,   0,  43,   0],
       [  0,  11,   0,  16,  65,  47,   1,   1, 340]], dtype=int64)

#### Observations
Clearly out of 9 classes majority of class 1 class 4, class 8 and class 9 are classisifed correctly whereas there are wrong classisifcation in class 2, class 3,class 5 class 6.