In [7]:
import cv2 as cv
import os
import pandas as pd
import numpy as np
import json
import pickle
import time

In [8]:
import winsound
duration = 1250  # milliseconds
freq = 600  # Hz
winsound.Beep(freq, duration)

In [9]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [10]:
start_program = time.time()

In [11]:
#Load the model
#pickle.loads(kmeans_batch, open("KMeans_All_Trial_1.sav", 'wb'))
n_cluster = 1500
file_name = "_".join(("Trial_1_KMeans","c"+str(n_cluster),"b32_rs0.sav"))
with open(file_name, 'rb') as f_name:
    kmeans_batch = pickle.load(f_name)

In [12]:
print(kmeans_batch)

MiniBatchKMeans(batch_size=32, compute_labels=True, init='k-means++',
                init_size=None, max_iter=1000, max_no_improvement=10,
                n_clusters=1500, n_init=3, random_state=0,
                reassignment_ratio=0.01, tol=0.0, verbose=0)


In [13]:
# For each image, a histogram needs to be created!
# For each descriptor, a cluster number or centroid label is given.
# Then for the entire image, a histogram is generated. 
# What if we separate the images into grids?

In [14]:
#Let's import all the feature descriptors!
path_dir = os.getcwd()
train_car_dir = "\\".join((path_dir, "Car_Split", "train"))
car_splits = os.listdir(train_car_dir)
train_noise_dir = "\\".join((path_dir, "Noise_Split", "train"))
noise_splits = os.listdir(train_noise_dir)

In [15]:
print(car_splits)

['car_train_desc_split_1_v1.pkl', 'car_train_desc_split_2_v1.pkl', 'car_train_desc_split_3_v1.pkl', 'car_train_desc_split_4_v1.pkl']


In [16]:
#Exacting all the different splits of the car dataset - for train!
split_path = "\\".join((train_car_dir, car_splits[0]))
with open(split_path, 'rb') as file_name:
    car_pickle_0 = pickle.load(file_name)

split_path = "\\".join((train_car_dir, car_splits[1]))
with open(split_path, 'rb') as file_name:
    car_pickle_1 = pickle.load(file_name)

split_path = "\\".join((train_car_dir, car_splits[2]))
with open(split_path, 'rb') as file_name:
    car_pickle_2 = pickle.load(file_name)

split_path = "\\".join((train_car_dir, car_splits[3]))
with open(split_path, 'rb') as file_name:
    car_pickle_3 = pickle.load(file_name)

In [17]:
#Exacting all the different splits of the noise dataset - for train!
split_path = "\\".join((train_noise_dir, noise_splits[0]))
with open(split_path, 'rb') as file_name:
    noise_pickle_0 = pickle.load(file_name)

split_path = "\\".join((train_noise_dir, noise_splits[1]))
with open(split_path, 'rb') as file_name:
    noise_pickle_1 = pickle.load(file_name)

split_path = "\\".join((train_noise_dir, noise_splits[2]))
with open(split_path, 'rb') as file_name:
    noise_pickle_2 = pickle.load(file_name)

split_path = "\\".join((train_noise_dir, noise_splits[3]))
with open(split_path, 'rb') as file_name:
    noise_pickle_3 = pickle.load(file_name)

In [18]:
def cluster_labels(pickle_file, model):
    start_time = time.time()
    #pickle_file = car_descriptors
    
    img_cluster = []
    for img_desc in pickle_file:
        cluster_desc = []
        if len(img_desc)> 0:
            #for desc in img_desc:
            cluster_desc = model.predict(img_desc)
            img_cluster.append(cluster_desc)
    print(time.time() - start_time)
    return(img_cluster)

In [19]:
kmeans_batch

MiniBatchKMeans(batch_size=32, compute_labels=True, init='k-means++',
                init_size=None, max_iter=1000, max_no_improvement=10,
                n_clusters=1500, n_init=3, random_state=0,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [20]:
car_desc_0 = cluster_labels(pickle_file = car_pickle_0, model = kmeans_batch)
noise_desc_0 = cluster_labels(noise_pickle_0, model = kmeans_batch)
print("\n")
car_desc_1 = cluster_labels(pickle_file = car_pickle_1, model = kmeans_batch)
noise_desc_1 = cluster_labels(pickle_file = noise_pickle_1, model = kmeans_batch)
print("\n")
car_desc_2 = cluster_labels(pickle_file = car_pickle_2, model = kmeans_batch)
noise_desc_2 = cluster_labels(pickle_file = noise_pickle_2, model = kmeans_batch)
print("\n")
car_desc_3 = cluster_labels(pickle_file = car_pickle_3, model = kmeans_batch)
noise_desc_3 = cluster_labels(pickle_file = noise_pickle_3, model = kmeans_batch)

41.747334718704224
44.2984185218811


34.4528865814209
54.902817249298096


27.3659827709198
33.29621458053589


50.74464297294617
52.744256258010864


In [21]:
car_clusters = np.concatenate([car_desc_0, car_desc_1, car_desc_2, car_desc_3])
noise_clusters = np.concatenate([noise_desc_0, noise_desc_1, noise_desc_2, noise_desc_3])

In [22]:
display(np.shape(car_clusters))
display(np.shape(noise_clusters))

(6548,)

(6774,)

In [23]:
def cluster_histogram(img_clusters, n_clusters = 500):
    start_time = time.time()
    hist_arr = []
    for img in img_clusters:
        hist = np.zeros(n_clusters)
        for cluster in img:
            hist[cluster] += 1
        hist_arr.append(hist)
    print(time.time() - start_time)
    return(hist_arr)

In [24]:
#Shouldn't take more than 3 seconds for each function!
car_hist = cluster_histogram(img_clusters = car_clusters, n_clusters = n_cluster)
noise_hist = cluster_histogram(img_clusters = noise_clusters, n_clusters = n_cluster)

1.9990572929382324
2.5523197650909424


In [25]:
#data_df = pd.DataFrame(np.concatenate([car_hist, noise_hist]))

In [26]:
# car_df = pd.DataFrame(car_hist)
# sscaler = StandardScaler()
# car_df_ss = pd.DataFrame(sscaler.fit_transform(car_df))

In [27]:
mm_scalar = MinMaxScaler()
car_df = pd.DataFrame(car_hist)
car_df_mm = pd.DataFrame(mm_scalar.fit_transform(car_df))

In [28]:
noise_df = pd.DataFrame(noise_hist)
noise_df_mm = pd.DataFrame(mm_scalar.transform(noise_df))
# sscaler = StandardScaler()
# noise_df_ss = pd.DataFrame(sscaler.fit_transform(noise_df))

In [29]:
car_df_mm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.166667
2,0.0,0.25,0.0,0.0,0.2,0.0,0.0,0.25,0.2,0.142857,...,0.0,0.0,0.0,0.0,0.2,0.333333,0.2,0.0,0.0,0.0
3,0.0,0.25,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.714286,0.0,0.0
4,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.6,0.285714,...,0.0,0.0,0.0,0.2,0.0,0.166667,0.2,0.142857,0.0,0.0


In [30]:
# noise_df = pd.DataFrame(noise_clusters)
# #noise_df = noise_df.fillna(value = -1) 
# # The problem is that, we cannot assign zero since zero cluster exists. 
# # We also cannot assign 502 or 501 since we give more priority in that case!
# # Only negative value seemed likely 
# preprocess_sscaler = StandardScaler()
# noise_df = preprocess_sscaler.fit_transform(noise_df)
# noise_df = pd.DataFrame(noise_clusters)
# #noise_df = noise_df.fillna(value = -1) 
# # The problem is that, we cannot assign zero since zero cluster exists. 
# # We also cannot assign 502 or 501 since we give more priority in that case!
# # Only negative value seemed likely 
# preprocess_sscaler = MinMaxScaler()
# noise_df = pd.DataFrame(preprocess_sscaler.fit_transform(noise_df))
# car_df = pd.DataFrame(car_pp_df)
# noise_df = pd.DataFrame(noise_df)
# sample_car_mm = car_df_mm.drop(columns = [500, 501])
# sample_noise_mm = noise_df_mm.drop(columns = [500, 501])

# sample_car_ss = car_df_ss.drop(columns = [500, 501])
# sample_noise_ss = noise_df_ss.drop(columns = [500, 501])

In [31]:
sample_car_mm = car_df_mm
sample_noise_mm = noise_df_mm

# sample_car_ss = car_df_ss
# sample_noise_ss = noise_df_ss

In [32]:
# It happens that sometimes there are no freq found for that particular cluster
print(np.any(sample_car_mm.isna())) 
print(np.any(sample_noise_mm.isna()))

False
False


In [33]:
# sample_car = sample_car.fillna(value = -1)
# sample_noise = sample_noise.fillna(value = -1)

In [34]:
car_name_mm = "_".join(("Car","Hist", "MinMax","C"+str(n_cluster)+".csv"))
noise_name_mm = "_".join(("Noise","Hist", "MinMax","C"+str(n_cluster)+".csv"))


In [35]:
print(car_name_mm)

Car_Hist_MinMax_C1500.csv


In [36]:
sample_car_mm.to_csv(car_name_mm, sep = ',', index = False)
#sample_car_ss.to_csv(car_name_ss, sep = ',', index = False)

In [37]:
sample_noise_mm.to_csv(noise_name_mm, sep = ',', index= False)
#sample_noise_ss.to_csv(noise_name_ss, sep = ',', index= False)

In [38]:
#Dumping the trained model!
mm_name = "_".join(("MinMax","C"+str(n_cluster)+".sav"))
print(mm_name)
pickle.dump(mm_scalar, open(mm_name, 'wb'))

MinMax_C1500.sav


In [39]:
print(time.time() - start_program)
winsound.Beep(freq, duration)

411.3257837295532


In [79]:
#Run above for different clusters!
Error
#Experiments! Do not run below!

NameError: name 'Error' is not defined

In [73]:
#car_df.iloc[:, -1].value_counts()

399.0    1
Name: 501, dtype: int64

In [85]:
# sample_df = car_df
# sample_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,501
count,6548.0,6548.0,6548.0,6547.0,6547.0,6547.0,6546.0,6545.0,6545.0,6544.0,...,4837.0,4813.0,4785.0,4754.0,4736.0,4711.0,4689.0,4663.0,1.0,1.0
mean,0.496664,0.495801,0.496177,0.49376,0.499219,0.501779,0.489562,0.497996,0.497606,0.493351,...,0.493042,0.49147,0.487756,0.4943,0.487503,0.490444,0.496392,0.488711,0.288,0.798
std,0.285253,0.289804,0.286167,0.287957,0.288914,0.285597,0.285544,0.286654,0.289569,0.288133,...,0.288491,0.2859,0.288671,0.288194,0.286987,0.288758,0.285028,0.285562,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288,0.798
25%,0.252,0.246,0.25,0.246,0.248,0.256,0.244,0.254,0.244,0.248,...,0.244,0.246,0.238,0.248,0.244,0.242,0.252,0.248,0.288,0.798
50%,0.5,0.492,0.49,0.49,0.5,0.5,0.485,0.496,0.5,0.488,...,0.49,0.486,0.48,0.49,0.48,0.486,0.492,0.484,0.288,0.798
75%,0.74,0.748,0.742,0.744,0.752,0.748,0.736,0.746,0.748,0.742,...,0.74,0.736,0.728,0.74,0.7285,0.738,0.734,0.725,0.288,0.798
max,0.998,0.998,0.998,0.998,0.998,0.998,0.998,0.998,0.998,0.998,...,0.998,0.998,0.998,0.998,0.998,0.998,0.998,0.998,0.288,0.798


In [78]:
# np.any(sample_df.isin([0]))

True

In [43]:
# np.shape(pickle_file)

(6554,)

In [None]:
# sample_df[df[‘Name’]==’Donna’].index.values

In [41]:
# img_cluster[0]

array([ 90,  63, 126, 393,  51, 341, 191, 308, 336, 362, 319, 296, 392,
       495, 255, 409,  63, 383, 270,  23, 368,  37, 160, 496, 405,  52,
       304,  21, 182, 379,   5, 225,  12, 113,  95, 173, 212, 134, 389,
       382,   7, 400, 370, 236, 254, 238,  12, 146, 219, 481, 441,  76,
        62, 342, 257, 473, 185, 221,  73,  47, 254, 369, 163,  54, 432,
       177, 471, 132, 418,  13, 370, 277, 419,  95,  65, 300, 348, 259,
       190, 497, 134,  73, 233, 263,  33, 453, 170, 208,  65, 435,  19,
       253, 202,   4, 111, 263, 419, 107, 250, 364, 138, 263, 263, 236,
        85, 228, 252, 369, 172,  61,  93, 123,  20, 484, 481, 283, 484,
        20, 168, 459,  40, 238, 201, 292,  85, 422, 399,  53, 284, 241,
       319, 139, 135, 401, 367, 231,  12, 254, 497, 246, 280, 405, 382,
       226, 191, 130, 169,  43,  75, 173, 232, 231,  47, 160, 454, 495,
       111,  65,  11, 393, 478, 455,  76,  50, 490,  74, 285, 133,  78,
        22, 419, 299, 324, 307, 320, 264,  67, 105,  59, 332,  9

In [45]:
# np.shape(hist_arr)

(6548, 500)

In [14]:
# np.array(pickle_file[0][1])

array([ 55,  45, 107,  86, 150, 242,  94, 115, 113, 194, 219, 195, 113,
       126,  53, 112, 126, 239,  21, 175, 252, 155,  52, 238,  59,  46,
       143, 130,  37, 247, 246, 246])

In [29]:
#  desc = np.array(pickle_file[0])
# #  print(np.shape(desc.T))
#  print(kmeans_batch.predict(desc))

(32, 500)
[ 90  63 126 393  51 341 191 308 336 362 319 296 392 495 255 409  63 383
 270  23 368  37 160 496 405  52 304  21 182 379   5 225  12 113  95 173
 212 134 389 382   7 400 370 236 254 238  12 146 219 481 441  76  62 342
 257 473 185 221  73  47 254 369 163  54 432 177 471 132 418  13 370 277
 419  95  65 300 348 259 190 497 134  73 233 263  33 453 170 208  65 435
  19 253 202   4 111 263 419 107 250 364 138 263 263 236  85 228 252 369
 172  61  93 123  20 484 481 283 484  20 168 459  40 238 201 292  85 422
 399  53 284 241 319 139 135 401 367 231  12 254 497 246 280 405 382 226
 191 130 169  43  75 173 232 231  47 160 454 495 111  65  11 393 478 455
  76  50 490  74 285 133  78  22 419 299 324 307 320 264  67 105  59 332
  90 151 325 379  56 164 252   2 220 128  54 233 328 197  94 426 234  11
 224 283 252  95 255  75 282 141  43 116 248 163  95 121 496 382 254 486
 297 135 342 110   0 292  20  20  19 123  63 220 203 387 249 123 354 422
 446 292 469 383 158  51 167 473 324 258 