In [1]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import cv2

In [2]:
names = [name[:-4] for name in sorted(os.listdir('4/core_CSVs/'))]

assert(
    names == [name[:-4] for name in sorted(os.listdir('4/core_encodings/'))]
)

short_names = [name[8:14] + name[-2:] for name in names]

names

['2019-11-07_to_2019-11-17',
 '2019-11-08_to_2019-11-18',
 '2019-11-09_to_2019-11-19',
 '2019-11-10_to_2019-11-20',
 '2019-11-11_to_2019-11-21',
 '2019-11-12_to_2019-11-22',
 '2019-11-13_to_2019-11-23',
 '2019-11-14_to_2019-11-24',
 '2019-11-15_to_2019-11-25',
 '2019-11-16_to_2019-11-26',
 '2019-11-17_to_2019-11-27']

In [3]:
csvs = {short_name: pd.read_csv('4/core_CSVs/' + name + '.csv', index_col=0) for name, short_name in zip(names, short_names)}

csvs[short_names[0]].info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53270 entries, 0 to 53269
Data columns (total 13 columns):
date             53270 non-null object
frame            53270 non-null int64
face_in_frame    53270 non-null int64
h_face           53270 non-null int64
RGB              53270 non-null object
score            53270 non-null float64
face_type        53270 non-null int64
gender           53270 non-null object
gender_conf      53270 non-null float64
Concesionario    53270 non-null int64
mean_RGB         53270 non-null float64
jump_date        53270 non-null bool
match            53270 non-null int64
dtypes: bool(1), float64(3), int64(6), object(3)
memory usage: 5.3+ MB


# Compare two clusterings for 21
(21 'had' ZERO visits)

In [4]:
clustering_a = csvs['11_to_21']
clustering_b = csvs['12_to_22']

print(clustering_a.shape, clustering_b.shape)

(50665, 12) (52334, 12)


In [5]:
clustering_a_21 = clustering_a[clustering_a.date == '2019-11-21']
clustering_b_21 = clustering_b[clustering_b.date == '2019-11-21']

print(len(clustering_a_21), clustering_a_21.match.unique())
print(len(clustering_b_21), clustering_b_21.match.unique())

1994 [ 0 83]
1935 [0]


In [6]:
img_numbers = sorted([int(name[:-4]) for name in os.listdir('4/frames_cnn_hog/2019-11-21/')])
len(img_numbers)

7839

In [7]:
core_imgs_a = []
non_core_imgs_a = []

core_imgs_b = []
non_core_imgs_b = []

for number in img_numbers:
    if number in clustering_a_21.frame.values:
        core_imgs_a.append(number)
    else:
        non_core_imgs_a.append(number)
    
    if number in clustering_b_21.frame.values:
        core_imgs_b.append(number)
    else:
        non_core_imgs_b.append(number)
        
print(len(core_imgs_a), len(non_core_imgs_a))
print(len(core_imgs_b), len(non_core_imgs_b))

1983 5856
1926 5913


In [8]:
def see_frames(imgs_dir, img_numbers, ms_waitKey=80, text=None):
    for number in img_numbers:
        img = cv2.imread(os.path.join(imgs_dir, str(number) + '.jpg'))
        
        if text:
            cv2.putText(img, text, (20, 50), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 200), 2)
        
        cv2.imshow(imgs_dir, img)
        key = cv2.waitKey(ms_waitKey) & 0xFF
        if key == ord('q'):
            break
    
    cv2.destroyWindow(imgs_dir)

In [9]:
see_frames('4/frames_cnn_hog/2019-11-21/', core_imgs_a)

## Is any clustering of 21 decent?
(panic)

In [10]:
for sn in csvs:
    clustering = csvs[sn]
    if '2019-11-21' in clustering.date.unique():
        print(sn)
        clustering_21 = clustering[clustering.date == '2019-11-21']
        print(len(clustering_21), clustering_21.match.unique())
        print()

11_to_21
1994 [ 0 83]

12_to_22
1935 [0]

13_to_23
1849 [0]

14_to_24
1835 [0]

15_to_25
1828 [0]

16_to_26
1800 [0]

17_to_27
1759 [0]



## Let's just cluster 21

(panic panic)

In [10]:
CSVs_dir = '4/CSVs_cnn_hog/'
date = '2019-11-21'
door = 4

encodings_dir = '4/encodings_cnn_hog/'
n_jobs = 6
eps = 0.38
min_samples = 5
max_h_face = 182
min_rgb_mean = 20
min_hog_score = 0
face_types = [0, 1, 2]
h_face_inside = 143
worker_visits_threshold = 2

df_day = pd.read_csv(os.path.join(CSVs_dir, (date + '_1.csv')),
                index_col=0)
df_day['Concesionario'] = door
df_day['mean_RGB'] = (df_day['RGB'].str[1: -1].str.split(expand=True)
                      .astype(np.float64).mean(axis=1))

npy_day = np.load(os.path.join(encodings_dir, (date + '_1.npy')))

filter_faces = ((df_day.h_face <= max_h_face) &
                (df_day.mean_RGB >= min_rgb_mean) &
                (df_day.score >= min_hog_score) &
                (df_day.face_type.isin(face_types)))
df_day = df_day[filter_faces].reset_index(drop=True)
npy_day = npy_day[filter_faces]

core_df = df_day
core_npy = npy_day

clt = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=n_jobs)

clt.fit(core_npy)

core_indices = clt.core_sample_indices_
core_df = core_df.loc[core_indices, :].reset_index(drop=True)
core_df['match'] = clt.labels_[core_indices]

# visitors = set([core_df.loc[i, 'match'] for i in core_df.index if
#                 core_df.loc[i, 'h_face'] > h_face_inside])

# visitors_boolean = core_df['match'].isin(visitors)
# core_df = core_df[visitors_boolean].reset_index(drop=True)

core_npy = clt.components_#[visitors_boolean]


In [11]:
len(core_df.match.unique())

159

In [12]:
visitors = set([core_df.loc[i, 'match'] for i in core_df.index if
                core_df.loc[i, 'h_face'] > h_face_inside])

visitors_boolean = core_df['match'].isin(visitors)
visitors_df = core_df[visitors_boolean].reset_index(drop=True)

visitors_df.match.unique()

array([ 2,  4, 15, 17, 30, 31, 79, 83, 85, 86])

In [13]:
non_visitors_df = core_df[~visitors_boolean].reset_index(drop=True)

non_visitors_df.match.unique()

array([  0,   1,   3,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        16,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
        45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,
        58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  80,  81,  82,  84,  87,
        88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
       127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
       140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
       153, 154, 155, 156, 157, 158])

In [15]:
see_frames('4/frames_cnn_hog/2019-11-21/', non_visitors_df.frame.values)

### See visitors

In [17]:
def see_person(imgs_dir, df, person, ms_waitKey=80):
    person_df = df[df.match == person]
    frames = person_df.frame.values
    
    see_frames(imgs_dir, frames, ms_waitKey=ms_waitKey, text=str(person))

In [26]:
visitors_list = visitors_df.match.unique()
see_person('4/frames_cnn_hog/2019-11-21/', visitors_df, visitors_list[0])

In [24]:
for i in range(10):
    try:
        see_person('4/frames_cnn_hog/2019-11-21/', 
                   visitors_df, visitors_list[i])
    except KeyboardInterrupt:
        break
    time.sleep(1)

cv2.destroyAllWindows()

KeyboardInterrupt: 