## nested image classifier

### (Re)processing test data

Once implemented, it was clear from reviewing predictions on test data that a few images in the `test_data/` directory had been missed by the initial processing and review, and were labelled incorrectly.   

This workbook follows a similar flow to the original processing_master_1 workbook, to review the images (by class) in `test_data/` and correct if incorrectly labelled.

In [1]:

# import libraries and nested_utilities
import nested_utilities as nutil
import os
import shutil
import pandas as pd

#
# create a timestamped directory for this processing path
out_dir = os.path.join('./data_catalogues/processing_'+nutil.timestamp())
os.mkdir(out_dir)
out_dir

'./data_catalogues/processing_2018-03-26_10-58-17'

#### 2. Catalogue ./data/test_data

First, catalogue the files in `test_data/`, then find (and remove) any duplicates.

In [2]:
# catalogues base_data folders

test_data_cat = nutil.build_catalogue('./data/test_data/')
test_data_cat.to_csv(os.path.join(out_dir, 'test_data_cat.csv'))
print(test_data_cat.shape)
print(test_data_cat.info())

test_data_cat.sample(12).head(12)

(5232, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5232 entries, 0 to 5231
Data columns (total 4 columns):
id            5232 non-null object
room          5232 non-null object
filename      5232 non-null object
image_path    5232 non-null object
dtypes: object(4)
memory usage: 163.6+ KB
None


Unnamed: 0,id,room,filename,image_path
3692,086518db96255916215692bdebea1aba920ed0ca,kitchen,086518db96255916215692bdebea1aba920ed0ca.jpg,./data/test_data/kitchen/086518db9625591621569...
1625,071547a9adda4a8e5ad9596631190fe80818980a,diningroom,071547a9adda4a8e5ad9596631190fe80818980a.jpg,./data/test_data/diningroom/071547a9adda4a8e5a...
2467,0717ec2d198dbee1c3f0d90e506c8e559e5d5749,front,0717ec2d198dbee1c3f0d90e506c8e559e5d5749.jpg,./data/test_data/front/0717ec2d198dbee1c3f0d90...
4065,05fef1dc1b679b23a49f39a34b61f24d2b65583f,livingroom,05fef1dc1b679b23a49f39a34b61f24d2b65583f.jpg,./data/test_data/livingroom/05fef1dc1b679b23a4...
788,05d782166995f57c456d8a29904e269386d8610e,bedroom,05d782166995f57c456d8a29904e269386d8610e.jpg,./data/test_data/bedroom/05d782166995f57c456d8...
3451,0697a1f885d01e5f9298c52ca7c23a71817706e5,kitchen,0697a1f885d01e5f9298c52ca7c23a71817706e5.jpg,./data/test_data/kitchen/0697a1f885d01e5f9298c...
4312,0813cfda8c174f8eef6baef617438b214865cc5a,livingroom,0813cfda8c174f8eef6baef617438b214865cc5a.jpg,./data/test_data/livingroom/0813cfda8c174f8eef...
677,052136ac1eaa05d6d08e6e4988d40fd7fe95f221,bedroom,052136ac1eaa05d6d08e6e4988d40fd7fe95f221.jpg,./data/test_data/bedroom/052136ac1eaa05d6d08e6...
2250,001f5071144bf10ce0922ce3168595f23a219b29,front,001f5071144bf10ce0922ce3168595f23a219b29.jpg,./data/test_data/front/001f5071144bf10ce0922ce...
2625,08b0f89fd56f01593d7291e487f2838f32a437f1,front,08b0f89fd56f01593d7291e487f2838f32a437f1.jpg,./data/test_data/front/08b0f89fd56f01593d7291e...


In [3]:
# finds any duplicates
id_val_counts = test_data_cat['id'].value_counts()
duplicates = id_val_counts.loc[id_val_counts > 1]
duplicates

Series([], Name: id, dtype: int64)

In [24]:
# checks on duplicates remaining
# should return an empty series

id_val_counts_2 = base_data_cat['id'].value_counts()
id_val_counts_2.loc[id_val_counts_2 > 1]


Series([], Name: id, dtype: int64)

In [25]:
print(base_data_cat.shape)
print(base_data_cat.info())

(20529, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20529 entries, 0 to 20528
Data columns (total 4 columns):
id            20529 non-null object
room          20529 non-null object
filename      20529 non-null object
image_path    20529 non-null object
dtypes: object(4)
memory usage: 641.6+ KB
None


### 3. MANUAL STEP

Go throw each folder, and move anything where label is incorrect / innappropriate into
'uncertain' folder.


bathroom -  DONE   
bedroom -  DONE   
conservatory - DONE   
diningroom -  DONE     
empty -   DONE    
entrance -  DONE   
kitchen - DONE   
livingroom - DONE    
(2 duplicate removed)    
misc_int -   DONE 
study - DONE   
carpark - DONE   
front - DONE
(2 duplicate deleted)    
garden - DONE (lots uncertain)   
graphic - DONE   
misc_ext - DONE   
read - DONE




### 4. WHEN MANUAL SORTING COMPLETE 

Re-catalogue with images in 'uncertain' folder

In [4]:
uncertain_data_cat = nutil.build_catalogue('./data/test_data/')
uncertain_data_cat.to_csv(os.path.join(out_dir, 'uncertain_test_data.csv'))

In [5]:
# filter down to JUST uncertain rooms
uncertain_data_cat = uncertain_data_cat.loc[uncertain_data_cat['room'] == 'uncertain']
uncertain_data_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 5168 to 5228
Data columns (total 4 columns):
id            61 non-null object
room          61 non-null object
filename      61 non-null object
image_path    61 non-null object
dtypes: object(4)
memory usage: 2.4+ KB


In [6]:
# then merge (on id) with base_data_cat to get 'original' room

uncert_with_base = pd.merge(uncertain_data_cat, test_data_cat, 
                            how='left', on=['id', 'filename'], 
                            suffixes=('_uncertain', '_base'))

uncert_with_base = uncert_with_base[['id', 'room_uncertain', 'room_base',
                                     'filename','image_path_uncertain',
                                     'image_path_base']]

uncert_with_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 60
Data columns (total 6 columns):
id                      61 non-null object
room_uncertain          61 non-null object
room_base               61 non-null object
filename                61 non-null object
image_path_uncertain    61 non-null object
image_path_base         61 non-null object
dtypes: object(6)
memory usage: 3.3+ KB


In [7]:
uncert_with_base.head(12)

Unnamed: 0,id,room_uncertain,room_base,filename,image_path_uncertain,image_path_base
0,044b375c47db713c7ae3082afcad912475450009,uncertain,garden,044b375c47db713c7ae3082afcad912475450009.jpg,./data/test_data/uncertain/044b375c47db713c7ae...,./data/test_data/garden/044b375c47db713c7ae308...
1,051e05e35a4daabf3a45ca74d21cd61c0e3994e7,uncertain,misc_int,051e05e35a4daabf3a45ca74d21cd61c0e3994e7.jpg,./data/test_data/uncertain/051e05e35a4daabf3a4...,./data/test_data/misc_int/051e05e35a4daabf3a45...
2,0524a32869b2aa89a22dd75bfcad196b2ea54bac,uncertain,garden,0524a32869b2aa89a22dd75bfcad196b2ea54bac.jpg,./data/test_data/uncertain/0524a32869b2aa89a22...,./data/test_data/garden/0524a32869b2aa89a22dd7...
3,052fc4c0c011640fcb89bcc65ec9418b6a60aeb7,uncertain,garden,052fc4c0c011640fcb89bcc65ec9418b6a60aeb7.jpg,./data/test_data/uncertain/052fc4c0c011640fcb8...,./data/test_data/garden/052fc4c0c011640fcb89bc...
4,0533cb32f4da11ceb879331498eb74fa433aec9e,uncertain,garden,0533cb32f4da11ceb879331498eb74fa433aec9e.jpg,./data/test_data/uncertain/0533cb32f4da11ceb87...,./data/test_data/garden/0533cb32f4da11ceb87933...
5,0539618c63187566f87dc05c79c48f9156de104e,uncertain,misc_int,0539618c63187566f87dc05c79c48f9156de104e.jpg,./data/test_data/uncertain/0539618c63187566f87...,./data/test_data/misc_int/0539618c63187566f87d...
6,0540c34de6290e11a1cd378568e5e5d3280fa3db,uncertain,garden,0540c34de6290e11a1cd378568e5e5d3280fa3db.jpg,./data/test_data/uncertain/0540c34de6290e11a1c...,./data/test_data/garden/0540c34de6290e11a1cd37...
7,054e066672f430334c2b608a4dc3f3cecd091e95,uncertain,garden,054e066672f430334c2b608a4dc3f3cecd091e95.jpg,./data/test_data/uncertain/054e066672f430334c2...,./data/test_data/garden/054e066672f430334c2b60...
8,054eb7cedf6efb260e80e1b4fa355e40b0cc247e,uncertain,garden,054eb7cedf6efb260e80e1b4fa355e40b0cc247e.jpg,./data/test_data/uncertain/054eb7cedf6efb260e8...,./data/test_data/garden/054eb7cedf6efb260e80e1...
9,054fa0301ad6bacb8c1e34401c1858e1f6f95554,uncertain,garden,054fa0301ad6bacb8c1e34401c1858e1f6f95554.jpg,./data/test_data/uncertain/054fa0301ad6bacb8c1...,./data/test_data/garden/054fa0301ad6bacb8c1e34...


In [8]:
#uncert_with_base.to_csv(os.path.join(out_dir, 'uncert_with_test.csv'))



### 6. After re-classifying 'uncertain' images - catalogue again


In [13]:
reclassified_test_data_cat = nutil.build_catalogue('./data/test_data/')
#reclassified_test_data_cat.to_csv(os.path.join(out_dir, 'reclassified_test_data.csv'))

In [14]:
reclassified_test_data_cat

Unnamed: 0,id,room,filename,image_path
0,003457d1fb62155b4531aba3e5b39f4d57bed9f9,bathroom,003457d1fb62155b4531aba3e5b39f4d57bed9f9.jpg,./data/test_data/bathroom/003457d1fb62155b4531...
1,0038cad4553a9922d24a70a4d77abb9d2c1261f9,bathroom,0038cad4553a9922d24a70a4d77abb9d2c1261f9.jpg,./data/test_data/bathroom/0038cad4553a9922d24a...
2,003f5052b875f64f4f5ce99948ae87f35d44b009,bathroom,003f5052b875f64f4f5ce99948ae87f35d44b009.jpg,./data/test_data/bathroom/003f5052b875f64f4f5c...
3,009e866e3eb114c3bc8579d780a4c333ddd38da9,bathroom,009e866e3eb114c3bc8579d780a4c333ddd38da9.jpg,./data/test_data/bathroom/009e866e3eb114c3bc85...
4,00da56e13e015b383effff3fe8560ae7fa7cb1f9,bathroom,00da56e13e015b383effff3fe8560ae7fa7cb1f9.jpg,./data/test_data/bathroom/00da56e13e015b383eff...
5,00fd9ed80bc7fd342aa288ac089b65a727a699a9,bathroom,00fd9ed80bc7fd342aa288ac089b65a727a699a9.jpg,./data/test_data/bathroom/00fd9ed80bc7fd342aa2...
6,011c734ba3011c3761772cda9eca6f711ed49919,bathroom,011c734ba3011c3761772cda9eca6f711ed49919.jpg,./data/test_data/bathroom/011c734ba3011c376177...
7,01316046c5542bdee356b63e000ce2be4b1d85b9,bathroom,01316046c5542bdee356b63e000ce2be4b1d85b9.jpg,./data/test_data/bathroom/01316046c5542bdee356...
8,013d33d80424a7ed689cbfcc3bb076d8e5062ac9,bathroom,013d33d80424a7ed689cbfcc3bb076d8e5062ac9.jpg,./data/test_data/bathroom/013d33d80424a7ed689c...
9,0165472f27f0a22ac7c7fb4384824e9f75d070f9,bathroom,0165472f27f0a22ac7c7fb4384824e9f75d070f9.jpg,./data/test_data/bathroom/0165472f27f0a22ac7c7...


In [15]:
reclass_with_base_and_uncertain = pd.merge(uncert_with_base, reclassified_test_data_cat,
                                          how='left', on=['id', 'filename'],
                                          suffixes=('_','_reclass'))

reclass_with_base_and_uncertain.rename(columns={'room':'room_reclass', 'image_path':'image_path_reclass'}, inplace=True)

reclass_with_base_and_uncertain = reclass_with_base_and_uncertain[['id', 'room_reclass','room_base',
                                                                    'room_uncertain','filename', 
                                                                  'image_path_reclass', 'image_path_base',
                                                                  'image_path_uncertain']]

In [16]:
#reclass_with_base_and_uncertain.to_csv(os.path.join(out_dir, 'reclass_test_and_test_orig.csv'))

In [17]:
reclass_with_base_and_uncertain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 60
Data columns (total 8 columns):
id                      61 non-null object
room_reclass            61 non-null object
room_base               61 non-null object
room_uncertain          61 non-null object
filename                61 non-null object
image_path_reclass      61 non-null object
image_path_base         61 non-null object
image_path_uncertain    61 non-null object
dtypes: object(8)
memory usage: 4.3+ KB


In [18]:
reclass_with_base_and_uncertain.columns

Index(['id', 'room_reclass', 'room_base', 'room_uncertain', 'filename',
       'image_path_reclass', 'image_path_base', 'image_path_uncertain'],
      dtype='object')

In [19]:
# check there are no images reclassified as original label

reclass_with_base_and_uncertain.loc[reclass_with_base_and_uncertain['room_reclass'] == 
                                    reclass_with_base_and_uncertain['room_base']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 8 columns):
id                      0 non-null object
room_reclass            0 non-null object
room_base               0 non-null object
room_uncertain          0 non-null object
filename                0 non-null object
image_path_reclass      0 non-null object
image_path_base         0 non-null object
image_path_uncertain    0 non-null object
dtypes: object(8)
memory usage: 0.0+ bytes
