In [1]:
import ast
import datetime
import glob
import html
import json
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import tqdm
from PIL import Image

sys.path.append('../')
import warnings
warnings.filterwarnings('ignore')

from src.vision.transientrgs import TransientRegressService
from src.vision.iodoorcls import InOutdoorClassifyService

# Create label data frame

In [2]:
# Fetch both service instances and show them, one per line, as a sanity
# check that the project imports resolved.
print(
    InOutdoorClassifyService.get_instance(),
    TransientRegressService.get_instance(),
    sep='\n',
)

<src.vision.iodoorcls.InOutdoorClassifyInstance object at 0x000001959D2DC310>
<src.vision.transientrgs.TransientRegressInstance object at 0x000001959D2DC1C0>


In [34]:
def extract_attributes(img):
    """Run both vision services on one image and bundle their predictions.

    Args:
        img: Image reference handed unchanged to each service's ``predict``
            (the notebook passes the file paths stored in ``df['img']``).

    Returns:
        dict with two keys: ``'place_data'`` holds whatever
        ``InOutdoorClassifyService`` predicts and ``'transient_data'`` holds
        whatever ``TransientRegressService`` predicts.
    """
    # A dict literal evaluates its values in order, so the place prediction
    # still runs before the transient one.
    return {
        'place_data': InOutdoorClassifyService.get_instance().predict(img),
        'transient_data': TransientRegressService.get_instance().predict(img),
    }

def get_outdoor(attr, min_votes=3):
    """Decide whether an image is outdoor from its place predictions.

    Args:
        attr: Attribute dict with a ``'place_data'`` list whose entries carry
            an ``'ioclass'`` key, or the string form of such a dict (either
            the Python ``str(dict)`` repr stored in the TSV, or JSON).
        min_votes: Minimum number of ``'outdoor'`` predictions required.
            The default of 3 reproduces the original ``> 2`` rule.

    Returns:
        True when at least ``min_votes`` entries are classified as outdoor,
        otherwise False.

    Raises:
        ValueError: If ``attr`` is a string that is neither a Python literal
            nor valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if isinstance(attr, str):
        # Swapping quotes and feeding the result to json.loads breaks on
        # apostrophes inside values and on True/False/None, so parse the
        # dict repr as a Python literal first and only then fall back to JSON.
        try:
            attr = ast.literal_eval(attr)
        except (ValueError, SyntaxError):
            attr = json.loads(attr)
    outdoor_votes = sum(p['ioclass'] == 'outdoor' for p in attr['place_data'])
    return outdoor_votes >= min_votes

def read_exif(x):
    """Return the EXIF text that accompanies an image, or '' if unavailable.

    The EXIF path is derived from the image path: the ``mirflickr25k``
    directory (followed by a backslash) is redirected to its ``meta``/``exif``
    sub-folder, and in the file name ``jpg`` becomes ``txt`` and ``im``
    becomes ``exif``.

    Args:
        x: Image path using backslash separators, as stored in ``df['img']``.

    Returns:
        The file contents as a string, or '' when ``x`` is not a string or
        the file cannot be opened, read or decoded.
    """
    if not isinstance(x, str):
        return ''
    exif_path = x.replace('mirflickr25k\\', 'mirflickr25k\\meta\\exif\\')
    # Rename only the final path component; running the 'jpg'/'im'
    # substitutions over the whole path would also rewrite any directory
    # whose name happens to contain those letters.
    split_at = max(exif_path.rfind('\\'), exif_path.rfind('/')) + 1
    folder, file_name = exif_path[:split_at], exif_path[split_at:]
    file_name = file_name.replace('jpg', 'txt').replace('im', 'exif')
    try:
        # The context manager closes the handle, which the original leaked.
        with open(folder + file_name, 'r') as exif_file:
            return exif_file.read()
    except (OSError, ValueError):
        # Missing or unreadable file, undecodable bytes, or an invalid path:
        # stay best-effort and report "no EXIF".
        return ''
    
def extract_oldest_time(exif):
    """Return the earliest timestamp found in an EXIF text dump.

    Args:
        exif: EXIF text as returned by ``read_exif``. Empty or non-string
            values (for example NaN after reloading the TSV) yield ''.

    Returns:
        The oldest ``YYYY:MM:DD HH:MM:SS`` timestamp in the text, formatted
        the same way, or '' when no valid timestamp is present.
    """
    if not exif or not isinstance(exif, str):
        return ''
    stamp_format = '%Y:%m:%d %H:%M:%S'
    moments = []
    for candidate in re.findall(r'\d{4}:\d{2}:\d{2} \d{2}:\d{2}:\d{2}', exif):
        try:
            moments.append(datetime.datetime.strptime(candidate, stamp_format))
        except ValueError:
            # The pattern also matches impossible dates such as
            # '0000:00:00 00:00:00'; skip those.
            continue

    if not moments:
        return ''

    # The oldest datetime in the file.
    return min(moments).strftime(stamp_format)

In [4]:
# Root folder of the MIRFLICKR download. Set the MIRFLICKR_DIR environment
# variable to point elsewhere instead of editing the notebook; the default
# keeps the original location.
data_dir = os.environ.get('MIRFLICKR_DIR', 'G:\\My Drive\\MIRFLICKR\\')

In [5]:
# glob returns matches in arbitrary order; sort them so the row order (and
# therefore the frame index) is reproducible from run to run.
imgs = sorted(glob.glob(os.path.join(data_dir, 'mirflickr25k', '*.jpg')))
df = pd.DataFrame({'img': imgs})

In [6]:
# Run the vision services once per image path.
df['attr'] = df['img'].apply(extract_attributes)

In [7]:
# Load the EXIF text that belongs to each image ('' when unavailable).
df['exif'] = df['img'].apply(read_exif)

In [8]:
# Earliest timestamp mentioned in each EXIF dump ('' when none is found).
df['time'] = df['exif'].apply(extract_oldest_time)

In [35]:
# Outdoor flag derived from the place predictions.
df['is_outdoor'] = df['attr'].apply(get_outdoor)

# Or load an existing label data frame

In [2]:
# Reload the previously saved labels; the first column holds the row index.
df = pd.read_table('label.tsv', index_col=0)

# Filter images (keep outdoor images that have a timestamp)

In [3]:
# Keep only outdoor images that have a timestamp. Later cells assign new
# columns to this frame, so take an explicit copy rather than working on a
# slice of the loaded frame (that is what triggers SettingWithCopyWarning).
df = df[(df['time'].notna()) & (df['is_outdoor'] == True)].copy()

In [4]:
# Row count after filtering, then a preview of the first rows.
print(df.shape[0])
df.head(n=5)

10697


Unnamed: 0,img,attr,exif,time,is_outdoor
2,G:\My Drive\MIRFLICKR\mirflickr25k\im10037.jpg,"{'place_data': [{'class': 'pond', 'ioclass': '...",-Image Width\n2288\n-Image Length\n2373\n-Bits...,2007:11:24 17:06:09,True
5,G:\My Drive\MIRFLICKR\mirflickr25k\im1001.jpg,"{'place_data': [{'class': 'industrial_area', '...",-Make\nPanasonic\n-Model\nDMC-FX12\n-Orientati...,2008:05:26 14:35:17,True
6,G:\My Drive\MIRFLICKR\mirflickr25k\im1.jpg,"{'place_data': [{'class': 'picnic_area', 'iocl...",-Image Width\n2769\n-Image Length\n2769\n-Bits...,2008:06:21 16:12:37,True
8,G:\My Drive\MIRFLICKR\mirflickr25k\im100.jpg,"{'place_data': [{'class': 'rice_paddy', 'iocla...",-Image Description\nThe field filled with a qu...,2008:06:22 07:58:38,True
9,G:\My Drive\MIRFLICKR\mirflickr25k\im1000.jpg,"{'place_data': [{'class': 'sky', 'ioclass': 'o...",-Make\nNIKON CORPORATION\n-Model\nNIKON D50\n-...,2008:06:05 02:25:33,True


In [5]:
# The TSV stores each attribute dict as its Python repr. Parse it with
# ast.literal_eval, which accepts only literals, instead of eval, which would
# execute arbitrary code found in the file. Values that are already parsed
# are left alone so the cell can be re-run safely.
df['attr'] = df['attr'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [8]:
# Turn the 'YYYY:MM:DD HH:MM:SS' strings into datetime objects.
df['time'] = df['time'].map(
    lambda stamp: datetime.datetime.strptime(stamp, '%Y:%m:%d %H:%M:%S')
)

In [21]:
def is_daylight_or_night(t):
    """Bucket a timestamp into 'midday', 'daylight' or 'night' by its hour.

    Args:
        t: Object exposing an ``hour`` attribute (a datetime in this notebook).

    Returns:
        'midday' when the hour is strictly between 10 and 15, 'daylight' for
        any other hour strictly between 4 and 17, and 'night' otherwise.
    """
    hour = t.hour
    if not (4 < hour < 17):
        return 'night'
    return 'midday' if 10 < hour < 15 else 'daylight'

def check_unmatch_time(r):
    """Report whether the EXIF time of day agrees with the transient labels.

    Args:
        r: Row with a ``'time'`` value accepted by ``is_daylight_or_night``
            and an ``'attr'`` dict whose ``'transient_data'`` entries each
            carry a ``'class'`` string.

    Returns:
        'matched' when the time-of-day bucket for ``r['time']`` equals the
        part after the first '/' of some class containing 'time/', otherwise
        'unmatched'.
    """
    expected = is_daylight_or_night(r['time'])
    time_labels = []
    for entry in r['attr']['transient_data']:
        label = entry['class']
        if 'time/' in label:
            time_labels.append(label.split('/')[1])
    return 'matched' if expected in time_labels else 'unmatched'

In [22]:
# Row-wise check: does the EXIF hour agree with the predicted time labels?
df['match'] = df.apply(check_unmatch_time, axis='columns')

In [32]:
# Time of day as a fraction of the 86400 seconds in a day.
df['norm_time'] = df['time'].map(
    lambda moment: (moment.hour * 3600 + moment.minute * 60 + moment.second) / 86400
)

In [33]:
# Keep only the rows whose EXIF time agrees with the predicted time labels.
filter_df = df.loc[df['match'].eq('matched')]

In [35]:
# Preview the first matched rows.
filter_df.head(n=5)

Unnamed: 0,img,attr,exif,time,is_outdoor,match,norm_time
6,G:\My Drive\MIRFLICKR\mirflickr25k\im1.jpg,"{'place_data': [{'class': 'picnic_area', 'iocl...",-Image Width\n2769\n-Image Length\n2769\n-Bits...,2008-06-21 16:12:37,True,matched,0.675428
8,G:\My Drive\MIRFLICKR\mirflickr25k\im100.jpg,"{'place_data': [{'class': 'rice_paddy', 'iocla...",-Image Description\nThe field filled with a qu...,2008-06-22 07:58:38,True,matched,0.332384
10,G:\My Drive\MIRFLICKR\mirflickr25k\im10000.jpg,"{'place_data': [{'class': 'embassy', 'ioclass'...",-Make\nCanon\n-Model\nCanon EOS 20D\n-X-Resolu...,2007-11-29 01:05:59,True,matched,0.045822
21,G:\My Drive\MIRFLICKR\mirflickr25k\im10101.jpg,"{'place_data': [{'class': 'rock_arch', 'ioclas...",-Make\nNIKON\n-Model\nCOOLPIX L1\n-Orientation...,2007-11-25 05:06:16,True,matched,0.212685
25,G:\My Drive\MIRFLICKR\mirflickr25k\im101.jpg,"{'place_data': [{'class': 'hospital', 'ioclass...",-Image Description\nSONY DSC\n-Make\nSONY\n-Mo...,2008-06-28 22:01:52,True,matched,0.917963


In [34]:
# Persist the matched rows as a tab-separated training label file.
filter_df.to_csv(path_or_buf='label_train.tsv', sep='\t')

# Generate HTML for labeling

In [28]:
# Page prologue for the labeling HTML: table styling, the JavaScript that
# collects the ids of rows marked for removal and offers them as a
# 'download.txt' file, and the opening of the table with its column headings.
# The <style> element lives inside <head>; placing it between <html> and
# <head> is not valid document structure.
header = '''
<!DOCTYPE html>
<html>
<head>
  <style>
    table, th, td {
      border:1px solid black;
    }
  </style>
  <script type="text/javascript">
    var remove_files = [];
    function add_to_remove(file_path){
      document.getElementById(file_path).disabled = true;
      remove_files.push(file_path);
      //alert(remove_files);
    };

    function download() {
      var text = remove_files.join('\\n');
      var element = document.createElement('a');
      element.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(text));
      element.setAttribute('download', 'download.txt');

      element.style.display = 'none';
      document.body.appendChild(element);

      element.click();

      document.body.removeChild(element);
    };
  </script>
</head>
<body>

<h2>HTML View Image Time</h2>

<table style="width:100%">
  <tr>
    <th>Image</th>
    <th>Time</th>
    <th>Label</th>
  </tr>
'''
# Page epilogue: closes the table and the document.
footer = '''
</table>
</body>
</html>
'''

In [29]:
def generate_html(df, page_size=1000):
    """Write the labeling table as paged HTML files in the working directory.

    Rows are emitted in ``df`` order, ``page_size`` rows per file. Each file
    is named ``view_image_time_<k>.html``, where ``k`` is the 1-based position
    of the last row it contains, and wraps its rows in the module-level
    ``header`` and ``footer`` strings. A final, shorter page is written for
    any rows left over after the last full page; the original dropped them.

    Args:
        df: DataFrame with ``'img'`` and ``'time'`` columns.
        page_size: Rows per HTML file. Defaults to 1000, the original size.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    row_template = '''
            <tr>
              <td><p>{}</p><img src="{}"></td>
              <td>{}</td>
              <td><button onclick="add_to_remove(this.id);" id="{}">Remove</button><br>
                  <button onclick="download();">Download</button></td>
            </tr>
        '''

    def write_page(last_idx, rows):
        # One complete document per page: header + rows + footer.
        with open('view_image_time_' + str(last_idx) + '.html', 'w') as page:
            page.write(header + ''.join(rows) + footer)

    rows = []
    idx = 0
    for idx, (_, row) in enumerate(df.iterrows(), start=1):
        # Escape values before placing them in markup and attribute values so
        # characters such as & or quotes cannot break the page. The browser
        # decodes them again, so the button id is still the original path.
        img = html.escape(str(row['img']))
        taken_at = html.escape(str(row['time']))
        rows.append(row_template.format(img, img, taken_at, img))

        if idx % page_size == 0:
            write_page(idx, rows)
            rows = []

    if rows:
        # Leftover rows that did not fill a whole page.
        write_page(idx, rows)

In [30]:
# Render the matched rows into the paged labeling HTML files.
generate_html(df=filter_df)