In [1]:
import csv
import os
import sys
import time
import urllib
import urllib.parse

import pandas as pd
import tensorflow as tf

import help_functions as hf
from configs import configs


Using TensorFlow backend.


In [5]:

hf.setup_gpu(gpu_nr=0)  # if one of the GPUs is busy, choose the other one (0 or 1)

# ===========================================
# =========== HYPER-PARAMETERS ==============
# ===========================================

# Run configuration: the first entry of the project's `configs` list.
config = configs[0]
print(config)

# makedirs(exist_ok=True) instead of mkdir: the results path is nested
# (parent folders may not exist yet) and re-running this cell must not fail
# with FileExistsError. An already existing folder is reused as-is, so clear
# it manually if the previous run's files must not be mixed with this one.
os.makedirs(config['results_folder'], exist_ok=True)

# Save outputs to log file
# old_stdout = sys.stdout
# log_file = open(config['results_folder'] + '/log.txt', 'w')
# sys.stdout = log_file


# ============================================
# ================= LOAD DATA ================
# ============================================
# Training split: only the flow is kept, the second value returned by
# hf.get_flow is discarded.
train, _ = hf.get_flow(
    image_dimension=config['image_dimension'],
    batch_size=config['batch_size'],
    df_file=config['data_folder'] + '/train_df.json.bz2',
)
print('LOG: finished getting training flow')

# Target array for the training split, sized (number of samples, number of classes).
y_true = hf.get_y_true(
    classes=train.classes,
    shape=(train.samples, len(train.class_indices)),
)

# Validation split: both the flow and its dataframe are kept.
val_stop, val_stop_df = hf.get_flow(
    image_dimension=config['image_dimension'],
    batch_size=config['batch_size'],
    df_file=config['data_folder'] + '/val_df.json.bz2',
)
print(f'val_stop_df shape: {val_stop_df.shape}')
print('LOG: Got the validation flow')


2 Physical GPUs, 1 Logical GPU
{'batch_size': 64, 'epochs': 100, 'image_dimension': 64, 'monitor': 'pr_auc', 'random_initialization': False, 'class_weights': False, 'hierarchical': False, 'loss_function': 'sample_weight', 'data_folder': '/home/matvieir/wiki_image_classification/src/classification/data/jpg-data', 'results_folder': 'results_paper/230703_sample_weight_100epochs'}
Found 70000 validated image filenames belonging to 28 classes.
LOG: finished getting training flow
Found 7500 validated image filenames belonging to 28 classes.
val_stop_df shape: (7500, 5)
LOG: Got the validation flow


In [6]:
# Load human-labeled set
def get_human_flow(human_df_address, images_root='/scratch/WIT_Dataset/images/',
                   batch_size=None, image_dimension=None):
    """Build the data flow for the human-annotated evaluation set.

    Reads the annotated parquet file, rewrites every Wikimedia Commons URL
    into a local path under ``images_root`` and hands the resulting frame to
    ``hf.get_flow``.

    Args:
        human_df_address: Path of the parquet file. It must provide the
            ``labels`` and ``url`` columns.
        images_root: Prefix prepended to the part of each URL that follows
            ``/wikipedia/commons/``.
        batch_size: Forwarded to ``hf.get_flow``; defaults to
            ``config['batch_size']``.
        image_dimension: Forwarded to ``hf.get_flow``; defaults to
            ``config['image_dimension']``.

    Returns:
        The ``(flow, dataframe)`` pair returned by ``hf.get_flow``.

    Raises:
        IndexError: If a URL does not contain ``/wikipedia/commons/``.
    """
    # Fall back to the notebook-level run configuration when not given explicitly.
    if batch_size is None:
        batch_size = config['batch_size']
    if image_dimension is None:
        image_dimension = config['image_dimension']

    human_df = pd.read_parquet(human_df_address)

    # Turn every labels value into a plain Python list (the original note says
    # the column would otherwise be a list of lists). Series.map is used
    # instead of a row-wise DataFrame.apply, whose result shape depends on the
    # list-likes the function returns.
    human_df['labels'] = human_df['labels'].map(list)

    # Keep only the part of the URL after the Commons prefix and re-root it locally.
    human_df['url'] = human_df['url'].map(
        lambda url: images_root + url.split('/wikipedia/commons/')[1])

    # NOTE(review): encode().decode('unicode-escape') resolves literal backslash
    # escapes, but it also re-reads the UTF-8 bytes of non-ASCII characters as
    # Latin-1. Left unchanged because the filenames on disk may depend on it --
    # confirm against the image folder.
    human_df['url'] = human_df['url'].map(
        lambda encoded_filename: urllib.parse.unquote(encoded_filename).encode().decode('unicode-escape'))

    print(f'----------------------- \nhuman_df shape: {human_df.shape}\n\n\n')
    human, human_df = hf.get_flow(df=human_df, batch_size=batch_size, image_dimension=image_dimension)
    return human, human_df
human, human_df = get_human_flow('../../data/evaluation/annotated_validation.parquet')

----------------------- 
human_df shape: (327, 5)



Found 285 validated image filenames belonging to 28 classes.




In [7]:
# ============================================
# ============= CREATE MODEL =================
# ============================================
print('LOG: creating model')
start = time.time()  # timestamp taken right before the model is built

# Every setting comes from the run config; the number of output labels is the
# number of classes known to the training flow.
model = hf.create_model(
    y_true=y_true,
    loss=config['loss_function'],
    random_initialization=config['random_initialization'],
    image_dimension=config['image_dimension'],
    n_labels=len(train.class_indices),
)

LOG: creating model

Number of layers in basemodel: 339
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb2 (Functional)  (None, 2, 2, 1408)        7768569   
_________________________________________________________________
flatten (Flatten)            (None, 5632)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               721024    
_________________________________________________________________
dense_1 (Dense)              (None, 28)                3612      
Total params: 8,493,205
Trainable params: 8,425,630
Non-trainable params: 67,575
_________________________________________________________________


In [69]:
# train.reset()
# val_stop.reset()
# human.reset()

In [8]:
# import csv
# class EvaluateCallback(tf.keras.callbacks.Callback):
#     def __init__(self, human_dataflow, val_dataflow, history_csv_path):
#         super(EvaluateCallback, self).__init__()
#         # self.human_dataflow = human_dataflow
#         # self.val_dataflow = val_dataflow
#         self.history_csv_path = history_csv_path
#         print('model summary from EvaluateCallback')
#         model.summary()

#     def on_epoch_end(self, epoch, logs=None):

#         print('\n---------- PREDICTING VAL ---------')
#         # val_evaluation_results = model.evaluate(val_stop, verbose=1)
#         # print(f'\nval_evaluation_results: {val_evaluation_results}')


#         print('\n---------- PREDICTING HUMAN ---------')
#         human_evaluation_results = model.evaluate(human, verbose=1)
#         print(f'\nhuman_evaluation_results: {human_evaluation_results}')
#         # human_loss = human_evaluation_results[0]
#         # human_metrics_values = human_evaluation_results[1:]

#         # with open(self.history_csv_path, 'a') as file:
#         #     writer = csv.writer(file)
#         #     writer.writerow([epoch] + [human_loss] + human_metrics_values)

class EvaluateCallback(tf.keras.callbacks.Callback):
    """Evaluate the model on an extra data flow after every epoch.

    The evaluation result is printed and appended as one row
    ``[epoch, loss, *metrics]`` to a CSV file. No header row is written.

    Args:
        human_dataflow: Data flow passed to ``model.evaluate`` each epoch.
        history_csv_path: CSV file the rows are appended to.
    """

    def __init__(self, human_dataflow, history_csv_path):
        super().__init__()
        self.human_dataflow = human_dataflow
        self.history_csv_path = history_csv_path

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``human_dataflow`` and append the result to the CSV file."""
        print('\n---------- PREDICTING HUMAN ---------')
        human_evaluation_results = self.model.evaluate(self.human_dataflow, verbose=1)
        print(f'\nhuman_evaluation_results: {human_evaluation_results}')

        # evaluate() returns a bare scalar (loss only) when the model was
        # compiled without metrics, and a list [loss, metric, ...] otherwise.
        if isinstance(human_evaluation_results, (list, tuple)):
            human_loss, *human_metrics_values = human_evaluation_results
        else:
            human_loss, human_metrics_values = human_evaluation_results, []

        # newline='' is what the csv module requires for files it writes to;
        # without it, extra blank lines appear on platforms that translate '\n'.
        with open(self.history_csv_path, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([epoch, human_loss, *human_metrics_values])

In [9]:
# Length of each flow (train, validation, human-labeled), one value per line.
for flow in (train, val_stop, human):
    print(len(flow))

1094
118
5


In [11]:
# ============================================
# ================ TRAIN MODEL ===============
# ============================================

# Keras' own per-epoch log, appended to history.csv in the results folder.
history_callback = tf.keras.callbacks.CSVLogger(
    f"{config['results_folder']}/history.csv",
    append=True,
    separator=',',
)
# Extra per-epoch evaluation on the human-labeled flow, logged to its own file.
evaluate_callback = EvaluateCallback(
    history_csv_path=f"{config['results_folder']}/history_human.csv",
    human_dataflow=human,
)

history = model.fit(
    train,
    epochs=1,
    steps_per_epoch=50,
    # validation_data=val_stop,
    verbose=1,  # 1 = progress bar (2 would print one line per epoch)
    callbacks=[history_callback, evaluate_callback],
)

---------- PREDICTING HUMAN ---------

human_evaluation_results: [6.135798931121826, 0.619172990322113, 0.13159547746181488, 0.6046175956726074, 0.1448349952697754]
