
# Frame-Level Models
 - DBOF(audio, rgb, audio+rgb)
 - MLP (audio, rgb, audio+rgb)

In [1]:
#Import libraries

import sys, os.path
FOLDER = os.path.abspath(os.path.join(os.getcwd() ,"../"))
metric_dir = (FOLDER+ '/metrics')
sys.path.append(metric_dir)
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate
from keras.optimizers import adam_v2,gradient_descent_v2
from keras.layers.advanced_activations import LeakyReLU
import numpy as np
import pandas as pd
import glob
import os
from time import time
import tensorflow as tf
from keras.layers import GlobalAveragePooling1D
from keras.callbacks import EarlyStopping
from keras.metrics import TopKCategoricalAccuracy
from keras.callbacks import EarlyStopping, Callback
import wandb
from report import report_performance,make_top_n_pred_df

In [None]:
# Use Weights & Biases (wandb) to log the runs in this notebook:
# authenticate, then open a run in the "my-test-project" project.
wandb.login()
wandb.init(project="my-test-project")

In [13]:
# Label vocabulary table, read from the working directory; its `Name` column
# is read by the ValLog callback further down.
label_dict = pd.read_csv("vocabulary.csv")
# sample_frame draws n_frame_sample random frames (with replacement) from each
# video's frame list (audio or rgb); it samples, it does not average.
def sample_frame(frame_data,n_frame_sample):
  """Randomly pick ``n_frame_sample`` frames from every video.

  Frame indices are drawn with ``np.random.choice`` (default settings, i.e.
  with replacement), so a video with fewer frames than ``n_frame_sample``
  still yields ``n_frame_sample`` rows.

  Args:
    frame_data: iterable of per-video frame sequences.
    n_frame_sample: number of frames to draw per video.

  Returns:
    ``np.ndarray`` stacking the sampled frames of all videos.
  """
  sampled_per_video = []
  for frames in frame_data:
    picks = np.random.choice(len(frames), size=n_frame_sample)
    sampled_per_video.append(np.array(frames)[picks])
  return np.array(sampled_per_video)

# Pre-Processing Records

In [17]:
# Root directory containing the train/, validation/ and test/ folders of
# *tfrecord files. Set the FRAME_DATA_DIR environment variable to point at a
# different location; the fallback is the original hard-coded path.
FOLDER = os.environ.get(
    'FRAME_DATA_DIR',
    '/Users/marlynehakizimana/Documents/SPRING/IDS705/FinalProj/frame/')
def process_records(data,tp='test'):
    """Load frame-level audio/rgb features from the ``*tfrecord`` files in ``FOLDER/tp``.

    Only videos whose label ids are all <= 1000 are kept.

    Args:
        data: unused; kept so existing calls keep working.
        tp: name of the sub-folder of ``FOLDER`` to read.

    Returns:
        Tuple of arrays ``(ids, audio, rgb, labels)``. ``audio`` / ``rgb`` hold,
        per video, one uint8-decoded vector per frame; ``labels`` holds one
        2000-long int8 multi-hot vector per video.
    """
    def _decode(feature):
        # Each frame feature stores its raw bytes as the first bytes_list value.
        return np.array(tf.io.decode_raw(feature.bytes_list.value[0], tf.uint8))

    record_paths = sorted(glob.glob(os.path.join(FOLDER, tp, '*tfrecord')))

    ids, aud, rgbs, lbs = [], [], [], []
    for path in record_paths:
        for raw_record in tf.data.TFRecordDataset(path):
            tf_example = tf.train.SequenceExample()
            tf_example.ParseFromString(raw_record.numpy())
            label_ids = np.array(tf_example.context.feature["labels"].int64_list.value)

            # Drop the video if any of its labels is above 1000.
            if not all(label <= 1000 for label in label_ids):
                continue

            multi_hot = np.zeros(2000).astype(np.int8)
            for label in label_ids:
                multi_hot[label] = 1

            feature_lists = tf_example.feature_lists.feature_list
            rgb_features = feature_lists['rgb'].feature
            audio_features = feature_lists['audio'].feature
            # The rgb list defines the frame count; audio is indexed with it.
            n_frames = len(rgb_features)
            rgb = [_decode(rgb_features[k]) for k in range(n_frames)]
            audio = [_decode(audio_features[k]) for k in range(n_frames)]

            ids.append(tf_example.context.feature["id"].bytes_list.value[0].decode(encoding="UTF-8"))
            lbs.append(multi_hot)
            aud.append(audio)
            rgbs.append(rgb)

    return np.array(ids), np.array(aud), np.array(rgbs), np.array(lbs)

In [None]:
def process_labels(data, tp='test'):
    """Collect the raw label lists and video ids from ``FOLDER/tp``.

    Applies the same filter as ``process_records``: a video is kept only when
    every one of its label ids is <= 1000.

    Args:
        data: unused; kept so existing calls keep working.
        tp: name of the sub-folder of ``FOLDER`` to read.

    Returns:
        ``(ys, ids)`` where ``ys`` is a list of per-video label-id sequences
        and ``ids`` is an array of the matching video id strings.
    """
    record_paths = sorted(glob.glob(os.path.join(FOLDER, tp, '*tfrecord')))
    ids, ys = [], []
    for path in record_paths:
        for raw_record in tf.data.TFRecordDataset(path):
            tf_example = tf.train.SequenceExample()
            tf_example.ParseFromString(raw_record.numpy())
            context = tf_example.context.feature

            label_ids = context["labels"].int64_list.value
            if not all(label <= 1000 for label in label_ids):
                continue
            ids.append(context["id"].bytes_list.value[0].decode(encoding="UTF-8"))
            ys.append(label_ids)

    return ys, np.array(ids) # returns original ids


In [None]:
#input data for each style of feature
def prepare_input(input_type = "rgb",X_rgb_train=None,X_audio_train = None, y_train=None,
                     X_rgb_val=None,X_audio_val= None,y_val=None):
    """Select the training / validation features for one feature style.

    Args:
        input_type: ``"rgb"``, ``"audio"`` or ``"both"``.
        X_rgb_train, X_rgb_val: rgb features for the train / validation split.
        X_audio_train, X_audio_val: audio features for the train / validation split.
        y_train, y_val: targets, returned unchanged.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``. For ``"both"`` the rgb and
        audio features are concatenated (rgb first) along the last axis.

    Raises:
        ValueError: if ``input_type`` is not one of the supported values.
    """
    if input_type == "rgb":
        X_train = X_rgb_train
        X_val = X_rgb_val
    elif input_type == "audio":
        X_train = X_audio_train
        X_val = X_audio_val
    elif input_type == "both":
        # Concatenate on the feature (last) axis. For 2-D inputs this is the
        # same as axis 1; for 3-D frame tensors axis 1 would be the frame axis
        # and the concat would fail because the feature widths differ.
        X_train = tf.concat([X_rgb_train, X_audio_train], -1)
        X_val = tf.concat([X_rgb_val, X_audio_val], -1)
    else:
        raise ValueError(
            "invalid input type: %r (expected 'rgb', 'audio' or 'both')" % (input_type,))
    return X_train,X_val,y_train,y_val

In [None]:
# Parse every split. process_records returns (ids, audio, rgb, labels), so
# x1_* holds the audio features and x2_* the rgb features.
_, x1_val, x2_val, y_val = process_records('validation', 'validation')
_, x1_train, x2_train, y_train = process_records('train', 'train')
idx, x1_test, x2_test, y_test = process_records('test', 'test')
ylabels, ids = process_labels('test', 'test')
y_val_labels, ids1 = process_labels('validation', 'validation')

# Reduce every video to 50 randomly sampled frames (same call order as before:
# val, train, test; audio before rgb).
x1_val, x2_val, x1_train, x2_train, x1_test, x2_test = (
    sample_frame(features, 50)
    for features in (x1_val, x2_val, x1_train, x2_train, x1_test, x2_test)
)

# Bundle each split as [audio, rgb] plus its targets.
X_train = [x1_train, x2_train]
Y_train = y_train
X_val = [x1_val, x2_val]
Y_val = y_val
X_test = [x1_test, x2_test]
Y_test = y_test

# DBOF MODEL for RGB

In [None]:
# Authenticate with Weights & Biases and open a run in the
# "model-classification" project under the "marlhakizi" entity.
wandb.login()
wandb.init(project="model-classification", entity="marlhakizi")

In [None]:
def fc_block(x,n=1024):
    """Frame-level block: Dense(n) -> LeakyReLU -> BatchNorm -> average pool -> Dropout(0.2).

    The ``GlobalAveragePooling1D`` step (``keepdims=False``) removes the
    pooled axis, so the output has one fewer dimension than the input.

    Args:
        x: input Keras tensor.
        n: number of units of the Dense layer.

    Returns:
        The output tensor of the final Dropout layer.
    """
    stack = (
        Dense(n),
        LeakyReLU(),
        BatchNormalization(),
        GlobalAveragePooling1D(data_format='channels_last',keepdims=False),
        Dropout(0.2),
    )
    for layer in stack:
        x = layer(x)
    return x
def build_mod():
    """Build and compile the single-input frame-level model with a (50, 1024) input.

    The learning rate and loss are read from ``wandb.config``; the optimizer is SGD.

    Returns:
        The compiled Keras ``Model`` with a 2000-unit sigmoid output.
    """
    frames_in = Input((50,1024), name='x1')
    pooled = fc_block(frames_in)
    scores = Dense(2000, activation='sigmoid', name='output')(pooled)
    net = Model(inputs=[frames_in], outputs=scores)
    sgd = tf.keras.optimizers.SGD(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=sgd,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

In [None]:
class ValLog(Callback):
  """Custom callback that logs validation information to Weights & Biases
  at the end of each training epoch.

  After every epoch it predicts on ``X_val``, computes the metrics returned by
  ``report_performance`` and logs them together with a table of the top-5
  predictions per video.

  Args:
    X_val: validation features in the form the model's ``predict`` accepts.
    num_log_batches: stored as ``self.num_batches``; not used by this class.

  The validation targets, label lists and video ids are read from the notebook
  globals ``y_val``, ``y_val_labels`` and ``ids1`` when an epoch ends.
  """
  def __init__(self,X_val,num_log_batches=1):
    # Let the Keras base class set up its own attributes.
    super().__init__()
    self.num_batches = num_log_batches
    self.X_val=X_val
    self.flat_class_names = label_dict.Name[:1000].values

  def on_epoch_end(self, epoch, logs=None):
    # Use local names that differ from the globals being read: binding a local
    # called `y_val` from the global `y_val` raises UnboundLocalError.
    val_targets, val_label_lists, val_ids = y_val, y_val_labels, ids1
    # Predict on the data handed to this callback, not on a global.
    val_preds = self.model.predict(self.X_val)
    gAP,PERR, HIT1,F1=report_performance(val_preds,val_targets,verbose = True,thresh_step = 0.01,thresh=0.5)
    wandb.log({"Validation GAP":gAP,"Validation Hit@1":HIT1,"Validation F1-Score":F1})
    pred_df = make_top_n_pred_df(val_ids,val_preds,val_label_lists,top_n_pred =5,get_names=True)
    # NOTE(review): `pesudo_id` is kept as written; confirm it matches the
    # column name produced by make_top_n_pred_df.
    yu=pd.DataFrame({'VideoId':pred_df.pesudo_id,"True labels": [str(i) for i in pred_df.label_true],"Pred labels":[str(i) for i in pred_df.label_pred],'Confidence':pred_df.predict_proba})
    predictions_table = wandb.Table(dataframe = yu)
    wandb.run.log({"validation_dta" : predictions_table})

# Hyper-parameters for the RGB frame-level run; build_mod() reads
# learning_rate and loss from here.
wandb.config={
  "learning_rate": 0.01,
  "epochs": 800,
  "batch_size": 1000,
  "architecture": "Frame_level_pooling",
  "loss":'categorical_crossentropy',
}
# NOTE: despite its name this is the RGB model; the name is kept because the
# following cells refer to `model_audio`.
model_audio = build_mod()
callback = EarlyStopping(monitor='loss', patience=10)
# Select the rgb features from the per-modality arrays (x1_* = audio,
# x2_* = rgb) and bind them to new names, so the [audio, rgb] lists
# X_train / X_val stay intact for the cells that index them later.
X_train_rgb,X_val_rgb,y_train,y_val=prepare_input(input_type = "rgb",X_rgb_train=x2_train,X_audio_train = x1_train, y_train=Y_train,
                     X_rgb_val=x2_val,X_audio_val= x1_val,y_val=Y_val)
# ValLog requires the validation features as its first argument.
model_audio.fit(X_train_rgb,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_rgb,y_val),callbacks=[callback,ValLog(X_val_rgb,num_log_batches=20)])
loss,accuracy=model_audio.evaluate(X_test[1],Y_test)
wandb.log({"Test HitAt1 Accuracy": round(accuracy*100,2)})

In [None]:
# Write the weights to disk and read them back, then score the test set on
# the rgb features (X_test[1]).
model_audio.save_weights('weights.h5')
model_audio.load_weights('weights.h5')
y_predproba_test = model_audio.predict(X_test[1], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)

# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [45]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_audio = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))

In [46]:
# pred_df.to_pickle('data/framelevel_rgb.pkl')
# pred_df_raw_audio.to_pickle('data/framelevel_rgb_raw.pkl')

# DBOF MODEL for AUDIO

In [None]:
def build_mod():
    """Build, summarize and compile the single-input frame-level model with a (50, 128) input.

    The learning rate and loss are read from ``wandb.config``; the optimizer is SGD.

    Returns:
        The compiled Keras ``Model`` with a 2000-unit sigmoid output.
    """
    frames_in = Input((50,128), name='x1')
    pooled = fc_block(frames_in)
    scores = Dense(2000, activation='sigmoid', name='output')(pooled)

    net = Model(inputs=[frames_in], outputs=scores)
    print(net.summary())
    sgd = tf.keras.optimizers.SGD(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=sgd,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

# Hyper-parameters for the audio frame-level run; build_mod() reads
# learning_rate and loss from here.
wandb.config={
  "learning_rate": 0.001,
  "epochs": 800,
  "batch_size": 1000,
  "architecture": "Frame_level_pooling",
  "loss":'categorical_crossentropy',
}
# Take the features straight from the per-modality arrays (x1_* = audio,
# x2_* = rgb) and keep X_train / X_val untouched: indexing X_train[0] after an
# earlier cell rebound X_train would pick a single video, not the audio split.
X_train_audio,X_val_audio,y_train,y_val=prepare_input(input_type = "audio",X_rgb_train=x2_train,X_audio_train = x1_train, y_train=Y_train,
                     X_rgb_val=x2_val,X_audio_val= x1_val,y_val=Y_val)
model_audio = build_mod()
callback = EarlyStopping(monitor='loss', patience=10)
model_audio.fit(X_train_audio,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_audio,y_val),callbacks=[callback,ValLog(X_val_audio,num_log_batches=20,)])


In [None]:
# Write the weights to disk and read them back, then score the test set on
# the audio features (X_test[0]).
model_audio.save_weights('weights.h5')
model_audio.load_weights('weights.h5')
y_predproba_test = model_audio.predict(X_test[0], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)
# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [29]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_audio = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))

In [32]:
#pred_df.to_pickle('data/framelevel_audio.pkl')
#pred_df_raw_audio.to_pickle('data/framelevel_audio_raw.pkl')

# DBOF MODEL for RGB+AUDIO

In [None]:
# Close the current W&B run and open a fresh one for the combined model.
# The next cell trains with the ValLog callback, which calls wandb.log and
# therefore needs an active run (same finish/init pattern as the MLP section).
wandb.finish()
wandb.init(project="model-classification", entity="marlhakizi")

In [None]:
def build_mod():
    """Build and compile the two-input frame-level model.

    Input ``x1`` has shape (50, 128) and input ``x2`` has shape (50, 1024);
    each goes through its own ``fc_block`` and the results are concatenated on
    the last axis. The learning rate and loss are read from ``wandb.config``;
    the optimizer is Adam.

    Returns:
        The compiled Keras ``Model`` with a 2000-unit sigmoid output.
    """
    first_in = Input((50,128), name='x1')
    first_branch = fc_block(first_in)
    second_in = Input((50,1024), name='x2')
    second_branch = fc_block(second_in)
    merged = Concatenate(axis=-1)([first_branch, second_branch])
    scores = Dense(2000, activation='sigmoid', name='output')(merged)
    net = Model(inputs=[first_in,second_in], outputs=scores)
    adam = adam_v2.Adam(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=adam,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

# Make the hyper-parameters explicit instead of relying on whatever
# wandb.config an earlier cell left behind. The values repeat the preceding
# audio run's settings. NOTE(review): confirm these are the intended values.
wandb.config={
  "learning_rate": 0.001,
  "epochs": 800,
  "batch_size": 1000,
  "architecture": "Frame_level_pooling",
  "loss":'categorical_crossentropy',
}
# build_mod() defines two inputs (x1: 50x128 audio, x2: 50x1024 rgb), so the
# modalities are fed as an [audio, rgb] list. prepare_input("both") would
# return a single concatenated tensor, which this model cannot take.
X_train_all = [x1_train, x2_train]
X_val_all = [x1_val, x2_val]
y_train, y_val = Y_train, Y_val
model_all = build_mod()
callback = EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True)
model_all.fit(X_train_all,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_all,y_val),callbacks=[callback,ValLog(X_val_all,num_log_batches=20)])


In [None]:
# Write the weights to disk and read them back, then score the test set on
# both modalities ([audio, rgb]).
model_all.save_weights('weights.h5')
model_all.load_weights('weights.h5')
y_predproba_test = model_all.predict(
    [X_test[0], X_test[1]], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)
# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [46]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_all = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))
#pred_df_raw_all.to_pickle('data/framelevel_all_raw.pkl')
#pred_df.to_pickle('data/framelevel_all.pkl')

# MLP Model for AUDIO

In [48]:
def avg_pooling(frame_data):
    """Average each video's frames into a single vector.

    Args:
        frame_data: iterable of per-video frame sequences.

    Returns:
        ``np.ndarray`` with one mean-over-frames row per video.
    """
    # Mean over axis 0, i.e. across the frames of one video.
    per_video_mean = [np.array(frames).mean(axis=0) for frames in frame_data]
    return np.array(per_video_mean)

In [50]:
def process_records(data,tp='test'):
    """Load video-level (frame-averaged) audio/rgb features from ``FOLDER/tp``.

    Same parsing and label filter as the frame-level version (videos are kept
    only when all their label ids are <= 1000), but each video's frames are
    averaged into a single audio vector and a single rgb vector.

    Args:
        data: unused; kept so existing calls keep working.
        tp: name of the sub-folder of ``FOLDER`` to read.

    Returns:
        Tuple of arrays ``(ids, audio, rgb, labels)`` with one mean vector per
        video in ``audio`` / ``rgb`` and one 2000-long int8 multi-hot vector
        per video in ``labels``.
    """
    def _decode(feature):
        # Each frame feature stores its raw bytes as the first bytes_list value.
        return np.array(tf.io.decode_raw(feature.bytes_list.value[0], tf.uint8))

    record_paths = sorted(glob.glob(os.path.join(FOLDER, tp, '*tfrecord')))
    ids, aud, rgbs, lbs = [], [], [], []
    for path in record_paths:
        for raw_record in tf.data.TFRecordDataset(path):
            tf_example = tf.train.SequenceExample()
            tf_example.ParseFromString(raw_record.numpy())
            label_ids = np.array(tf_example.context.feature["labels"].int64_list.value)

            # Drop the video if any of its labels is above 1000.
            if not all(label <= 1000 for label in label_ids):
                continue

            multi_hot = np.zeros(2000).astype(np.int8)
            for label in label_ids:
                multi_hot[label] = 1

            feature_lists = tf_example.feature_lists.feature_list
            rgb_features = feature_lists['rgb'].feature
            audio_features = feature_lists['audio'].feature
            # The rgb list defines the frame count; audio is indexed with it.
            n_frames = len(rgb_features)
            rgb = [_decode(rgb_features[k]) for k in range(n_frames)]
            audio = [_decode(audio_features[k]) for k in range(n_frames)]

            ids.append(tf_example.context.feature["id"].bytes_list.value[0].decode(encoding="UTF-8"))
            lbs.append(multi_hot)

            # Collapse the frame axis: one mean vector per video and modality.
            aud.append(np.mean(np.array(audio), axis=0))
            rgbs.append(np.mean(np.array(rgb), axis=0))

    return np.array(ids), np.array(aud), np.array(rgbs), np.array(lbs)

## Video-level Data Preprocessing

In [None]:
# Re-parse every split with the video-level process_records, which returns
# (ids, audio, rgb, labels): x1_* is audio, x2_* is rgb.
_, x1_val, x2_val, y_val = process_records('validation', 'validation')
_, x1_train, x2_train, y_train = process_records('train', 'train')
idx, x1_test, x2_test, y_test = process_records('test', 'test')
ylabels, ids = process_labels('test', 'test')

y_val_labels, ids1 = process_labels('validation', 'validation')

# Bundle each split as [audio, rgb] plus its targets.
X_train = [x1_train, x2_train]
Y_train = y_train
X_val = [x1_val, x2_val]
Y_val = y_val
X_test = [x1_test, x2_test]
Y_test = y_test

In [None]:
# End the active W&B run, then start a new run in the
# "model-classification" project for the models below.
wandb.finish()
wandb.init(project="model-classification", entity="marlhakizi")

In [None]:
def fc_block(x, n=4096, d=0.2):
    """Fully connected block: Dense(n) -> BatchNorm -> LeakyReLU -> Dropout(d).

    Args:
        x: input Keras tensor.
        n: number of units of the Dense layer.
        d: dropout rate.

    Returns:
        The output tensor of the Dropout layer.
    """
    stack = (Dense(n), BatchNormalization(), LeakyReLU(), Dropout(d))
    for layer in stack:
        x = layer(x)
    return x

In [None]:
def build_mod():
    """Build and compile the single-input MLP with a 128-wide input.

    The input goes through ``fc_block``, a 1024-unit Dense layer with no
    activation argument, and a 2000-unit sigmoid output. The learning rate and
    loss are read from ``wandb.config``; the optimizer is SGD.

    Returns:
        The compiled Keras ``Model``.
    """
    vec_in = Input((128,), name='x1')
    hidden = fc_block(vec_in)
    hidden = Dense(1024)(hidden)
    scores = Dense(2000, activation='sigmoid', name='output')(hidden)
    net = Model(inputs=[vec_in], outputs=scores)
    sgd = tf.keras.optimizers.SGD(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=sgd,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

# Hyper-parameters for the audio MLP run; build_mod() reads learning_rate and
# loss from here.
wandb.config={
  "learning_rate": 0.0001,
  "epochs": 800,
  "batch_size": 300,
  "architecture": "Frame_level_pooling",
  "loss":'categorical_crossentropy',
}
# This model takes a 128-wide input and is scored on X_test[0] (audio), so it
# must be trained on the audio features ("audio", not "rgb"). Features come
# straight from the per-modality arrays (x1_* = audio, x2_* = rgb) and
# X_train / X_val are left untouched for the cells that index them later.
X_train_audio,X_val_audio,y_train,y_val=prepare_input(input_type = "audio",X_rgb_train=x2_train,X_audio_train = x1_train, y_train=Y_train,
                     X_rgb_val=x2_val,X_audio_val= x1_val,y_val=Y_val)
model_dense_audio = build_mod()
callback = EarlyStopping(monitor='loss', patience=10)
model_dense_audio.fit(X_train_audio,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_audio,y_val),callbacks=[callback,ValLog(X_val_audio,num_log_batches=20)])

In [None]:
# Write the weights to disk and read them back, then score the test set on
# the audio features (X_test[0]).
model_dense_audio.save_weights('weights.h5')
model_dense_audio.load_weights('weights.h5')
y_predproba_test = model_dense_audio.predict(X_test[0], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)
# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [67]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_all = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))
#pred_df_raw_all.to_pickle('data/framelevel_mlp_audio_raw.pkl')
#pred_df.to_pickle('data/framelevel_mlp_audio.pkl')

# MLP Model for RGB

In [None]:
def build_mod():
    """Build and compile the single-input MLP with a 1024-wide input.

    The input goes through ``fc_block`` and a 2000-unit sigmoid output. The
    learning rate and loss are read from ``wandb.config``; the optimizer is SGD.

    Returns:
        The compiled Keras ``Model``.
    """
    vec_in = Input((1024,), name='x1')
    hidden = fc_block(vec_in)
    scores = Dense(2000, activation='sigmoid', name='output')(hidden)

    net = Model(inputs=[vec_in], outputs=scores)
    sgd = tf.keras.optimizers.SGD(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=sgd,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

# Hyper-parameters for the rgb MLP run; build_mod() reads learning_rate and
# loss from here.
wandb.config={
  "learning_rate": 0.0001,
  "epochs": 800,
  "batch_size": 300,
  "architecture": "Frame_level_pooling",
  "loss":'categorical_crossentropy',
}
# This model takes a 1024-wide input and is scored on X_test[1] (rgb), so it
# must be trained on the rgb features ("rgb", not "audio"). Features come
# straight from the per-modality arrays (x1_* = audio, x2_* = rgb) and
# X_train / X_val are left untouched for the cells that index them later.
X_train_rgb,X_val_rgb,y_train,y_val=prepare_input(input_type = "rgb",X_rgb_train=x2_train,X_audio_train = x1_train, y_train=Y_train,
                     X_rgb_val=x2_val,X_audio_val= x1_val,y_val=Y_val)
model_rgb = build_mod()
callback = EarlyStopping(monitor='loss', patience=10)
# ValLog requires the validation features as its first argument.
model_rgb.fit(X_train_rgb,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_rgb,y_val),callbacks=[callback,ValLog(X_val_rgb,num_log_batches=20)])

In [None]:
# Write the weights to disk and read them back, then score the test set on
# the rgb features (X_test[1]).
model_rgb.save_weights('weights.h5')
model_rgb.load_weights('weights.h5')
y_predproba_test = model_rgb.predict(X_test[1], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)
# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [73]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_all = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))
#pred_df_raw_all.to_pickle('data/framelevel_mlp_rgb_raw.pkl')
#pred_df.to_pickle('data/framelevel_mlp_rgb.pkl')

# MLP Model for AUDIO+RGB

In [None]:
def build_mod():
    """Build and compile the two-input MLP.

    A 128-wide input and a 1024-wide input are concatenated on the last axis,
    passed through ``fc_block`` and a 2000-unit sigmoid output. The learning
    rate and loss are read from ``wandb.config``; the optimizer is SGD.

    Returns:
        The compiled Keras ``Model``.
    """
    first_in = Input((128,))
    second_in = Input((1024,))
    merged = Concatenate(axis=-1)([first_in, second_in])

    hidden = fc_block(merged)
    scores = Dense(2000, activation='sigmoid', name='output')(hidden)

    net = Model(inputs=[first_in,second_in], outputs=scores)
    sgd = gradient_descent_v2.SGD(learning_rate=wandb.config['learning_rate'])
    net.compile(
        optimizer=sgd,
        loss=wandb.config['loss'],
        metrics=[{"Top1 Accuracy":TopKCategoricalAccuracy(k=1)}],
    )
    return net

# Hyper-parameters for the combined audio+rgb MLP run; build_mod() reads
# learning_rate and loss from here.
wandb.config={
  "learning_rate": 0.01,
  "epochs": 800,
  "batch_size": 300,
  "architecture": "Frame_level_pooling",
  "loss":tf.keras.losses.BinaryCrossentropy()

}
# build_mod() defines two inputs (128-wide audio, 1024-wide rgb), so the
# modalities are fed as an [audio, rgb] list. prepare_input("both") would
# return a single concatenated tensor, which this model cannot take.
X_train_all = [x1_train, x2_train]
X_val_all = [x1_val, x2_val]
y_train, y_val = Y_train, Y_val
model_all = build_mod()
callback = EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True)
model_all.fit(X_train_all,y_train,epochs=wandb.config['epochs'],batch_size=wandb.config['batch_size'],
              validation_data = (X_val_all,y_val),callbacks=[callback,ValLog(X_val_all,num_log_batches=20)])
# Evaluate this model before logging; otherwise `accuracy` is a stale value
# left over from an earlier model's evaluation.
loss,accuracy=model_all.evaluate([X_test[0],X_test[1]],Y_test)
wandb.log({"Test HitAt1 Accuracy": round(accuracy*100,2)})

In [None]:
# Write the weights to disk and read them back, then score the test set on
# both modalities ([audio, rgb]).
model_all.save_weights('weights.h5')
model_all.load_weights('weights.h5')
y_predproba_test = model_all.predict(
    [X_test[0], X_test[1]], batch_size=10, verbose=1)
(gAP_test, PERR_test, HIT1_test, F1_test) = report_performance(
    y_predproba_test, Y_test, verbose=True, thresh=0.5)
# Top-5 predictions per test video.
pred_df = make_top_n_pred_df(
    ids, y_predproba_test, ylabels, top_n_pred=5, get_names=False)

In [None]:
# One row per test video: id, int32 target vector and predicted probabilities.
pred_df_raw_all = pd.DataFrame(dict(
    pseudo_id=ids,
    y_true=tf.cast(Y_test, tf.int32).numpy().tolist(),
    y_predproba=y_predproba_test.tolist(),
))
# pred_df_raw_all.to_pickle('data/framelevel_mlp_all_raw.pkl')
# pred_df.to_pickle('data/framelevel_mlp_all.pkl')