In [1]:
# Data processing
import os
import pickle
import json
import pandas as pd
import numpy as np
import sys

# Multiprocessing
import multiprocessing
import tqdm

# Root folder of the EGOCOM dataset. Every location below lives underneath it
# and keeps its trailing slash, because file names are appended to these
# strings by plain concatenation later in the notebook.
egocom_loc = "/datasets/cgn/EGOCOM/"
visual_loc = os.path.join(egocom_loc, "raw_features/egocom_audio_video_features/visual/")
audio_loc = os.path.join(egocom_loc, "raw_features/egocom_audio_video_features/audio/")
voxaudio_loc = os.path.join(egocom_loc, "raw_features/voxceleb_audio_features/")

In [2]:
def process(x):
    '''Parallelized multiprocessing helper: convert the string representation
    of the features to np.array for each row in one per-video dataframe.

    Parameters
    ----------
    x : tuple
        A (key, dataframe) pair as produced by iterating over
        DataFrame.groupby(...). The key is not used. The dataframe must have
        a `video_feature` column whose values are strings holding a Python
        literal list of numbers, e.g. "[0.1, 0.2]".

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same index and columns in which every
        `video_feature` value is a float32 np.array. The dataframe passed in
        is left untouched.

    Raises
    ------
    ValueError, SyntaxError
        If a `video_feature` string is not a plain Python literal.
    '''
    # Imported here so the helper is self-contained when it runs in a worker.
    import ast

    _key, data = x
    # ast.literal_eval only accepts literals, so text coming from the feature
    # file can never be executed as code (eval would run it).
    parsed = data.video_feature.apply(
        lambda s: np.array(ast.literal_eval(s), dtype=np.float32)
    )
    # assign() builds a new frame instead of writing into the group slice that
    # was handed in.
    return data.assign(video_feature=parsed)

def process_vox(z, history_len=None):
    '''Parallelized multiprocessing helper: parse one tab-separated line of a
    voxceleb audio feature file.

    Parameters
    ----------
    z : str
        One line with at least three tab-separated fields: a path, an integer
        and the string representation of a list of numbers.
    history_len : int, optional
        Value subtracted from the clip index. When omitted, the notebook-level
        variable `history_length` is used, so that variable must already be
        set in the process that runs this function.

    Returns
    -------
    tuple
        (video_id, clip_id, feature) where
        video_id is characters 4..6 of the last path component, as an int
            (presumably file names look like "vid_103..." -- TODO confirm),
        clip_id is int(second field) // 1000 - history_len
            (presumably the second field is in milliseconds -- TODO confirm),
        feature is the third field as a float32 np.array.

    Raises
    ------
    ValueError, SyntaxError
        If the feature field is not a plain Python literal.
    '''
    # Imported here so the helper is self-contained when it runs in a worker.
    import ast

    if history_len is None:
        # Backward-compatible default: read the notebook's loop variable.
        history_len = history_length

    fields = z.split('\t')
    video_id = int(fields[0].split("/")[-1][4:7])
    clip_id = int(fields[1]) // 1000 - history_len
    # literal_eval only accepts literals, so file content is never executed.
    # strip() drops the trailing newline that readlines() leaves on each line.
    feature = np.array(ast.literal_eval(fields[2].strip()), dtype=np.float32)
    return (video_id, clip_id, feature)

In [3]:
# Load the per-video metadata table that sits at the root of the dataset.
video_info = pd.read_csv(os.path.join(egocom_loc, "video_info.csv"))

In [9]:
# Preview the first rows of the metadata table read from video_info.csv.
video_info.head()

Unnamed: 0,video_id,conversation_id,video_speaker_id,num_speakers,speaker_name,speaker_gender,duration_seconds,word_count,speaker_is_host,tokenized_words,native_speaker,video_name,background_fan,background_music,cid,train,val,test
0,1,day_1__con_1__part1,1,3,curtis,male,275,488,True,Okay So I have some topics in my hand and I ll...,True,vid_001__day_1__con_1__person_1_part1,False,False,day_1__con_1,True,False,False
1,2,day_1__con_1__part2,1,3,curtis,male,295,446,True,That s good Alright that was one You have to T...,True,vid_002__day_1__con_1__person_1_part2,False,False,day_1__con_1,True,False,False
2,3,day_1__con_1__part3,1,3,curtis,male,295,497,True,Is it actually Dude that s like a platter to s...,True,vid_003__day_1__con_1__person_1_part3,False,False,day_1__con_1,True,False,False
3,4,day_1__con_1__part4,1,3,curtis,male,295,409,True,This might be too hard One to three Any whatev...,True,vid_004__day_1__con_1__person_1_part4,False,False,day_1__con_1,True,False,False
4,5,day_1__con_1__part5,1,3,curtis,male,83,142,True,I m pretty sure when you guys say kids you re ...,True,vid_005__day_1__con_1__person_1_part5,False,False,day_1__con_1,True,False,False


In [7]:
# Shape of the array of distinct speaker_name values, i.e. a 1-tuple holding
# how many different values the column contains.
video_info['speaker_name'].unique().shape

(29,)

In [4]:
# Build one combined feature table per history length and write it to disk.
# For every history length the cell
#   1. turns the per-conversation speaker labels into one row per
#      (video_id, clip_id),
#   2. loads the visual, text, (optional) audio and voxceleb audio features,
#   3. expands array-valued feature columns into one column per dimension,
#   4. inner-joins the enabled tables on (video_id, clip_id) and saves a csv.gz.

# Which feature tables are joined into the output.
include_audio = False
include_video = True
include_text = True
include_voxceleb_audio = True


def _expand_feature_column(df, column, prefix):
    '''Replace the array-valued `column` of `df` by one scalar column per
    array element and return the result as a new dataframe.

    The new columns are named `prefix` followed by the element position
    (prefix + "0", prefix + "1", ...) and are placed after the remaining
    columns of `df`. All arrays in `column` must have the same length;
    np.stack raises ValueError otherwise.'''
    stacked = np.stack(df[column].tolist())
    expanded = pd.DataFrame(
        stacked,
        index=df.index,
        columns=[prefix + str(i) for i in range(stacked.shape[1])],
    )
    return pd.concat([df.drop(column, axis=1), expanded], axis=1)


# Use half of the cores, but always at least one worker: on a single-core
# machine cpu_count() // 2 is 0 and multiprocessing.Pool(0) raises ValueError.
n_workers = max(1, multiprocessing.cpu_count() // 2)

# get labels (the file does not depend on history_length, so it is read once)
label_loc = egocom_loc + "raw_features/speaker_labels/raw_audio_speaker_labels_1.json"
with open(label_loc, 'r') as rf:
    label_dict = json.load(rf)

for history_length in [4, 5, 10, 30]:
    print("\n\nHistory length: {}\n\n".format(history_length))

    ################
    # Prepare labels
    ################

    # Process labels
    result = []
    for conversation_id, conv_df in video_info.groupby('conversation_id'):
        conv_labels = label_dict[conversation_id]

        # One row per label entry, one boolean column per speaker id 1..3.
        label_matrix = []
        for s in conv_labels:
            one_hot = [0, 0, 0]
            # Only ids >= 1 name a speaker. Testing `s >= 0` would let a label
            # of 0 index position -1 and wrongly mark speaker 3.
            if s >= 1:
                one_hot[s-1] = 1
            label_matrix.append(one_hot)
        label_matrix = np.array(label_matrix).astype(bool)

        for i, row in conv_df.iterrows():
            video_id = row["video_id"]
            video_speaker_id = int(row["video_speaker_id"])

            # Advance speaker labels to match history_length start time
            binary_speaker_labels = label_matrix[history_length:, video_speaker_id - 1]
            multiclass_speaker_labels = conv_labels[history_length:]
            assert(len(binary_speaker_labels) == len(multiclass_speaker_labels))

            # Prepare speaker label data.
            n_clips = len(binary_speaker_labels)
            label_df = pd.DataFrame({
                "video_id" : [video_id]*n_clips,
                "clip_id" : list(range(n_clips)),
                "video_speaker_id" : [video_speaker_id]*n_clips,
                "is_speaking" : binary_speaker_labels,
                "multiclass_speaker_label" : multiclass_speaker_labels,
            })
            result.append(label_df)
    speaker_labels = pd.concat(result)

    ############################
    # Processing visual features
    ############################

    fn_base = 'egocom_featex_visual_480p_32fp_sliding_window_1_sec_clip_duration_{}_sec.tsv'
    fn_base2 = 'egocom_featex_round_2_visual_480p_32fp_sliding_window_1_sec_clip_duration_{}_sec.tsv'
    fn_base3 = 'egocom_featex_round_3_visual_480p_32fp_sliding_window_1_sec_clip_duration_{}_sec.tsv'
    hl = history_length
    vfn1, vfn2, vfn3 = fn_base.format(hl), fn_base2.format(hl), fn_base3.format(hl)

    # Get video data
    video_df1 = pd.read_csv(visual_loc + vfn1, sep='\t').drop('feature_name', axis = 1)
    video_df2 = pd.read_csv(visual_loc + vfn2, sep='\t').drop('feature_name', axis = 1)
    video_df3 = pd.read_csv(visual_loc + vfn3, sep='\t').drop('feature_name', axis = 1)
    # Video 87 is dropped from the first file before the three files are
    # combined (presumably it was re-extracted in a later round -- TODO confirm).
    video_df = pd.concat([video_df1[video_df1["video_id"] != 87], video_df2, video_df3])
    video_df = video_df.sort_values(by=['video_id', 'clip_id']).reset_index(drop=True)
    video_df.columns = ["video_id", "clip_id", "video_feature"]

    # Convert string representation of features to a numpy array
    jobs = list(video_df.groupby('video_id'))
    with multiprocessing.Pool(n_workers) as p:
        visual_features = list(tqdm.tqdm(p.imap(process, jobs), total=len(jobs)))

    # Remove videos that are shorter than the history length.
    # The label count is looked up by video_id rather than by list position,
    # so the check stays correct even if the set of videos with visual
    # features differs from the set of videos with labels.
    speaker_label_counts = speaker_labels.groupby('video_id')["clip_id"].count()
    visual_features = [
        vf for vf in visual_features
        if min(len(vf), speaker_label_counts.get(vf["video_id"].iloc[0], 0)) > history_length
    ]

    # Convert to a pandas dataframe
    video_features = pd.concat(visual_features)

    # Remove memory that is no longer being used.
    del video_df1, video_df2, video_df3, video_df, visual_features


    ########################
    # Fetching text features
    ########################

    rfn = egocom_loc + "raw_features/text_features/egocom_text_window_1_sec_clip_duration_{}_sec.csv.gz".format(history_length)
    text_features = pd.read_csv(rfn)


    ###########################
    # Processing audio features
    ###########################

    if include_audio:
        fn_base = 'egocom_featex_audio_480p_32fp_sliding_window_1_sec_clip_'
        afn = fn_base + 'duration_{}_sec.pkl'.format(history_length)
        rfn = os.path.join(audio_loc, afn)
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files from a trusted source.
        with open(rfn, 'rb') as rf:
            audio_data = pickle.load(rf, encoding='latin1')

        # map data to id and transform to dataframe
        audio_features = dict(zip(list(audio_data.values())[1], list(audio_data.values())[0][:,:,0,0]))
        audio_features = pd.DataFrame(pd.Series(audio_features), columns = ['audio_feature']).reset_index()
        # The id is split as video_id * 1000000 + clip_id.
        audio_features["clip_id"] = audio_features["index"] % 1000000
        audio_features["video_id"] = audio_features["index"] // 1000000
        audio_features = audio_features[["video_id", "clip_id", "audio_feature"]]
        audio_features = audio_features.sort_values(by=["video_id", "clip_id"]).reset_index(drop=True)


    ####################################
    # Processing voxceleb audio features
    ####################################

    with open(os.path.join(voxaudio_loc, "{}s_history.tsv".format(history_length)), 'r') as rf:
        jobs = rf.readlines()

    # Convert string representation of features to a numpy array.
    # process_vox reads the loop variable `history_length` as a global, so the
    # pool has to be created here, inside the loop, after it has been set.
    with multiprocessing.Pool(n_workers) as p:
        voxceleb_audio_features = list(tqdm.tqdm(p.imap(process_vox, jobs), total=len(jobs)))

    vox_features = pd.DataFrame(voxceleb_audio_features, columns = ['video_id', 'clip_id', 'voxceleb_feature'])
    vox_features = vox_features.sort_values(by=["video_id", "clip_id"]).reset_index(drop=True)


    ####################
    # Combining features
    ####################

    # Expand every array-valued feature column into one column per dimension.
    video_features = _expand_feature_column(video_features, "video_feature", 'videofeat_')
    if include_audio:
        audio_features = _expand_feature_column(audio_features, "audio_feature", 'audiofeat_')
    vox_features = _expand_feature_column(vox_features, "voxceleb_feature", 'voxaudiofeat_')

    # Report the shape of every table that takes part in the join.
    tables = [("speaker_labels", speaker_labels)]
    if include_video:
        tables.append(("video_features", video_features))
    if include_voxceleb_audio:
        tables.append(("vox_features", vox_features))
    if include_audio:
        tables.append(("audio_features", audio_features))
    if include_text:
        tables.append(("text_features", text_features))
    for kind, table in tables:
        print(kind, "shape:", table.shape)

    # Always start from this iteration's labels. Without this, switching
    # include_video off would either fail with NameError or silently merge
    # onto the `features` left over from the previous history length.
    features = speaker_labels.copy()
    if include_video:
        features = pd.merge(features, video_features, on=["video_id", "clip_id"])
    if include_voxceleb_audio:
        features = pd.merge(features, vox_features, on=["video_id", "clip_id"])
    if include_audio:
        features = pd.merge(features, audio_features, on=["video_id", "clip_id"])
    if include_text:
        features = pd.merge(features, text_features, on=["video_id", "clip_id"]).astype(np.float32)

    # Identifier and label columns are stored as integers.
    for col in ['video_id', 'clip_id', 'video_speaker_id', 'is_speaking', 'multiclass_speaker_label']:
        features[col] = features[col].astype(int)

    print('Combined features shape:', features.shape)
    out_dir = egocom_loc + "egocom_features/{}".format("" if include_audio else "no_audio/")
    # Create the output folder on first use instead of failing in to_csv.
    os.makedirs(out_dir, exist_ok=True)
    wfn = out_dir + "egocom_features_history_{}sec.csv.gz".format(history_length)
    features.to_csv(wfn, index = False)



History length: 30




100%|██████████| 175/175 [00:53<00:00,  4.11it/s]
100%|██████████| 133385/133385 [00:10<00:00, 12709.29it/s]


speaker_labels shape: (133579, 5)
video_features shape: (179799, 2050)
vox_features shape: (133385, 514)
text_features shape: (132517, 302)
Combined features shape: (129245, 2865)


# Spot checks by hand just to double check the data is all there

In [36]:
# video_id that the spot-check cells below look up in each table.
example_video_id = 103

In [38]:
# First 7 rows of the combined table for the example video. `features` is the
# table left behind by the last iteration of the feature-building loop.
features[features["video_id"] == example_video_id].head(7)

Unnamed: 0,video_id,clip_id,video_speaker_id,is_speaking,multiclass_speaker_label,videofeat_0,videofeat_1,videofeat_2,videofeat_3,videofeat_4,...,textfeat_290,textfeat_291,textfeat_292,textfeat_293,textfeat_294,textfeat_295,textfeat_296,textfeat_297,textfeat_298,textfeat_299
16564,103,34,3,1,3,0.15648,0.477414,0.239192,0.163075,0.04673,...,-0.000541,0.009074,-0.007822,-0.011873,-0.006985,0.048612,-0.006977,-0.003371,-0.018853,-0.008745
16565,103,121,3,0,2,0.347946,0.670336,0.394521,0.145161,0.185763,...,-0.007709,0.003957,-0.040048,-0.004995,-0.011025,0.060045,0.008032,-0.001023,-0.001839,-0.005079
16566,103,127,3,0,1,0.264372,0.742878,0.463978,0.105797,0.097891,...,-0.007171,0.002314,-0.054292,-0.002222,-0.010692,0.063628,0.001039,-0.000278,-0.007424,-0.003511
16567,103,185,3,1,3,0.170867,0.721446,0.333568,0.193393,0.066277,...,-0.005281,0.009412,-0.032269,-0.006288,-0.009438,0.059162,0.008014,0.001651,-0.003917,-0.008694
16568,103,199,3,1,3,0.048776,0.562371,0.38872,0.14969,0.042304,...,-0.007198,0.009831,-0.034188,-0.007052,-0.00922,0.06652,0.007192,0.000513,-0.008346,-0.008864
16569,103,228,3,1,3,0.141822,0.537018,0.309961,0.132002,0.024338,...,-0.009802,0.008404,-0.03502,-0.004249,-0.010108,0.060781,0.004177,0.001874,-0.011867,-0.007469
16570,103,248,3,1,3,0.117398,0.194729,0.194117,0.177627,0.006712,...,-0.009472,0.006511,-0.026374,-0.005072,-0.01233,0.057954,0.00387,-0.003944,-0.01024,-0.004522


In [39]:
# NOTE(review): this is a positional lookup -- it takes the
# (example_video_id - 1)-th distinct value of the video_id column, which equals
# example_video_id only if the ids start at 1, have no gaps and appear in
# ascending order (TODO confirm). Despite its name, the variable holds a
# video_id value, not an entry of the video_name column.
video_name = video_info["video_id"].unique()[example_video_id - 1]

In [40]:
# conversation_id of the first metadata row whose video_id equals the value
# stored in `video_name`.
example_conv_id = video_info[video_info["video_id"] == video_name]["conversation_id"].iloc[0]

In [41]:
# Ten raw labels of the example conversation, starting after the first
# history_length entries. Both label_dict and history_length are leftovers of
# the feature-building loop, whose last iteration uses history_length = 30.
label_dict[example_conv_id][history_length:history_length+10]

[1, 1, 1, 1, 1, -1, 3, 2, 2, 1]

In [99]:
# NOTE(review): the feature-building loop deletes visual_features
# (`del ... visual_features`) right after building video_features, so this
# cell raises NameError on a fresh Restart & Run All; the output shown here was
# presumably recorded in an earlier session. The positional index
# example_video_id - 1 also assumes that no video was filtered out of the list.
visual_features[example_video_id - 1][:5]

Unnamed: 0,video_id,clip_id,video_feature
52992,103,0,"[0.29031527, 0.38000706, 0.41235313, 0.3192424..."
52993,103,1,"[0.23112316, 0.41078487, 0.23396023, 0.3016702..."
52994,103,2,"[0.29989845, 0.44903848, 0.40882987, 0.2679896..."
52995,103,3,"[0.21006092, 0.46809155, 0.5286632, 0.25458714..."
52996,103,4,"[0.22803378, 0.42719585, 0.38565657, 0.3782850..."


In [117]:
# NOTE(review): audio_features is only created when include_audio is True, but
# the configuration in the feature-building cell sets include_audio = False, so
# this cell raises NameError on a fresh run with the current settings.
audio_features[audio_features["video_id"] == example_video_id][:5]

Unnamed: 0,video_id,clip_id,audiofeat_0,audiofeat_1,audiofeat_2,audiofeat_3,audiofeat_4,audiofeat_5,audiofeat_6,audiofeat_7,...,audiofeat_1078,audiofeat_1079,audiofeat_1080,audiofeat_1081,audiofeat_1082,audiofeat_1083,audiofeat_1084,audiofeat_1085,audiofeat_1086,audiofeat_1087
27310,103,5,0.0,0.135431,0.0,0.163185,0.28449,1.017407,0.738017,0.08642,...,0.676555,0.0,1.161103,0.523533,0.283989,0.829357,0.641407,0.316373,0.397134,0.156204
27311,103,10,0.0,0.270538,0.042484,0.338724,0.539095,0.73089,0.620804,0.104097,...,0.653954,0.0,1.041482,0.0,0.248131,0.904516,0.499591,0.391067,0.095162,0.166748
27312,103,33,0.0,0.23672,0.192774,0.372545,0.45342,0.774573,0.615093,0.074882,...,0.947882,0.0,1.161197,0.377865,0.176155,1.020026,0.66816,0.536593,0.621274,0.049073
27313,103,84,0.0,0.757181,0.163958,0.706198,0.359317,0.444866,0.550963,0.50957,...,0.0,0.0,0.50731,0.0,0.455685,0.035783,0.095541,0.0,0.07572,0.392692
27314,103,110,0.0,0.008448,0.267999,0.030694,0.241523,0.921572,0.487302,0.567639,...,0.563751,0.0,1.463946,0.111007,0.103446,0.980496,0.654527,0.564016,0.593797,0.0


In [115]:
# NOTE(review): columns are selected by hard-coded position (2052 to 2055). The
# recorded output shows audiofeat_* columns, which exist only when
# include_audio is True; with the current configuration (include_audio = False)
# these positions hold different columns.
features[features["video_id"] == example_video_id].iloc[:5,2052:2056]

Unnamed: 0,audiofeat_0,audiofeat_1,audiofeat_2,audiofeat_3
22139,0.0,0.135431,0.0,0.163185
22140,0.0,0.270538,0.042484,0.338724
22141,0.0,0.23672,0.192774,0.372545
22142,0.0,0.757181,0.163958,0.706198
22143,0.0,0.008448,0.267999,0.030694


In [124]:
# Text features of the example video at row positions 5, 10, 33, 84 and 110 of
# its slice (positions, not clip_id labels; in the recorded output they
# coincide with the clip_id values).
text_features[text_features["video_id"] == example_video_id].iloc[[5,10,33,84,110]].iloc[:5]

Unnamed: 0,video_id,clip_id,textfeat_0,textfeat_1,textfeat_2,textfeat_3,textfeat_4,textfeat_5,textfeat_6,textfeat_7,...,textfeat_290,textfeat_291,textfeat_292,textfeat_293,textfeat_294,textfeat_295,textfeat_296,textfeat_297,textfeat_298,textfeat_299
40103,103,5,-0.005515,0.006285,0.013993,-0.000482,-0.018979,0.021974,-0.014417,0.007278,...,0.00371,0.03319,0.031289,-0.022179,-0.009255,0.029306,-0.0006,-0.012099,-0.006461,0.002132
40108,103,10,0.000494,-0.00937,0.055845,-0.009622,-0.00082,-0.022827,0.058497,0.013719,...,-0.00564,0.004532,0.007862,0.000463,-0.015229,0.051811,-0.024427,-0.01215,-0.031281,-0.004966
40131,103,33,0.002469,0.011267,0.095266,-0.020928,0.012211,-0.058161,0.069665,-0.001591,...,-0.014657,0.010207,-0.007963,0.003028,-0.014595,0.047441,-0.016477,0.003657,-0.01566,-0.013429
40182,103,84,0.007324,0.068344,-0.000591,-0.001605,-0.030067,-0.010131,0.104319,0.035723,...,0.012106,-0.002328,-0.025645,-0.002793,0.013141,0.074327,-0.000921,0.006707,0.008553,-0.005904
40208,103,110,-0.010155,-0.029877,0.10697,-0.002043,0.012839,-0.014283,0.106064,0.005318,...,-0.005151,0.014638,-0.103219,-0.019736,-0.015837,0.059858,0.011622,-0.00508,-0.01218,-0.009802
