In [1]:
import os
import json
import csv
import sys

import torch
import numpy as np
import random
from torch.utils.data import DataLoader, Dataset
from tfrecord.torch.dataset import TFRecordDataset, MultiTFRecordDataset

def get_all_children(category,aso):
    """Return the nested name tree below ``category``.

    Args:
        category: key into ``aso`` whose ``"child_ids"`` are expanded.
        aso: mapping of id -> record; each record has a ``"name"`` and may
            have a ``"child_ids"`` list.

    Returns:
        A list with one dict per child, ``{"name": ...}`` plus a
        ``"children"`` entry (built recursively) whenever the child record
        carries a ``"child_ids"`` key. ``None`` when ``category`` has no
        children, so a child with an empty ``"child_ids"`` list ends up with
        ``"children": None``.
    """

    def describe(child_id):
        # One tree entry: the name first, then the subtree if the key exists.
        record = aso[child_id]
        entry = {"name": record["name"]}
        if "child_ids" in record:
            entry["children"] = get_all_children(child_id, aso)
        return entry

    subtree = [describe(child_id) for child_id in aso[category]["child_ids"]]
    return subtree if subtree else None


def preprocess_ontology(data_dir):
    """Load ``metadata/ontology.json`` and index it by category id.

    Args:
        data_dir: directory that contains the ``metadata`` folder. A trailing
            slash is accepted but no longer required.

    Returns:
        A tuple ``(ont, higher_categories)``:

        * ``ont`` maps each category id to a dict with the keys ``"name"``,
          ``"restrictions"``, ``"child_ids"`` (copied from the JSON record)
          and ``"parents_ids"`` (ids of every category listing it as a child,
          in file order).
        * ``higher_categories`` lists, in file order, the ids that no other
          category lists as a child.

    Raises:
        OSError: if the ontology file cannot be opened.
        KeyError: if a record lacks one of the copied keys, or a child id
            does not appear as a category in the file.
    """
    ontology_path = os.path.join(data_dir, 'metadata/ontology.json')
    # The context manager closes the file even when json.load raises.
    with open(ontology_path) as f:
        ont_data = json.load(f)

    ont = {
        category["id"]: {
            "name": category["name"],
            "restrictions": category["restrictions"],
            "child_ids": category["child_ids"],
            "parents_ids": [],
        }
        for category in ont_data
    }

    # Invert the child links so every node knows its parents.
    for category_id, entry in ont.items():
        for child_id in entry["child_ids"]:
            ont[child_id]["parents_ids"].append(category_id)

    # Higher categories are the ones without parents.
    higher_categories = [
        category_id for category_id, entry in ont.items() if not entry["parents_ids"]
    ]

    return ont, higher_categories
        
def get_all_parents(id_, ont, parents_names):
    """Append the names of ``id_`` and of all its ancestors to ``parents_names``.

    Every parent listed in ``ont[id_]["parents_ids"]`` is followed
    recursively (the previous version only ever followed the first parent,
    once per listed parent), so nodes reachable through several parents
    contribute all of their ancestors. Ancestors are appended before the node
    itself; a name can appear more than once when paths share an ancestor.

    Args:
        id_: category id to start from.
        ont: mapping id -> record with ``"name"`` and ``"parents_ids"`` keys,
            as produced by ``preprocess_ontology``.
        parents_names: list that is extended in place.

    Returns:
        None. The result is delivered through ``parents_names``.
    """
    for parent_id in ont[id_]["parents_ids"]:
        get_all_parents(parent_id, ont, parents_names)

    parents_names.append(ont[id_]["name"])

In [None]:
# Build the 'bal_train' split: for every TFRecord example collect its embedding
# frames plus its level-1 / level-2 ontology label indices, then save the
# three index-aligned lists under ./data/.
data_dir = './'
train_features_dir = 'audioset_v1_embeddings/bal_train/'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id) and
# column 2 (display name). Every row is stored, including a header row if the
# file has one.
index_to_id = {}
index_to_classname = {}
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    for rows in csv.reader(file):
        index_to_id[rows[0]] = rows[1]
        index_to_classname[rows[0]] = rows[2]


# Every directory entry is treated as a .tfrecord file
tfrecord_files = os.listdir(data_dir + train_features_dir)

# Data/class arrays (one entry per kept example, index-aligned)
train_class_1 = []
train_class_2 = []
sounds_data = []

# TFRecord attributes
context_description = {"video_id": "byte", "labels": "int"}
sequence_description = {"audio_embedding": "byte"}

ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)

# class_2: names of the parentless ontology nodes.
# class_1: names of their direct children.
class_2 = [ont[id_]['name'] for id_ in highest_category]
class_1 = [
    ont[child_id]['name']
    for id_ in highest_category
    for child_id in ont[id_]['child_ids']
]

class_1 = np.unique(class_1)
# Drops the name at sorted position 29.
# NOTE(review): which class this is, and why it is excluded, is not visible
# in this notebook - confirm before changing the ontology file.
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

print(class_1)
print(class_2)

class_to_index_1 = {name: i for i, name in enumerate(class_1)}
class_to_index_2 = {name: i for i, name in enumerate(class_2)}
count = 0

# Loop-invariant lookups, built once instead of once per example
class_1_set = set(class_1)
class_2_set = set(class_2)

# Load each tfrecord file
for filename in tfrecord_files:

    tfrecord_path = data_dir + train_features_dir + filename
    dataset = TFRecordDataset(tfrecord_path, index_path=None, description=context_description, sequence_description=sequence_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1)

    for data in loader:
        # data[0] carries the context features, data[1] the sequence features

        # Names of every labelled class and of all of its ancestors
        parent_names = []
        for i in data[0]['labels'][0].numpy():
            get_all_parents(index_to_id[str(i)], ont, parent_names)
        parent_names = set(parent_names)

        file_data = [t.numpy() for t in data[1]['audio_embedding']]
        if not file_data:
            # Nothing to pad from. The previous code padded with the loop
            # variable left over from an earlier example (or raised NameError
            # on the very first one), so such examples are skipped instead.
            continue

        # Pad short clips to 10 frames by repeating their own last frame
        file_data.extend([file_data[-1]] * (10 - len(file_data)))

        target_1_label = [class_to_index_1[name] for name in class_1_set & parent_names]
        target_2_label = [class_to_index_2[name] for name in class_2_set & parent_names]

        train_class_1.append(np.asarray(target_1_label))
        train_class_2.append(np.asarray(target_2_label))

        sounds_data.append(np.concatenate(file_data))

        count += 1

print(len(train_class_1))
print(len(train_class_2))
print(len(sounds_data))


# Save all data files (object arrays because the per-example entries are ragged)
save_dir = './data/'
os.makedirs(save_dir, exist_ok=True)  # avoid losing the run to a missing folder
np.save(save_dir + 'audioset_train_data.npy', np.asanyarray(sounds_data, dtype=object))
np.save(save_dir + 'audioset_train_labels_1.npy', np.asanyarray(train_class_1, dtype=object))
np.save(save_dir + 'audioset_train_labels_2.npy', np.asanyarray(train_class_2, dtype=object))


In [None]:
# Quick sanity checks on the label lists built in the extraction cell.
print(len(class_1))
print(len(class_2))

# Per-class example counts. train_class_1/2 hold variable-length arrays of
# class indices, so np.sum(..., axis=0) over them is not a per-class total
# (and fails on ragged input); counting the flattened indices is.
# astype(int) is needed because empty label arrays default to float.
print(np.bincount(np.concatenate(train_class_1).astype(int), minlength=len(class_1)))
print(np.bincount(np.concatenate(train_class_2).astype(int), minlength=len(class_2)))

print(train_class_1[2])
print(len(sounds_data))


# print(index_to_classname)

In [None]:
# Save the label lists on their own. They are ragged (one variable-length
# index array per example), so they are wrapped as object arrays explicitly,
# matching how the extraction cell saves them.
save_dir = './data/'
np.save(save_dir + 'audioset_labels_1.npy', np.asanyarray(train_class_1, dtype=object))
np.save(save_dir + 'audioset_labels_2.npy', np.asanyarray(train_class_2, dtype=object))
print(len(train_class_1))

In [None]:
# Extract embeddings and two-level labels from the 'unbal_train' TFRecords into
# sounds_data / val_class_1 / val_class_2. Nothing is written to disk here.
data_dir = './'
val_features_dir = 'audioset_v1_embeddings/unbal_train/'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id)
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    reader = csv.reader(file)
    index_to_id = {rows[0]:rows[1] for rows in reader}        

# List of all .tfrecord files (every directory entry is used as-is)
tfrecord_files = os.listdir(data_dir + val_features_dir)

# Data/class arrays
val_class_1 = []
val_class_2 = []

# TFRecord attributes
context_description = {"video_id": "byte", "labels": "int"}
sequence_description = {"audio_embedding": "byte"}

ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)
#class_2_names = [ont[id_]['name'] for id_ in highest_category]

# class_2: names of the parentless ontology nodes;
# class_1: names of their direct children.
class_1 = []
class_2 = []
for id_ in highest_category:
    class_2.append(ont[id_]['name'])
    for id1_ in ont[id_]['child_ids']:
        class_1.append(ont[id1_]['name'])


class_1 = np.unique(class_1)
# Drops the name at sorted position 29.
# NOTE(review): the reason for excluding that class is not visible here - confirm.
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

class_to_index_1 = {class_1[i]:i for i in range(len(class_1))}
class_to_index_2 = {class_2[i]:i for i in range(len(class_2))}
count = 0

sounds_data = []

# Load each tfrecord file
for filename in tfrecord_files:

    tfrecord_path = data_dir + val_features_dir + filename
    dataset = TFRecordDataset(tfrecord_path, index_path=None, description=context_description, sequence_description=sequence_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1)

    for data in iter(loader):
        # data_labels is computed but not read anywhere else in this cell
        data_labels = data[0]['labels'].tolist()[0]
        data_labels = [int(i) for i in data_labels]

        # Collect class labels: names of each labelled class and its ancestors
        parent_names = []
        # print(data_labels)
        for i in data[0]['labels'][0].numpy():
            get_all_parents(index_to_id[str(i)], ont, parent_names)
        parent_names = np.unique(parent_names)
        
        # Set intersection: the order of the resulting labels is not defined
        class_1_label = list(set(class_1) & set(parent_names))
        class_2_label = list(set(class_2) & set(parent_names))
        
        # target_1_label = np.zeros((len(class_1), ))
        # target_2_label = np.zeros((len(class_2), ))
        target_1_label = []
        target_2_label = []
            
        file_data = []

        # Collect feature vectors/classes
        # Examples whose labels resolve to no ontology name are skipped entirely.
        if(len(parent_names) >= 1):
            for t in data[1]['audio_embedding']:
                    file_data.append(t.numpy())
            
            for class_ in class_1_label:
                target_1_label.append(class_to_index_1[class_])
            for class_ in class_2_label:
                target_2_label.append(class_to_index_2[class_])
            
            val_class_1.append(np.asarray(target_1_label))
            val_class_2.append(np.asarray(target_2_label))
            
            # No padding to 10 frames in this cell; np.concatenate raises if
            # file_data is empty.
            sounds_data.append(np.concatenate(file_data))
            count+=1
            
            # print(val_class_1)
            # print(val_class_2)

# val_class_1 = np.concatenate(val_class_1)
# val_class_2 = np.concatenate(val_class_2)
    

In [5]:
# Load the saved 'audioset_strong_unbal_train' arrays, remap their level-1
# labels onto a condensed class list, and build a 0/1 class-membership matrix
# (logits_1) that the next cell samples from.
data_dir = './'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id).
# index_to_id is not read again in this cell.
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    reader = csv.reader(file)
    index_to_id = {rows[0]:rows[1] for rows in reader}

# Data/class arrays
val_class_1 = []
val_class_2 = []


ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)
#class_2_names = [ont[id_]['name'] for id_ in highest_category]

# class_2: names of the parentless ontology nodes;
# class_1: names of their direct children.
class_1 = []
class_2 = []
for id_ in highest_category:
    class_2.append(ont[id_]['name'])
    for id1_ in ont[id_]['child_ids']:
        class_1.append(ont[id1_]['name'])


class_1 = np.unique(class_1)
# Drops the name at sorted position 29 (reason not visible here - TODO confirm)
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

# Positions of class_1 that are kept; 21-24 and 30 are left out, leaving 37 classes.
condensed_idx = [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41]
class_1 = class_1[condensed_idx]

# Lookup table: original class index -> condensed index.
# NOTE(review): entries for the left-out indices (21-24, 30) stay at their
# initial 0, so labels carrying those indices are mapped to condensed class 0
# rather than being dropped - confirm this is intended.
class_to_condensed = np.zeros((42,))
for i in range(len(condensed_idx)):
    class_to_condensed[condensed_idx[i]] = int(i)

# data_dir is re-pointed here; later cells that reuse it see this value.
# allow_pickle=True unpickles object arrays - only load files you trust.
data_dir = './test/audioset_strong/'
sounds_data = np.load(data_dir + 'audioset_strong_unbal_train_data.npy', allow_pickle=True)
class1_index = np.load(data_dir + 'audioset_strong_unbal_train_labels_1.npy', allow_pickle=True)
class2_index = np.load(data_dir + 'audioset_strong_unbal_train_labels_2.npy', allow_pickle=True)

# Flatten the loaded labels along axis 0, then remap each entry through the
# lookup table (the table is float, hence the cast back to int).
# presumably one label array per frame after the concatenate - verify against
# the code that wrote these files
val_class_1 = np.concatenate(class1_index, axis=0)
val_class_1 = [np.asarray(lbl) for lbl in val_class_1]
val_class_1 = [class_to_condensed[lbl].astype('int') for lbl in val_class_1]

val_class_2 = np.concatenate(class2_index, axis=0)
val_class_2 = [np.asarray(lbl) for lbl in val_class_2]

# Target size of the validation subset, split evenly across level-1 classes
num_val = 3000
num_class_1 = len(class_1) 
num_val_per_class = int(num_val / num_class_1)

val_data = []
# Fixes the NumPy global RNG used by np.random.choice in the next cell
np.random.seed(100)

# Despite the name, logits_1 is a 0/1 indicator matrix: row i has ones at the
# class indices listed in val_class_1[i].
# NOTE(review): rows are counted from sounds_data but filled from val_class_1;
# this assumes both are indexed the same way - confirm.
num_clips = len(sounds_data)
logits_1 = np.zeros((num_clips, num_class_1))

for i in range(num_clips):
    logits_1[i][val_class_1[i].astype(int)] = 1 
    
print(val_class_1[:40])

['/m/0dgw9r', '/m/0jbk', '/m/04rlf', '/m/059j3w', '/t/dd00041', '/t/dd00098', '/t/dd00123']
[array([21,  2]), array([21,  2]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([21,  2, 35]), array([19]), array([19]), array([19]), array([19]), array([19]), array([19]), array([19]), array([19]), array([19]), array([19]), array([16]), array([16]), array([16]), array([16]), array([16]), array([16]), array([16]), array([16]), array([16]), array([16]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([ 1, 19]), array([1])]


In [6]:
# Draw a class-balanced validation subset: for every level-1 class pick
# num_val_per_class rows (with replacement) among those flagged in logits_1,
# then save the selected data and the matching label windows.
val_class1_subset = []
val_class2_subset = []
val_data = []

for idx in range(num_class_1):

    idx_match = np.where(logits_1[:, idx] == 1)[0]
    if idx_match.size == 0:
        # No example carries this class. Previously the loop below still ran
        # with val_idx left over from the preceding class (or undefined for
        # the first class), silently duplicating another class's samples.
        continue

    # Uses the NumPy global RNG; its seed is set where logits_1 is built
    val_idx = np.random.choice(idx_match, num_val_per_class)

    for chosen in val_idx:
        # Start of the 10-entry window that contains the chosen row
        class_idx = int(chosen - chosen % 10)

        # NOTE(review): data is taken at the chosen row while labels are taken
        # as the surrounding 10-entry window - confirm both are indexed the
        # way this expects.
        val_data.append(sounds_data[chosen])
        val_class1_subset.append(val_class_1[class_idx:class_idx+10])
        val_class2_subset.append(val_class_2[class_idx:class_idx+10])


# Pad entries shorter than 10 rows by repeating their last row.
# The [-1:] slice keeps the row two-dimensional, so np.repeat stacks whole
# copies of it; repeating the 1-D row element-wise and reshaping (as before)
# produced rows of scrambled values instead.
for i in range(len(val_data)):
    missing = 10 - val_data[i].shape[0]
    if missing > 0:
        val_data[i] = np.concatenate([val_data[i], np.repeat(val_data[i][-1:], missing, axis=0)])

# Save all data files
save_dir = './data/'
os.makedirs(save_dir, exist_ok=True)
np.save(save_dir + 'audioset_strong_val_data.npy', np.asanyarray(val_data, dtype=object))
np.save(save_dir + 'audioset_strong_val_labels_1.npy', np.asanyarray(val_class1_subset, dtype=object))
np.save(save_dir + 'audioset_strong_val_labels_2.npy', np.asanyarray(val_class2_subset, dtype=object))

In [4]:
# Dumps every selected label window (one list of label arrays per sampled row);
# the output is large.
print(val_class1_subset)

[[array([23,  0, 16, 14]), array([14,  0, 16, 27]), array([ 0, 27, 23, 16, 14]), array([ 0, 27, 23, 16, 14]), array([14,  0, 16, 27]), array([14,  0, 16, 27]), array([ 0, 27, 23, 16, 14]), array([ 0, 27, 23, 16, 14]), array([23,  0, 16, 14]), array([23,  0, 16, 14])], [array([ 8, 19]), array([8]), array([8, 0]), array([0]), array([0]), array([0]), array([ 0, 19]), array([0]), array([ 0, 34, 19]), array([34])], [array([0]), array([0]), array([0]), array([0]), array([0]), array([0]), array([ 0, 25]), array([ 0, 25]), array([ 0, 25]), array([ 0, 25])], [array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14]), array([ 0, 16, 14])], [array([14,  0, 19]), array([14,  0, 19]), array([14,  0, 19]), array([14,  0, 19]), array([14,  0, 19]), array([14,  0, 19]), array([14,  0, 19]), array([ 0, 19, 12, 16, 14]), array([ 0, 19, 12, 16, 14]), array([14,  0, 12, 19])], 

In [None]:
# Load the saved 'audioset_strong_bal_train' arrays, remap their level-1 labels
# onto the condensed class list, and re-save everything under ./data/.
data_dir = './'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id).
# index_to_id is not read again in this cell.
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    reader = csv.reader(file)
    index_to_id = {rows[0]:rows[1] for rows in reader}        

# Data/class arrays
val_class_1 = []
val_class_2 = []


ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)
#class_2_names = [ont[id_]['name'] for id_ in highest_category]

# class_2: names of the parentless ontology nodes;
# class_1: names of their direct children.
class_1 = []
class_2 = []
for id_ in highest_category:
    class_2.append(ont[id_]['name'])
    for id1_ in ont[id_]['child_ids']:
        class_1.append(ont[id1_]['name'])


class_1 = np.unique(class_1)
# Drops the name at sorted position 29 (reason not visible here - TODO confirm)
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

# Positions of class_1 that are kept; 21-24 and 30 are left out, leaving 37 classes.
condensed_idx = [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41]
class_1 = class_1[condensed_idx]

# Lookup table: original class index -> condensed index.
# NOTE(review): entries for the left-out indices (21-24, 30) stay at their
# initial 0, so labels carrying those indices are mapped to condensed class 0
# rather than being dropped - confirm this is intended.
class_to_condensed = np.zeros((42,))
for i in range(len(condensed_idx)):
    class_to_condensed[condensed_idx[i]] = int(i)

# data_dir is re-pointed here; later cells that reuse it see this value.
# allow_pickle=True unpickles object arrays - only load files you trust.
data_dir = './test/audioset_strong/'
sounds_data = np.load(data_dir + 'audioset_strong_bal_train_data.npy', allow_pickle=True)
class1_index = np.load(data_dir + 'audioset_strong_bal_train_labels_1.npy', allow_pickle=True)
class2_index = np.load(data_dir + 'audioset_strong_bal_train_labels_2.npy', allow_pickle=True)

# Flatten the loaded labels along axis 0, then remap each entry through the
# lookup table (the table is float, hence the cast back to int).
val_class_1 = np.concatenate(class1_index, axis=0)
val_class_1 = [np.asarray(lbl) for lbl in val_class_1]
val_class_1 = [class_to_condensed[lbl].astype('int') for lbl in val_class_1]

val_class_2 = np.concatenate(class2_index, axis=0)
val_class_2 = [np.asarray(lbl) for lbl in val_class_2]



# Save all data files
# Same base file names as the inputs, but written to ./data/ rather than
# ./test/audioset_strong/. sounds_data is saved unchanged.
save_dir = './data/'
np.save(save_dir + 'audioset_strong_bal_train_data.npy', np.asanyarray(sounds_data, dtype=object))
np.save(save_dir + 'audioset_strong_bal_train_labels_1.npy', np.asanyarray(val_class_1, dtype=object))
np.save(save_dir + 'audioset_strong_bal_train_labels_2.npy', np.asanyarray(val_class_2, dtype=object))


In [None]:
# Compare the lengths of the two label lists with the number of data entries,
# then show the first ten level-1 label arrays.
print(len(val_class_1))
print(len(val_class_2))
print(len(sounds_data))

print(val_class_1[:10])

In [None]:
# Reload the 'audioset_strong_unbal_train' arrays. This overwrites data_dir,
# sounds_data, class1_index and class2_index from earlier cells.
# allow_pickle=True unpickles object arrays - only load files you trust.
data_dir = './test/audioset_strong/'
sounds_data = np.load(data_dir + 'audioset_strong_unbal_train_data.npy', allow_pickle=True)
class1_index = np.load(data_dir + 'audioset_strong_unbal_train_labels_1.npy', allow_pickle=True)
class2_index = np.load(data_dir + 'audioset_strong_unbal_train_labels_2.npy', allow_pickle=True)

In [None]:
# Extract embeddings and two-level labels from the 'eval' TFRecords into
# sounds_data / train_class_1 / train_class_2.
#
# NOTE(review): this cell does not rebuild class_1, class_2, class_to_index_1
# or class_to_index_2; it uses whatever an earlier cell left in the kernel.
# Run one of the cells that defines them first, and confirm it is the intended
# class list.
data_dir = './'
val_features_dir = 'audioset_v1_embeddings/eval/'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id).
# Read from the metadata folder, like the other cells and preprocess_ontology.
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    reader = csv.reader(file)
    index_to_id = {rows[0]:rows[1] for rows in reader}

# Every directory entry is treated as a .tfrecord file
tfrecord_files = os.listdir(data_dir + val_features_dir)

# Data/class arrays (one entry per kept example, index-aligned)
train_class_1 = []
train_class_2 = []

# TFRecord attributes
context_description = {"video_id": "byte", "labels": "int"}
sequence_description = {"audio_embedding": "byte"}

ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)

count = 0

sounds_data = []

# Loop-invariant lookups, built once instead of once per example
class_1_set = set(class_1)
class_2_set = set(class_2)

# Load each tfrecord file
for filename in tfrecord_files:

    tfrecord_path = data_dir + val_features_dir + filename
    dataset = TFRecordDataset(tfrecord_path, index_path=None, description=context_description, sequence_description=sequence_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1)

    for data in loader:
        # data[0] carries the context features, data[1] the sequence features

        # Names of every labelled class and of all of its ancestors
        parent_names = []
        for i in data[0]['labels'][0].numpy():
            get_all_parents(index_to_id[str(i)], ont, parent_names)
        parent_names = set(parent_names)

        file_data = [t.numpy() for t in data[1]['audio_embedding']]
        if not file_data:
            # Nothing to pad from. The previous code padded with the loop
            # variable left over from an earlier example (or raised NameError
            # on the very first one), so such examples are skipped instead.
            continue

        # Pad short clips to 10 frames by repeating their own last frame
        file_data.extend([file_data[-1]] * (10 - len(file_data)))

        target_1_label = [class_to_index_1[name] for name in class_1_set & parent_names]
        target_2_label = [class_to_index_2[name] for name in class_2_set & parent_names]

        train_class_1.append(np.asarray(target_1_label))
        train_class_2.append(np.asarray(target_2_label))

        sounds_data.append(np.concatenate(file_data))
        count += 1

In [None]:
# Shape of all entries of sounds_data stacked along axis 0
print(np.concatenate(sounds_data).shape)

In [None]:
# Save all data files
# NOTE(review): these are the same three file names that the 'bal_train'
# extraction cell writes. If sounds_data / train_class_1 / train_class_2 were
# last filled by the cell that reads 'audioset_v1_embeddings/eval/', running
# this cell replaces the bal_train files with eval data - confirm the intended
# source and file names before running.
save_dir = './data/'
np.save(save_dir + 'audioset_train_data.npy', np.asanyarray(sounds_data, dtype=object))
np.save(save_dir + 'audioset_train_labels_1.npy', np.asanyarray(train_class_1, dtype=object))
np.save(save_dir + 'audioset_train_labels_2.npy', np.asanyarray(train_class_2, dtype=object))

In [None]:
# Test data
# Build the test split from the 'eval' TFRecords: for every example collect
# its embedding frames plus its level-1 / level-2 ontology label indices, then
# save the three index-aligned lists under ./data/.

data_dir = './'
val_features_dir = 'audioset_v1_embeddings/eval/'

# CSV column 0 (label index, kept as a string) -> column 1 (ontology id)
with open(data_dir + 'metadata/class_labels_indices.csv', mode='r') as file:
    reader = csv.reader(file)
    index_to_id = {rows[0]:rows[1] for rows in reader}

# Every directory entry is treated as a .tfrecord file
tfrecord_files = os.listdir(data_dir + val_features_dir)

# Data/class arrays (one entry per kept example, index-aligned)
test_class_1 = []
test_class_2 = []

# TFRecord attributes
context_description = {"video_id": "byte", "labels": "int"}
sequence_description = {"audio_embedding": "byte"}

ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)

# class_2: names of the parentless ontology nodes.
# class_1: names of their direct children.
class_2 = [ont[id_]['name'] for id_ in highest_category]
class_1 = [
    ont[child_id]['name']
    for id_ in highest_category
    for child_id in ont[id_]['child_ids']
]

class_1 = np.unique(class_1)
# Drops the name at sorted position 29.
# NOTE(review): which class this is, and why it is excluded, is not visible
# in this notebook - confirm before changing the ontology file.
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

class_to_index_1 = {name: i for i, name in enumerate(class_1)}
class_to_index_2 = {name: i for i, name in enumerate(class_2)}
count = 0

sounds_data = []

# Loop-invariant lookups, built once instead of once per example
class_1_set = set(class_1)
class_2_set = set(class_2)

# Load each tfrecord file
for filename in tfrecord_files:

    tfrecord_path = data_dir + val_features_dir + filename
    dataset = TFRecordDataset(tfrecord_path, index_path=None, description=context_description, sequence_description=sequence_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1)

    for data in loader:
        # data[0] carries the context features, data[1] the sequence features

        # Names of every labelled class and of all of its ancestors
        parent_names = []
        for i in data[0]['labels'][0].numpy():
            get_all_parents(index_to_id[str(i)], ont, parent_names)
        parent_names = set(parent_names)

        file_data = [t.numpy() for t in data[1]['audio_embedding']]
        if not file_data:
            # Nothing to pad from. The previous code padded with the loop
            # variable left over from an earlier example (or raised NameError
            # on the very first one), so such examples are skipped instead.
            continue

        # Pad short clips to 10 frames by repeating their own last frame
        file_data.extend([file_data[-1]] * (10 - len(file_data)))

        target_1_label = [class_to_index_1[name] for name in class_1_set & parent_names]
        target_2_label = [class_to_index_2[name] for name in class_2_set & parent_names]

        test_class_1.append(np.asarray(target_1_label))
        test_class_2.append(np.asarray(target_2_label))

        sounds_data.append(np.concatenate(file_data))
        count += 1

# Save all data files (object arrays because the per-example entries are ragged)
save_dir = './data/'
os.makedirs(save_dir, exist_ok=True)  # avoid losing the run to a missing folder
np.save(save_dir + 'audioset_test_data.npy', np.asanyarray(sounds_data, dtype=object))
np.save(save_dir + 'audioset_test_labels_1.npy', np.asanyarray(test_class_1, dtype=object))
np.save(save_dir + 'audioset_test_labels_2.npy', np.asanyarray(test_class_2, dtype=object))

In [None]:
# Get Ontology Matrix
# M[i][j] == 1 when level-1 class j is a direct child of the i-th parentless
# ontology node.
#
# The leftover copy-paste from the extraction cells was removed: it read the
# label CSV from a path without 'metadata/', listed a directory through a
# variable set in another cell, and reset train_class_1 / train_class_2 to
# empty lists, none of which this computation uses.
data_dir = './'

ont, highest_category = preprocess_ontology(data_dir)
print(highest_category)
class_2_names = [ont[id_]['name'] for id_ in highest_category]

class_1 = []   # flat list of all level-1 names
class_1_ = []  # the same names grouped per parentless node, in ontology order
class_2 = []
for id_ in highest_category:
    class_2.append(ont[id_]['name'])
    class_2_children = [ont[id1_]['name'] for id1_ in ont[id_]['child_ids']]
    class_1.extend(class_2_children)
    class_1_.append(class_2_children)

print(class_1_)

# NOTE(review): unlike the extraction cells, no entry is removed from class_1
# here, so column indices from position 29 onwards are shifted by one relative
# to the label indices those cells produce - confirm which indexing the
# consumer of M expects.
class_1 = np.unique(class_1)
class_2 = np.unique(class_2)

class_to_index_1 = {name: i for i, name in enumerate(class_1)}
class_to_index_2 = {name: i for i, name in enumerate(class_2)}

# Shape follows the ontology instead of a hard-coded (7, 43).
# NOTE(review): rows follow the order of highest_category, not the sorted
# order behind class_to_index_2 - confirm the consumer of M indexes rows
# accordingly.
M = np.zeros((len(class_1_), len(class_1)))
for i, children in enumerate(class_1_):
    for j in children:
        M[i][class_to_index_1[j]] = 1

print(M)






