In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
import json
import os
import time
import pandas as pd
import numpy as np

In [2]:
# Read the shared preprocessing/model settings written by the earlier
# pipeline stage and expose the ones this notebook uses as plain names.
with open('../params.json', 'r') as f:
    params = json.load(f)

(
    max_length,
    padding_type,
    vocab_size,
    embedding_dim,
    trunc_type,
    oov_tok,
) = (
    params[key]
    for key in (
        'max_length',
        'padding_type',
        'vocab_size',
        'embedding_dim',
        'trunc_type',
        'oov_tok',
    )
)

In [3]:
# Padded token sequences and labels for the train / validation splits.
train_x=np.load('../processed/train_padded.npy')
train_y=np.load('../processed/train_y.npy')
val_x=np.load('../processed/val_padded.npy')
val_y=np.load('../processed/val_y.npy')

# Token -> index mapping. Opened through a context manager so the file
# handle is closed deterministically instead of being left to the GC.
with open('../processed/word_index.json') as word_index_file:
    word_index = json.load(word_index_file)

# Per-example metadata (identity flags, toxicity sub-types, label, ...).
train_meta=pd.read_csv('../processed/train_meta.csv')
val_meta=pd.read_csv('../processed/val_meta.csv')

In [4]:
# List the metadata columns; the first eight are the identity flags that
# encode_labels() consumes via meta.iloc[:, :8].
train_meta.columns

Index(['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions',
       'black', 'white', 'identity_any', 'severe_toxicity', 'obscene',
       'threat', 'insult', 'identity_attack', 'sexual_explicit', 'y',
       'from_source_domain'],
      dtype='object')

##### Encoding separately

In [69]:

# def encode_labels(meta):
#     # example:
#     # female:1
#     # male:3
#     # lgbtq:5
#     # male+female:4
#     # male+lgbtq:8
#     # female+lgbtq:6
#     # male+female+lgbtq: 9
#     def encode_values(train_z):
#         categories=train_z.shape[1]
#         for i in range(categories):
#             train_z.iloc[:,i]=train_z.iloc[:,i].replace(1,2*i+1)
#         return(train_z)
#     z_gen=meta.iloc[:,:3]
#     z_rel=meta.iloc[:,3:6]
#     z_col=meta.iloc[:,6:8]
#     z_gen_encoded=encode_values(z_gen).sum(axis=1)
#     z_rel_encoded=encode_values(z_rel).sum(axis=1)
#     z_col_encoded=encode_values(z_col).sum(axis=1)
#     return ([z_gen_encoded.values,z_rel_encoded.values,z_col_encoded.values])
# train_z=encode_labels(train_meta)
# val_z=encode_labels(val_meta)
# # Step 3: Train classifiers and estimate P(y|z) using a simple neural network
# embedding_dim = 16

# text_input = tf.keras.layers.Input(shape=(max_length,))
# embedding_layer = tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length)(text_input)
# flatten_text = tf.keras.layers.Flatten()(embedding_layer)

# # Create separate embedding layers for each identity information category
# meta_gen = tf.keras.layers.Input(shape=(1,))
# embedding_gen = tf.keras.layers.Embedding(input_dim=7, output_dim=embedding_dim)(meta_gen)
# flatten_gen = tf.keras.layers.Flatten()(embedding_gen)

# meta_rel = tf.keras.layers.Input(shape=(1,))
# embedding_rel = tf.keras.layers.Embedding(input_dim=7, output_dim=embedding_dim)(meta_rel)
# flatten_rel = tf.keras.layers.Flatten()(embedding_rel)

# meta_col = tf.keras.layers.Input(shape=(1,))
# embedding_col = tf.keras.layers.Embedding(input_dim=3, output_dim=embedding_dim)(meta_col)
# flatten_col = tf.keras.layers.Flatten()(embedding_col)


##### Encoding Together

In [5]:
# find values to be used for encoding:
import itertools

def all_sums(lst):
    """Return every total reachable by adding two or more items of ``lst``.

    Args:
        lst: sequence of numbers.

    Returns:
        A ``set`` with the sum of each combination of size 2..len(lst).
        Empty when ``lst`` has fewer than two items.
    """
    return {
        sum(combo)
        for size in range(2, len(lst) + 1)
        for combo in itertools.combinations(lst, size)
    }

# Candidate per-column codes: the eight odd numbers 5, 7, ..., 19.
values = [5 + 2 * idx for idx in range(8)]
all_values = all_sums(values)

# Print every single code that can also be produced by adding two or more
# codes together (nothing is printed when there is no such overlap).
# NOTE(review): this only compares single codes against multi-code sums; it
# does not detect two different combinations sharing a total
# (e.g. 5 + 19 == 7 + 17).
overlapping = [code for code in values if code in all_values]
for code in overlapping:
    print(code)

In [6]:
def encode_labels(meta):
    """Collapse the first eight metadata columns into one code per row.

    Column ``i`` (0-based) is assigned the odd code ``2*i + 5``; every cell
    equal to 1 is replaced by its column's code, all other cell values are
    kept as they are, and each row is then summed.  For the metadata shown
    in this notebook the first eight columns are male, female, LGBTQ,
    christian, muslim, other_religions, black, white, so for example:

        no identity flag set -> 0
        male                 -> 5
        female               -> 7
        LGBTQ                -> 9
        male + female        -> 12
        male + female + LGBTQ -> 21

    Note that different flag combinations can share a total
    (e.g. columns 0 and 7 give 5 + 19 = 24, as do columns 1 and 6 with
    7 + 17), so the code does not uniquely identify the combination.

    The input frame is left untouched: the original implementation assigned
    into an ``.iloc`` slice of ``meta``, which could overwrite the caller's
    data (and trigger pandas' SettingWithCopyWarning).

    Args:
        meta: DataFrame whose first eight columns are the identity flags.

    Returns:
        1-D numpy array with one encoded value per row of ``meta``.
    """
    flags = meta.iloc[:, :8].to_numpy()
    column_codes = 2 * np.arange(flags.shape[1]) + 5
    # Broadcast the per-column code over the rows; only cells that are
    # exactly 1 are swapped for their code.
    encoded = np.where(flags == 1, column_codes, flags)
    # nansum mirrors DataFrame.sum's default of skipping missing values.
    return np.nansum(encoded, axis=1)

In [7]:
# Encode the identity flags of both splits (train first, then validation).
train_z, val_z = (encode_labels(split_meta) for split_meta in (train_meta, val_meta))

In [8]:
# Number of distinct encoded identity codes present in the training split.
# This is a count of distinct values, not the largest code.
len(np.unique(train_z))

78

In [9]:
# Frequency of each encoded identity code, ordered by code value.
pd.DataFrame({'value':train_z}).value_counts().sort_index()

value
0        160916
5         11395
7         18983
9          4378
11        17649
          ...  
84            2
85            1
87            6
91            1
96            2
Name: count, Length: 78, dtype: int64

In [10]:
# Number of entries in the loaded word index; the text Embedding below is
# sized as len(word_index) + 1.
len(word_index)

154222

In [11]:
# Step 3: Train classifiers and estimate P(y|z) using a simple neural network
# NOTE(review): this replaces the embedding_dim read from params.json. The
# later cells pass the 17-column train_meta frame where `flatten_meta`
# (width = embedding_dim) is expected, so the value is left at 17 here.
embedding_dim = 17

# Text branch: token ids -> embeddings -> one flat vector per example.
text_input = tf.keras.layers.Input(shape=(max_length,))
embedding_layer = tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_length)(text_input)
flatten_text = tf.keras.layers.Flatten()(embedding_layer)

# Identity branch: one encoded identity code per example.
# An Embedding needs input_dim > the largest index it will see. The codes
# are sparse sums (78 distinct values, but reaching 96 in the training
# split), so the table is sized from the largest code rather than from the
# number of distinct codes.
num_meta_codes = int(max(np.max(train_z), np.max(val_z))) + 1
meta = tf.keras.layers.Input(shape=(1,))
embedding_meta = tf.keras.layers.Embedding(input_dim=num_meta_codes, output_dim=embedding_dim)(meta)
flatten_meta = tf.keras.layers.Flatten()(embedding_meta)

2024-01-23 18:26:48.909230: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-01-23 18:26:48.909258: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-23 18:26:48.909265: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-23 18:26:48.909302: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-23 18:26:48.909319: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
# Join the flattened text features and the flattened identity features
# into a single vector per example.
concatenated_embeddings = tf.keras.layers.Concatenate()([flatten_text, flatten_meta])

# Small head: one 16-unit ReLU layer followed by a single sigmoid unit.
dense_layer = Dense(16, activation='relu')(concatenated_embeddings)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# NOTE(review): the second model input is `flatten_meta` (an intermediate
# tensor of width embedding_dim), not the `meta` Input layer, so the model
# takes a dense vector directly and the identity Embedding is not on the
# input path. Together with fit()/predict() being given `train_meta`
# (whose columns include the label `y`) this looks unintended; the likely
# intent is inputs=[text_input, meta] fed with train_z / val_z. Fixing it
# requires changing this cell and the fit/predict cells together.
prob_model = Model(inputs=[text_input, flatten_meta], outputs=output_layer)

prob_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

##### Model

In [13]:
# Train for 5 epochs, validating on the held-out split after each epoch.
# NOTE(review): the second input is the whole train_meta / val_meta frame.
# Its columns include the target `y` (plus the toxicity sub-type columns),
# so the label is visible to the model; the encoded train_z / val_z arrays
# computed earlier are not used. Confirm and fix together with the model
# definition (inputs=[text_input, meta]).
prob_history=prob_model.fit([train_x,train_meta], train_y, epochs=5, batch_size=32,validation_data=([val_x,val_meta], val_y))

Epoch 1/5


2024-01-23 18:26:54.003718: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Model output is P(y=1 | inputs) for every training example.
# NOTE(review): train_meta (which contains the `y` column) is what the
# model was fitted on, so it is passed here as well; see the fit cell.
probabilities = prob_model.predict([train_x, train_meta]).flatten()
prior_prob_y = np.mean(train_y)

# Instance weight w_i = P(y_i) / P(y_i | inputs_i), evaluated at each
# example's OWN label. The previous form used P(y=1) / P(y=1 | inputs) for
# every row, which is wrong for negative examples and blows up (~1e11)
# whenever the predicted positive probability is close to zero.
# Assumes train_y holds binary 0/1 labels — TODO confirm.
is_positive = np.asarray(train_y).reshape(-1) >= 0.5
prior_of_label = np.where(is_positive, prior_prob_y, 1.0 - prior_prob_y)
prob_of_label = np.where(is_positive, probabilities, 1.0 - probabilities)

# Keep the denominator away from zero so the weights stay finite.
min_prob = np.finfo(np.float32).eps
weights = prior_of_label / np.clip(prob_of_label, min_prob, None)



In [16]:
# Inspect the predicted probabilities.
# NOTE(review): values pinned at 1.0 or ~1e-10 are consistent with the label
# column `y` being part of the model input (train_meta) — confirm.
probabilities

array([1.0000000e+00, 1.0000000e+00, 1.0000000e+00, ..., 6.9287011e-12,
       3.4749542e-10, 7.3590889e-10], dtype=float32)

In [17]:
# Mean of train_y, used as the prior P(y=1) in the weight computation.
prior_prob_y

0.11342306076859317

In [18]:
# One weight per training example is expected here.
weights.shape

(269037,)

In [24]:
# Persist the trained model, its training history and the instance weights
# in a fresh, timestamp-named directory under ../models.
t = str(time.time())
path=os.path.join('../models',t)
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(path, exist_ok=True)

# File stem carries the final validation accuracy rounded to 2 decimals.
accuracy_path=str(round(prob_history.history['val_accuracy'][-1],2))
export_path = os.path.join(path,f'weighted_{accuracy_path}')
prob_model.save(f'{export_path}.keras')

# Cast to built-in floats so the history is JSON-serialisable even if the
# metrics are numpy scalars, and write through a context manager so the
# file is flushed and closed (the bare open() previously leaked the handle).
history_serializable = {
    metric: [float(value) for value in series]
    for metric, series in prob_history.history.items()
}
with open(f'{export_path}.json','w') as f:
    json.dump(history_serializable, f)

with open(f'{export_path}_weights.npy','wb') as f:
    np.save(f, weights)