In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
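# Placeholder for loading the real preprocessed dataset. It is disabled; the
# dummy data generated in the next cell is used instead.
# TODO: train_z has no loader yet, and val_z needs `import pandas as pd` plus a real CSV path.
# train_x=np.load('./processed/train_padded.npy')
# train_z
# train_y=np.load('./processed/train_y.npy')
# val_x=np.load('./processed/val_padded.npy')
# val_z=pd.read_csv('./')
# val_y=np.load('./processed/val_y.npy')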


In [3]:
# Generate some dummy data for illustration purposes, so the notebook can run
# end-to-end without the real dataset.
np.random.seed(42)  # fixed seed: the dummy arrays are identical on every run
num_samples = 1000
max_length = 10  # number of token positions per text sample
num_identity_categories = 3  # Adjust this based on the number of identity categories
vocab_size = 50  # size of the dummy vocabulary; id 0 is left unused (padding)

# The text samples are consumed by an Embedding layer, which looks up integer
# token ids. Uniform floats in [0, 1) would all be cast to id 0, so draw
# integer ids in [1, vocab_size) instead.
x = np.random.randint(1, vocab_size, size=(num_samples, max_length))  # Dummy token ids for text samples
z1 = np.random.choice(['male', 'female'], size=num_samples)  # Dummy identity information category 1
z2 = np.random.choice(['young', 'old'], size=num_samples)  # Dummy identity information category 2
z3 = np.random.choice(['student', 'professional'], size=num_samples)  # Dummy identity information category 3
y = np.random.randint(2, size=num_samples)  # Dummy labels (binary classification)

In [6]:
# Step 2: Preprocess text data and split the dataset
# `x` is already a fixed-length numeric array, so it is split as-is. When
# starting from raw text instead, tokenize and pad it first, e.g.:
# tokenizer = tf.keras.preprocessing.text.Tokenizer()
# tokenizer.fit_on_texts(x)
# x_seq = tokenizer.texts_to_sequences(x)
# x_padded = tf.keras.preprocessing.sequence.pad_sequences(x_seq, maxlen=max_length, padding='post')

# Encode each identity information category separately as a 0/1 indicator
# (1 marks 'female', 'old' and 'professional' respectively).
z1_encoded = (z1 == 'female').astype(int)
z2_encoded = (z2 == 'old').astype(int)
z3_encoded = (z3 == 'professional').astype(int)

# Split every array with the same shuffle so rows stay aligned across
# text, identity attributes and labels. The later cells rely on these names.
(
    x_train, x_test,
    z1_train, z1_test,
    z2_train, z2_test,
    z3_train, z3_test,
    y_train, y_test,
) = train_test_split(
    x, z1_encoded, z2_encoded, z3_encoded, y, test_size=0.2, random_state=42
)

assert len(x_train) + len(x_test) == len(x), "split lost or duplicated rows"

In [None]:
# Step 3: Train classifiers and estimate P(y|z) using a simple neural network
embedding_dim = 8

# Vocabulary size for the text embedding. No fitted tokenizer exists in this
# notebook, so derive it from the data: valid ids run from 0 to the largest
# id present in `x`.
text_vocab_size = int(np.max(x)) + 1

# Text branch: token ids -> embeddings -> one flat vector per sample.
# The sequence length comes from the Input shape, so the deprecated
# `input_length` argument is not passed to Embedding.
text_input = tf.keras.layers.Input(shape=(max_length,))
embedding_layer = tf.keras.layers.Embedding(input_dim=text_vocab_size, output_dim=embedding_dim)(text_input)
flatten_text = tf.keras.layers.Flatten()(embedding_layer)


def _identity_branch(num_values=2):
    """Build one identity-attribute branch of the model.

    Args:
        num_values: number of distinct encoded values of the attribute
            (2 for the binary 0/1 encodings used in this notebook).

    Returns:
        Tuple ``(input, embedding, flat)``: the scalar Input tensor, its
        embedding, and the flattened embedding.
    """
    branch_input = tf.keras.layers.Input(shape=(1,))
    branch_embedding = tf.keras.layers.Embedding(input_dim=num_values, output_dim=embedding_dim)(branch_input)
    branch_flat = tf.keras.layers.Flatten()(branch_embedding)
    return branch_input, branch_embedding, branch_flat


# Create separate embedding layers for each identity information category
identity_input1, embedding_identity1, flatten_identity1 = _identity_branch()
identity_input2, embedding_identity2, flatten_identity2 = _identity_branch()
identity_input3, embedding_identity3, flatten_identity3 = _identity_branch()


In [None]:
# Join the text branch and the three identity branches into one feature vector.
branch_outputs = [flatten_text, flatten_identity1, flatten_identity2, flatten_identity3]
concatenated_embeddings = tf.keras.layers.concatenate(branch_outputs)

# Classification head: one hidden ReLU layer, then a single sigmoid unit.
dense_layer = tf.keras.layers.Dense(units=16, activation='relu')(concatenated_embeddings)
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')(dense_layer)

# Input order here fixes the order of the arrays passed to fit/predict.
model_inputs = [text_input, identity_input1, identity_input2, identity_input3]
model = tf.keras.Model(inputs=model_inputs, outputs=output_layer)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the joint text + identity classifier, holding out 20% of the training
# arrays for validation.
train_inputs = [x_train, z1_train, z2_train, z3_train]
model.fit(x=train_inputs, y=y_train, batch_size=32, epochs=5, validation_split=0.2)

In [None]:
# Step 4: Calculate instance weights
# Each training sample i gets weight P(y_i) / P(y_i | inputs_i), where y_i is
# the sample's observed label. `model` outputs P(y=1 | inputs), so positives
# use prior / p and negatives use (1 - prior) / (1 - p).
# NOTE(review): `model` also receives the text input, so the denominator is
# P(y | x, z) rather than P(y | z) -- confirm this is intended.
probabilities = model.predict([x_train, z1_train, z2_train, z3_train]).flatten()
prior_prob_y = np.mean(y_train)  # empirical P(y=1) on the training split

# Keep predictions strictly inside (0, 1) so neither ratio divides by zero.
_eps = 1e-7
_clipped_probabilities = np.clip(probabilities, _eps, 1.0 - _eps)

weights = np.where(
    y_train == 1,
    prior_prob_y / _clipped_probabilities,
    (1.0 - prior_prob_y) / (1.0 - _clipped_probabilities),
)

In [None]:
# Step 5: Train models using calculated instance weights
# Text-only classifier: it takes a single input of `max_length` token ids and
# no identity attributes.

# No fitted tokenizer exists in this notebook, so derive the vocabulary size
# from the data: valid ids run from 0 to the largest id present in `x`.
text_vocab_size = int(np.max(x)) + 1

weighted_model = tf.keras.Sequential([
    # Explicit input shape replaces the deprecated `input_length` argument.
    tf.keras.layers.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=text_vocab_size, output_dim=8),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
weighted_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# weighted_model is a single-input (text only) Sequential model, so it is fed
# x_train alone. The identity attributes influence training only through the
# per-sample `weights`.
weighted_model.fit(x_train, y_train, epochs=5, batch_size=32, sample_weight=weights, validation_split=0.2)

In [None]:
# Step 6: Evaluate the model
# weighted_model takes only the text input, so predict on x_test alone, then
# threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
y_pred_weighted = (weighted_model.predict(x_test) > 0.5).astype(int).flatten()
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
print("Accuracy with instance weights:", accuracy_weighted)
