In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import tensorflow as tf

In [None]:
# Custom metrics definitions
def recall_m(y_true, y_pred):
    """Recall computed over the tensors passed in.

    True positives are the entries where ``y_true * y_pred``, clipped to
    [0, 1], rounds to 1; possible positives are the entries of ``y_true``
    that round to 1. Keras' epsilon is added to the denominator so an input
    without positive labels yields 0 instead of a division by zero.

    NOTE(review): when used as a Keras function metric this is presumably
    evaluated per batch and averaged, which differs from recall over the
    whole dataset - confirm before reporting the number.
    """
    eps = tf.keras.backend.epsilon()
    hits = tf.round(tf.clip_by_value(y_true * y_pred, 0, 1))
    actual_positives = tf.round(tf.clip_by_value(y_true, 0, 1))
    return tf.reduce_sum(hits) / (tf.reduce_sum(actual_positives) + eps)

def precision_m(y_true, y_pred):
    """Precision computed over the tensors passed in.

    True positives are the entries where ``y_true * y_pred``, clipped to
    [0, 1], rounds to 1; predicted positives are the entries of ``y_pred``
    that round to 1. Keras' epsilon is added to the denominator so an input
    without positive predictions yields 0 instead of a division by zero.
    """
    eps = tf.keras.backend.epsilon()
    hits = tf.round(tf.clip_by_value(y_true * y_pred, 0, 1))
    flagged = tf.round(tf.clip_by_value(y_pred, 0, 1))
    return tf.reduce_sum(hits) / (tf.reduce_sum(flagged) + eps)

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    Keras' epsilon in the denominator keeps the result finite (0) when both
    precision and recall are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + tf.keras.backend.epsilon())
    return 2 * harmonic_half

In [None]:
# Locations of the two training domains and of the test set.
domain1_path = "/content/domain1_train_data.json"
domain2_path = "/content/domain2_train_data.json"
test_data_path = "/content/test_data.json"

# Each training file is parsed with lines=True, i.e. one JSON record per line.
df1 = pd.read_json(path_or_buf=domain1_path, lines=True)
df2 = pd.read_json(path_or_buf=domain2_path, lines=True)

ValueError: Unexpected character found when decoding array value (1)

In [None]:
# Data preparation.
# MAX_LENGTH is the `maxlen` handed to pad_sequences, i.e. the fixed length
# every token sequence is padded/truncated to; it is also passed to
# create_lstm_model as the model's input length.
MAX_LENGTH = 1200
def prepare_data(df):
    """Turn a frame with 'text' and 'label' columns into model inputs.

    Every token id in every 'text' entry is incremented by one (presumably so
    that id 0 stays free for the padding value pad_sequences inserts by
    default - confirm), then the sequences are padded/truncated to MAX_LENGTH.

    Returns:
        (X_pad, y): the padded id matrix and the labels as a NumPy array.
    """
    shifted_sequences = [
        [token_id + 1 for token_id in sequence]
        for sequence in df['text']
    ]
    X_pad = pad_sequences(shifted_sequences, maxlen=MAX_LENGTH)
    y = np.array(df['label'])
    return X_pad, y

# Padded inputs and labels for each training domain. The "balanced" /
# "imbalanced" naming matches the label distributions printed further down
# (df1: 2500/2500, df2: 11500/1500).
X_balanced, y_balanced = prepare_data(df1)
X_imbalanced, y_imbalanced = prepare_data(df2)

In [None]:
# Model hyper-parameters.
EMBEDDING_DIM = 50
# Embedding rows = largest (shifted) token id seen in either training set + 1.
# NOTE(review): test-set ids are not taken into account here; an id above
# this bound would be out of range for the Embedding layer - confirm the
# test vocabulary is covered.
VOCAB_SIZE = max(X_balanced.max(), X_imbalanced.max()) + 1

def create_lstm_model(vocab_size, embedding_dim, max_length):
    """Build the (uncompiled) binary sequence classifier.

    Architecture, in order: token Embedding -> LSTM(16) emitting the full
    sequence -> global max pooling over time -> Dense(32, relu) ->
    Dropout(0.5) -> single sigmoid output unit.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        max_length: length of the padded input sequences.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compile().
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    # return_sequences=True keeps every timestep so the pooling layer can
    # reduce over the time axis.
    model.add(LSTM(16, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model


# Two-phase training: first on the balanced domain, then continue on the
# imbalanced domain, always keeping the weights with the lowest val_loss.
#
# NOTE(review): validation_split=0.2 holds out the LAST 20% of rows (Keras
# slices before shuffling). If the frames are ordered by label the validation
# set will be skewed - consider an explicit stratified split.

# --- Phase 1: balanced dataset -------------------------------------------
model = create_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_m, precision_m, recall_m])

balanced_weights_path = '/content/model_balanced_weights.h5'
# save_weights_only=True: the file is only ever consumed by load_weights(),
# and saving the whole model to .h5 triggers Keras' legacy-HDF5 warning.
model_checkpoint_balanced = ModelCheckpoint(
    balanced_weights_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
model.fit(
    X_balanced, y_balanced,
    batch_size=16,
    epochs=20,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=1), model_checkpoint_balanced],
)

# EarlyStopping(patience=1) stops one epoch AFTER the best one, so roll the
# model back to the best phase-1 checkpoint before continuing.
model.load_weights(balanced_weights_path)

# --- Phase 2: imbalanced dataset ------------------------------------------
# SMOTE is NOT applied here: the model continues training on the raw
# imbalanced data (the checkpoint file name is historical).
imbalanced_weights_path = '/content/model_smote_imbalanced_weights.h5'
model_checkpoint_imbalanced = ModelCheckpoint(
    imbalanced_weights_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
model.fit(
    X_imbalanced, y_imbalanced,
    batch_size=16,
    epochs=20,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=1), model_checkpoint_imbalanced],
)

# Same reasoning as after phase 1: without this reload the weights saved
# below would be those of the last (already degrading) epoch instead of the
# best phase-2 epoch.
model.load_weights(imbalanced_weights_path)

# Persist the final weights used by the inference cell.
final_model_weights_path = '/content/final_model_weights.h5'
model.save_weights(final_model_weights_path)



Epoch 1/20
Epoch 2/20


  saving_api.save_model(


Epoch 3/20
Epoch 4/20
Epoch 1/20
Epoch 2/20
Epoch 3/20


In [None]:
# Rebuild the network with create_lstm_model and restore the saved weights.
final_model_weights_path = '/content/final_model_weights.h5'  # adjust if the weights live elsewhere
final_model = create_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH)
final_model.load_weights(final_model_weights_path)

# Compiling is only required for evaluate()/further training; predict() below
# does not depend on it.
final_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Test records (a DataFrame, despite the X_ prefix).
X_test = pd.read_json("/content/test_data.json", lines=True)

# Same preprocessing as for training: shift ids by one, then pad to MAX_LENGTH.
texts_test = [[token_id + 1 for token_id in sequence] for sequence in X_test['text']]
X_test_pad = pad_sequences(texts_test, maxlen=MAX_LENGTH)

# Sigmoid scores as a flat vector, thresholded at 0.5 into 0/1 labels.
predictions = final_model.predict(X_test_pad).flatten()
y_pred_final = (predictions > 0.5).astype(int)

# One row per test record: its id and the predicted class.
submission_df = pd.DataFrame({'id': X_test['id'], 'class': y_pred_final})

# How many records were assigned to each class.
print(Counter(y_pred_final))

# Write the submission file.
submission_file_path = '/content/kaggle_submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"Submission saved to {submission_file_path}")

In [None]:
# Class balance of the first training domain.
label_counts_df1 = df1['label'].value_counts()
print("Label distribution in df1:", label_counts_df1, sep="\n")

# Class balance of the second training domain.
label_counts_df2 = df2['label'].value_counts()
print("\nLabel distribution in df2:", label_counts_df2, sep="\n")


Label distribution in df1:
label
1    2500
0    2500
Name: count, dtype: int64

Label distribution in df2:
label
0    11500
1     1500
Name: count, dtype: int64


In [None]:
# Balanced-domain training with an explicit, stratified train/validation split.
# Requires df1, prepare_data, create_lstm_model, the custom metrics and the
# model constants from the cells above.

# Build the padded inputs with the shared preprocessing. The previous version
# referenced `texts` / `labels`, which are never defined at notebook level and
# raised NameError on a fresh kernel.
X_balanced_pad, y_balanced = prepare_data(df1)

# Where the best weights (lowest val_loss) of this run are written.
model_weights_file = '/content/model_balanced_split_weights.h5'

early_stop = EarlyStopping(monitor='val_loss', patience=1)  # Adjusted patience
checkpoint = ModelCheckpoint(model_weights_file, monitor="val_loss", mode="min", save_weights_only=True, save_best_only=True, verbose=1)

# Stratified 80/20 split; random_state makes the split reproducible on re-run.
X_balanced_train, X_balanced_val, y_balanced_train, y_balanced_val = train_test_split(
    X_balanced_pad, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

# Fresh model (this rebinds `model`, replacing any earlier one in the kernel).
model = create_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m, precision_m, recall_m])

# Train on the balanced data, validating on the held-out stratified part.
history_balanced = model.fit(X_balanced_train, y_balanced_train,
                             epochs=20,
                             batch_size=64,
                             validation_data=(X_balanced_val, y_balanced_val),
                             callbacks=[early_stop, checkpoint])


NameError: name 'texts' is not defined

In [None]:
# Continue training `model` on a SMOTE-balanced version of the imbalanced domain.

# Build the padded inputs with the shared preprocessing. The previous version
# referenced `texts_im` / `labels_im`, which are never defined at notebook level.
X_imbalanced_pad, y_imbalanced = prepare_data(df2)

# Split BEFORE oversampling: resampling first lets synthetic points (built
# from neighbours that may end up in the validation part) leak across the
# split and makes validation scores optimistic. The validation set therefore
# stays real, un-resampled data.
X_imbalanced_train, X_smote_val, y_imbalanced_train, y_smote_val = train_test_split(
    X_imbalanced_pad, y_imbalanced, test_size=0.2, stratify=y_imbalanced, random_state=42
)

# Oversample the minority class in the training part only.
# NOTE(review): SMOTE interpolates between padded token-id vectors, so the
# synthetic rows are not real token sequences - consider class_weight or
# plain random oversampling instead.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_imbalanced_train, y_imbalanced_train)
X_smote_train, y_smote_train = X_smote, y_smote

# Fresh callbacks for this phase: a ModelCheckpoint reused from an earlier
# fit() keeps its previous `best` value, so it would only save if this phase
# beat a val_loss measured on a different validation set.
smote_weights_file = '/content/model_smote_continued_weights.h5'
early_stop_smote = EarlyStopping(monitor='val_loss', patience=1)
checkpoint_smote = ModelCheckpoint(smote_weights_file, monitor="val_loss", mode="min", save_weights_only=True, save_best_only=True, verbose=1)

# Continue training on the SMOTE-balanced training data.
history_imbalanced_continued = model.fit(X_smote_train, y_smote_train,
                                         epochs=20,
                                         batch_size=64,
                                         validation_data=(X_smote_val, y_smote_val),
                                         callbacks=[early_stop_smote, checkpoint_smote])


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

# Relies on MAX_LENGTH and on the `model` object currently bound in the
# kernel; both must come from earlier cells.

# Test records (a DataFrame, despite the X_ prefix).
X_test = pd.read_json("/content/test_data.json", lines=True)

# Shift every token id by one and pad to MAX_LENGTH, as done for training.
texts_test = [[token_id + 1 for token_id in sequence] for sequence in X_test['text']]
X_test_pad = pad_sequences(texts_test, maxlen=MAX_LENGTH)

# Sigmoid scores as a flat vector.
predictions = model.predict(X_test_pad).flatten()

# Threshold at 0.5 to obtain 0/1 labels.
y_pred_final = (predictions > 0.5).astype(int)

# One row per test record: its id and the predicted class.
submission_df = pd.DataFrame({'id': X_test['id'], 'class': y_pred_final})

# How many records were assigned to each class.
print(Counter(y_pred_final))

# Write the submission file (same path as the other inference cell, so the
# most recently executed one wins).
submission_file_path = '/content/kaggle_submission.csv'
submission_df.to_csv(submission_file_path, index=False)
print(f"Submission saved to {submission_file_path}")

# Preview of the first submission rows.
print(submission_df.head())