In [None]:
pip install tensorflow==2.11 keras==2.11 transformers==4.27.4 nltk seaborn scikit-learn nltk emoji numpy pandas matplotlib

In [None]:
import nltk
# Fetch the 'omw-1.4' NLTK resource up front.
# NOTE(review): presumably required by the WordNetLemmatizer used later in
# this file — confirm.
nltk.download('omw-1.4')

In [None]:
import nltk

# Download only the NLTK resources this file uses instead of
# nltk.download('all'), which fetches every available corpus and model.
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up punkt_tab)
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import pickle  # For saving models in Pickle format

# Set random seeds for reproducibility (NumPy and TensorFlow generators).
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Output directory for saving plots and model
output_dir = 'acp_models/AWARE_Productivityfinalresult'
os.makedirs(output_dir, exist_ok=True)

# Load your dataset and filter for 'productivity' domain
print("Loading dataset...")
df = pd.read_csv("AWARE_Comprehensive.csv")
# .copy() makes the filtered rows an independent frame, so the
# 'encoded_label' column added below is not written onto a slice of the
# full CSV (the pattern pandas reports as SettingWithCopyWarning).
df = df[df['domain'] == 'productivity'].copy()
print(f"Dataset loaded: {len(df)} samples in 'productivity' domain.")
if df.empty:
    # Fail here with a clear message instead of deep inside model creation
    # or the cross-validation split.
    raise ValueError("No rows with domain == 'productivity' in AWARE_Comprehensive.csv")

# Encode categorical labels: each distinct 'sentiment' value becomes an int.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['sentiment'])

# Initialize the BERT tokenizer and set max sequence length
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256

# Initialize the BERT-based model with one output per encoded label.
num_labels = len(df['encoded_label'].unique())
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Shared NLTK helpers used by clean_text.
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def clean_text(text):
    """Normalise a raw sentence into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip ``http...`` URLs, replace emoji with
    their text names, drop punctuation, tokenize, remove English stop words,
    lemmatize, then stem.

    Args:
        text: The raw sentence. Non-string values (for example the float NaN
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing is
        left or the input was not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    # Use spaces as delimiters: with the default ':' delimiters the
    # punctuation strip below would glue an emoji name onto a neighbouring
    # word ("great:grinning_face:" -> "greatgrinning_face").
    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    # Single pass: filter stop words, then lemmatize and stem each survivor.
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    )

def preprocess_data(df, tokenizer, max_length=256):
    """Clean and tokenize the sentences of ``df`` and collect their labels.

    Args:
        df: Frame providing a 'sentence' column (raw text) and an
            'encoded_label' column.
        tokenizer: Callable tokenizer; invoked with padding and truncation
            enabled and TensorFlow tensors requested.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        A ``(tokenized, labels)`` pair: the tokenizer's output for the
        cleaned sentences, and the labels as a plain Python list.
    """
    cleaned_sentences = [clean_text(sentence) for sentence in df['sentence']]
    label_list = df['encoded_label'].tolist()
    encoded_batch = tokenizer(
        cleaned_sentences,
        padding=True,
        truncation=True,
        return_tensors='tf',
        max_length=max_length,
    )
    return encoded_batch, label_list

# Define stratified K-fold cross-validation
n_splits = 10
# Out-of-fold ground truth and predictions, pooled over every fold.
all_test_labels = []
all_predicted_labels = []
# Training curves: one list of per-epoch values appended per fold.
all_train_loss = []
all_train_accuracy = []
all_val_loss = []
all_val_accuracy = []

fold_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_indices, val_indices) in enumerate(fold_splitter.split(df['sentence'], df['encoded_label'])):
    print(f"Fold {fold + 1}...")
    fold_train_df = df.iloc[train_indices]
    fold_val_df = df.iloc[val_indices]  # held-out fold, used only for testing

    # Carve the early-stopping validation set out of the training part.
    # Stratify it like the outer split so class proportions stay comparable;
    # stratification needs at least two samples of every class, so fall back
    # to a plain random split when a class is rarer than that.
    fold_train_labels = fold_train_df['encoded_label']
    stratify_labels = fold_train_labels if fold_train_labels.value_counts().min() >= 2 else None
    train_df, val_df = train_test_split(
        fold_train_df,
        test_size=0.1,
        random_state=42,
        stratify=stratify_labels,
    )

    train_data, train_labels = preprocess_data(train_df, tokenizer, max_length)
    val_data, val_labels = preprocess_data(val_df, tokenizer, max_length)
    test_data, test_labels = preprocess_data(fold_val_df, tokenizer, max_length)

    # Keras fit/predict are fed plain NumPy arrays keyed by input name.
    train_data = {key: np.array(tensor) for key, tensor in train_data.items()}
    val_data = {key: np.array(tensor) for key, tensor in val_data.items()}
    test_data = {key: np.array(tensor) for key, tensor in test_data.items()}

    # Start every fold from the pretrained checkpoint. Re-using one model
    # object across folds would carry the fine-tuned weights forward, so from
    # the second fold on the "held-out" samples would already have been seen
    # during training and the pooled report would be optimistically biased.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # from_logits=True: the model's output is read as raw logits below.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
    # NOTE(review): no `monitor` is passed here, so the checkpoint tracks the
    # callback's default metric rather than 'val_accuracy' — confirm intended.
    model_checkpoint = ModelCheckpoint(os.path.join(output_dir, f'best_model_fold_{fold}'), save_best_only=True)

    history = model.fit(
        train_data, np.array(train_labels),
        validation_data=(val_data, np.array(val_labels)),
        epochs=16, batch_size=8,
        callbacks=[early_stopping, model_checkpoint],
        verbose=2
    )

    # Record this fold's per-epoch curves in the accumulators declared above.
    all_train_loss.append(history.history['loss'])
    all_train_accuracy.append(history.history['accuracy'])
    all_val_loss.append(history.history['val_loss'])
    all_val_accuracy.append(history.history['val_accuracy'])

    # Predict the held-out fold and pool labels/predictions for the final report.
    test_predictions = model.predict(test_data)['logits']
    test_predicted_labels = np.argmax(test_predictions, axis=1)
    all_test_labels.extend(test_labels)
    all_predicted_labels.extend(test_predicted_labels)

# Summarise the pooled out-of-fold predictions per class.
class_names = label_encoder.classes_
class_report = classification_report(
    all_test_labels,
    all_predicted_labels,
    target_names=class_names,
)
print("AWARE_Productivity: Classification Report Across Folds:\n", class_report)

# Persist the weights of the current `model` object as an .h5 file.
h5_path = os.path.join(output_dir, "bert_model_weights.h5")
model.save_weights(h5_path)
print(f"Model weights saved as .h5 at: {h5_path}")

# Serialise the whole model object with pickle.
# NOTE(review): pickling a TensorFlow/Keras model may not round-trip
# reliably — confirm this artefact can actually be loaded back.
pickle_path = os.path.join(output_dir, "bert_model.pkl")
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(model, pickle_file)
print(f"Model saved as Pickle at: {pickle_path}")

# Write the model in HuggingFace format into the output directory.
model.save_pretrained(output_dir)
print(f"Model saved in HuggingFace format at: {output_dir}")

# Confusion matrix over the pooled out-of-fold predictions.
conf_mat = confusion_matrix(all_test_labels, all_predicted_labels)

# Draw the matrix as an annotated heatmap, save it, then display it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Productivity Domain: Confusion Matrix')
fig.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
plt.show()

# Final Message
print("Model training, evaluation, and saving completed for the 'productivity' domain.")
