In [3]:
pip install tensorflow==2.11 keras==2.11 transformers==4.27.4 nltk seaborn scikit-learn nltk emoji numpy pandas matplotlib

Collecting tensorflow==2.11
  Downloading tensorflow-2.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting keras==2.11
  Downloading keras-2.11.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting transformers==4.27.4
  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.11)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<3.20,>=3.9.2 (from tensorflow==2.11)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (787 bytes)
Collecting tensorboard<2.12,>=2.11 (from tensorflow==2.11)
  Downloading tensorboard-2.11.2-py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0 (from tensorflow=

In [1]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
import nltk

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [3]:
from google.colab import drive
import os
drive.mount('/content/drive')
path_to_file = '/content/drive/MyDrive/Colab Notebooks/AWARE/'
os.chdir(path_to_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import pickle  # For saving models in Pickle format

# Set random seeds for reproducibility
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Output directory for saving plots and model
output_dir = 'acd_models/AWARE_GamesResults'
os.makedirs(output_dir, exist_ok=True)

# Load your dataset and filter for 'games' domain
print("Loading dataset...")
df = pd.read_csv("AWARE_Comprehensive.csv")
df = df[df['domain'] == 'games']
print(f"Dataset loaded: {len(df)} samples in 'games' domain.")

# Encode categorical labels
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['category'])

# Initialize the BERT tokenizer and set max sequence length
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256

# Initialize the BERT-based model
num_labels = len(df['encoded_label'].unique())
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define NLTK functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = emoji.demojize(text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

def preprocess_data(df, tokenizer, max_length=256):
    texts = df['sentence'].apply(clean_text).tolist()
    labels = df['encoded_label'].tolist()
    tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors='tf', max_length=max_length)
    return tokenized, labels

# Define stratified K-fold cross-validation
n_splits = 10
all_test_labels = []
all_predicted_labels = []
all_train_loss = []
all_train_accuracy = []
all_val_loss = []
all_val_accuracy = []

for fold, (train_indices, val_indices) in enumerate(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42).split(df['sentence'], df['encoded_label'])):
    print(f"Fold {fold + 1}...")
    fold_train_df = df.iloc[train_indices]
    fold_val_df = df.iloc[val_indices]
    train_df, val_df = train_test_split(fold_train_df, test_size=0.1, random_state=42)

    train_data, train_labels = preprocess_data(train_df, tokenizer, max_length)
    val_data, val_labels = preprocess_data(val_df, tokenizer, max_length)
    test_data, test_labels = preprocess_data(fold_val_df, tokenizer, max_length)

    train_data = {key: np.array(val) for key, val in train_data.items()}
    val_data = {key: np.array(val) for key, val in val_data.items()}
    test_data = {key: np.array(val) for key, val in test_data.items()}

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint(os.path.join(output_dir, f'best_model_fold_{fold}'), save_best_only=True)

    model.fit(
        train_data, np.array(train_labels),
        validation_data=(val_data, np.array(val_labels)),
        epochs=16, batch_size=8,
        callbacks=[early_stopping, model_checkpoint],
        verbose=2
    )

    test_predictions = model.predict(test_data)['logits']
    test_predicted_labels = np.argmax(test_predictions, axis=1)
    all_test_labels.extend(test_labels)
    all_predicted_labels.extend(test_predicted_labels)

# Print classification report
class_report = classification_report(all_test_labels, all_predicted_labels, target_names=label_encoder.classes_)
print("AWARE_Games: Classification Report Across Folds:\n", class_report)

# Save the trained model in .h5 format
h5_path = os.path.join(output_dir, "bert_model_weights.h5")
model.save_weights(h5_path)
print(f"Model weights saved as .h5 at: {h5_path}")

# Save the entire model using Pickle
pickle_path = os.path.join(output_dir, "bert_model.pkl")
with open(pickle_path, "wb") as f:
    pickle.dump(model, f)
print(f"Model saved as Pickle at: {pickle_path}")

# Save the model in HuggingFace format
model.save_pretrained(output_dir)
print(f"Model saved in HuggingFace format at: {output_dir}")

# Calculate the confusion matrix
conf_mat = confusion_matrix(all_test_labels, all_predicted_labels)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Games Domain: Confusion Matrix')
plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
plt.show()

# Final Message
print("Model training, evaluation, and saving completed for the 'games' domain.")


Loading dataset...
Dataset loaded: 4080 samples in 'games' domain.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1...
Epoch 1/16
