# Google Colab-compatible script to preprocess comment data and fine-tune BERT MLM.

### Summary:
- Mounts Google Drive
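- Cleans and filters the first comment of every JSON file in a Drive directory of comment threads
- Splits into train/test by holding out the last `TEST_SAMPLE_SIZE` cleaned comments as the test set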
- Saves test data
- Fine-tunes a BERT masked language model

In [None]:
!pip install transformers
!pip install scikit-learn

In [None]:
import os
import json
import re
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset can be read from Drive
# and the test samples / fine-tuned model can be written back to it.
drive.mount('/content/drive')

In [None]:
# Module-level configuration and state shared by the functions below
TEST_SAMPLE_SIZE = 500  # How many cleaned samples are held out as test data
count = 0  # Running total of JSON files handled by summarize_ticket
sentencelist = []  # Cleaned comment texts collected during preprocessing

# Tokenizer for the bert-base-uncased checkpoint, reused for filtering and training
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sentence boundary: terminal punctuation followed by a space, or a newline.
_SENTENCE_SPLIT_RE = re.compile(r'[.!?] |\n')
# A sentence containing any of these is dropped entirely: angle brackets or
# braces, two digits followed by ':', "<digits>x<digits>", '::', ':' followed
# by a digit, or '@@'.
_REJECT_RE = re.compile(r"[<>{}]|[0-9][0-9]:|[0-9]+x[0-9]+|::|:[0-9]|@@")
# Fragments removed from a kept sentence: a leading prefix that ends in '/'
# plus the slash-free run after it up to a space, '==', '--', and '@name'
# mentions.
_STRIP_RE = re.compile(r"^(.*/)([^/]*) |==|--|@[A-Za-z0-9]+")
# Punctuation that already terminates a sentence; no '.' is appended after it.
_TERMINATORS = ('.', '!', '?')


def clean_data(text):
    """
    Cleans input text sentence by sentence.

    - Splits on sentence boundaries and newlines
    - Drops sentences containing unwanted patterns
    - Strips path prefixes, '==', '--' and @mentions from the remaining ones
    - Keeps only sentences with 9..512 characters and at most 512 tokens
    - Adds a final '.' where a sentence has no terminal punctuation

    Args:
        text: Raw comment text.

    Returns:
        The kept sentences joined by single spaces, or "" if none survive.
    """
    kept = []
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        if _REJECT_RE.search(sentence):
            continue
        sentence = _STRIP_RE.sub('', sentence)
        # Cheap character-length check first so the tokenizer only runs on
        # sentences that can still be accepted.
        if not 8 < len(sentence) <= 512:
            continue
        if len(tokenizer.tokenize(sentence)) > 512:
            continue
        if not sentence.endswith(_TERMINATORS):
            sentence += '.'
        kept.append(sentence)
    # Joining with a space guarantees a separator after every sentence,
    # including those that already ended in '.', so none get glued together.
    return ' '.join(kept)

def summarize_ticket(json_file):
    """
    Processes a single JSON file:
    - Extracts the first comment's 'raw_text'
    - Removes URLs and cleans the comment
    - Appends the result to the module-level sentencelist if it is non-empty

    Every file that parses as JSON increments the module-level count, even
    when it contributes no sample. Files with no comments, or whose first
    comment has no string 'raw_text', are skipped instead of aborting the run.

    Args:
        json_file: Open text file object containing one JSON document.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    global count
    data = json.load(json_file)
    count += 1

    # Walk down to data['comments'][0]['raw_text'] defensively: one malformed
    # ticket should not stop preprocessing of the whole directory.
    comments = data.get('comments') if isinstance(data, dict) else None
    first = comments[0] if isinstance(comments, list) and comments else None
    raw_text = first.get('raw_text') if isinstance(first, dict) else None
    if not isinstance(raw_text, str):
        return

    without_urls = re.sub(r"https?://[^\s]+", "", raw_text)
    cleaned = clean_data(without_urls)
    if cleaned:
        sentencelist.append(cleaned)
def _mask_tokens(input_ids, attention_mask, mask_token_id, vocab_size,
                 mlm_probability=0.15):
    """
    Builds the inputs and labels for one masked-language-modelling step.

    Each non-padding position is selected with probability mlm_probability.
    Of the selected positions, about 80% are replaced by the mask token,
    about 10% by a random vocabulary id, and the rest are left unchanged.
    Labels are -100 everywhere except the selected positions, so only those
    contribute to the loss.

    Args:
        input_ids: Integer tensor of token ids.
        attention_mask: Tensor of the same shape, non-zero on real tokens.
        mask_token_id: Id of the tokenizer's mask token.
        vocab_size: Exclusive upper bound for random replacement ids.
        mlm_probability: Per-token selection probability.

    Returns:
        Tuple (masked_inputs, labels) of tensors shaped like input_ids.
    """
    real_tokens = attention_mask.bool()
    selection_probs = torch.full(
        input_ids.shape, mlm_probability, device=input_ids.device)
    selected = torch.bernoulli(selection_probs).bool() & real_tokens

    labels = input_ids.clone()
    labels[~selected] = -100

    masked_inputs = input_ids.clone()
    roll = torch.rand(input_ids.shape, device=input_ids.device)
    masked_inputs[selected & (roll < 0.8)] = mask_token_id
    use_random = selected & (roll >= 0.8) & (roll < 0.9)
    random_ids = torch.randint(
        vocab_size, input_ids.shape,
        dtype=input_ids.dtype, device=input_ids.device)
    masked_inputs[use_random] = random_ids[use_random]
    return masked_inputs, labels


def main():
    """
    Main runner:
    - Loads and preprocesses every JSON file in the comments directory
    - Holds out the last TEST_SAMPLE_SIZE cleaned samples and saves them
    - Fine-tunes BERT on the remaining samples with a masked-LM objective
    - Saves the fine-tuned model to Drive

    Raises:
        ValueError: If preprocessing leaves no samples for training.
    """
    # Path config for Colab
    base_dir = "/content/drive/MyDrive/bert_project"
    json_dir = os.path.join(base_dir, "comments")
    test_dir = os.path.join(base_dir, "test_json")
    os.makedirs(test_dir, exist_ok=True)

    # Preprocess all files. Sorting makes the train/test split reproducible,
    # and non-file entries (e.g. checkpoint folders) are skipped because
    # open() would fail on them.
    for file in sorted(os.listdir(json_dir)):
        path = os.path.join(json_dir, file)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as json_file:
            summarize_ticket(json_file)
    print("Total files processed:", count)

    # Train-test split: the last TEST_SAMPLE_SIZE samples become test data.
    # An explicit index is used because slicing with -0 would put everything
    # in the test set.
    n_test = min(TEST_SAMPLE_SIZE, len(sentencelist))
    split_at = len(sentencelist) - n_test
    train_data, test_data = sentencelist[:split_at], sentencelist[split_at:]
    if not train_data:
        raise ValueError(
            f"Need more than {TEST_SAMPLE_SIZE} cleaned samples to leave data "
            f"for training, found {len(sentencelist)}")

    # Save test samples
    for i, item in enumerate(test_data):
        with open(os.path.join(test_dir, f"test_sample_{i}.json"), "w", encoding="utf-8") as f:
            json.dump({"text": item}, f, ensure_ascii=False, indent=2)

    # Load model
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.train()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training setup
    dataloader = DataLoader(train_data, batch_size=1, shuffle=True)
    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; kept because correct_bias=False is not an option of
    # the torch optimizer. Confirm before switching.
    optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
    max_len = 82
    max_norm = 1.0
    epochs = 10

    # Training loop
    for epoch in range(epochs):
        epoch_loss = 0.0
        steps = 0
        for batch in dataloader:
            encoded = tokenizer(
                list(batch),
                add_special_tokens=False,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt')
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)

            # Hide a random subset of real tokens; with unmasked inputs as
            # their own labels the model would only learn to copy its input.
            masked_inputs, labels = _mask_tokens(
                input_ids, attention_mask,
                tokenizer.mask_token_id, model.config.vocab_size)
            # With short sequences it is possible that nothing was selected;
            # the loss would then be undefined, so skip the batch.
            if not (labels != -100).any():
                continue

            outputs = model(
                masked_inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]  # loss is the first output when labels are given
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()
            steps += 1
        # Report the mean over the epoch rather than the last batch only.
        mean_loss = epoch_loss / steps if steps else float('nan')
        print(f"Epoch {epoch + 1} completed with loss {mean_loss:.4f}")

    # Save trained model to Drive
    model.save_pretrained(os.path.join(base_dir, "Fine_Tuned_BertForMaskedLM"))

# Run the full pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()