In [None]:
import json
import re
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


## **Read JSON file**

In [None]:
import json

texts = []        # comment text of each record ("" when the key is absent)
labels_raw = []   # "composite_toxic" value of each record (None when absent)

# The file is read as JSON Lines: one JSON object per line.
with open("z639_assignment1_training.json", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. stray empty line at the end of the file);
        # json.loads would raise JSONDecodeError on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        texts.append(obj.get("text", ""))
        labels_raw.append(obj.get("composite_toxic", None))

# Fail with a clear message instead of an IndexError on texts[0] below.
if not texts:
    raise ValueError("No records found in z639_assignment1_training.json")

print("First text:", texts[0])
print("First label info:", labels_raw[0])


First text: WTF, y'all never made MRE fart balloons in the stumps?

Fucking kids these days.
First label info: [[False, 74], [True, 323], [False, 1028], [False, 324], [True, 1068]]


## **Convert annotation lists into binary labels**



In [None]:
# Majority vote over the annotation entries of each record:
# label 1 only when strictly more entries vote True than False
# (ties and missing annotations fall back to 0).
labels = []

for annotations in labels_raw:
    if annotations is None:
        # default to non-toxic if missing
        labels.append(0)
        continue

    # First element of every non-empty list entry is that entry's vote.
    votes = [
        entry[0]
        for entry in annotations
        if isinstance(entry, list) and len(entry) > 0
    ]
    n_toxic = sum(1 for vote in votes if vote is True)
    n_clean = sum(1 for vote in votes if vote is False)

    labels.append(1 if n_toxic > n_clean else 0)

print("First 10 labels:", labels[:10])


First 10 labels: [0, 0, 1, 1, 0, 0, 0, 1, 1, 0]


## **Put into a DataFrame**

In [None]:
import pandas as pd
# One row per comment: the raw text next to its binary label.
df = pd.DataFrame({"text": texts, "label": labels})

preview = df.head(10)
print(preview)

label_counts = df["label"].value_counts()
print("Label distribution:", label_counts)


                                                text  label
0  WTF, y'all never made MRE fart balloons in the...      0
1  No apologies !! McCall has balls !  Ccp is not...      0
2  What ever you need to tell yourself to sleep a...      1
3                   @exZACKly @CBSNews Fuck off Nazi      1
4  Texas is a republican sponsored killing ground...      0
5  I get that calling the rapporteur a "fake job"...      0
6  Ok,  so when are people getting arrested for l...      0
7  @noon4s @AndrewJakeMIII @AP @elonmusk MAGA mor...      1
8  yeah i doubt that's false. Biden wants Drag qu...      1
9  >\t(unless signed contract exists, which is wh...      0
Label distribution: label
0    2974
1    1026
Name: count, dtype: int64


## **Split the data**

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

# Expects `df` with the columns "text" and "label" built in the previous step.
print("Total examples:", len(df))
print("Overall label distribution:", Counter(df['label']))

# Two-stage stratified split: first hold out 40%, then cut that
# hold-out in half -> 60% train / 20% validation / 20% test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.40,
    stratify=df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['label'],
    random_state=42,
)

print("Sizes -> Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))
for caption, part in (
    ("Train label dist:", train_df),
    ("Val label dist:", val_df),
    ("Test label dist:", test_df),
):
    print(caption, Counter(part['label']))


Total examples: 4000
Overall label distribution: Counter({0: 2974, 1: 1026})
Sizes -> Train: 2400 Val: 800 Test: 800
Train label dist: Counter({0: 1784, 1: 616})
Val label dist: Counter({0: 595, 1: 205})
Test label dist: Counter({0: 595, 1: 205})


## **TF-IDF + Logistic Regression baseline**

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import joblib

# 1) Build TF-IDF
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english',
)
# The vectorizer is fitted on the training split only; validation and
# test texts are transformed with that already-fitted vectorizer.
X_train = tfidf.fit_transform(train_df['text'].astype(str).values)
X_val, X_test = (
    tfidf.transform(part['text'].astype(str).values)
    for part in (val_df, test_df)
)

# 2) Train Logistic Regression (class_weight balanced to help with imbalance)
clf = LogisticRegression(
    max_iter=400,
    class_weight='balanced',
    random_state=42,
)
clf.fit(X_train, train_df['label'].values)

# 3) Evaluate on validation set
y_val = val_df['label']
val_preds = clf.predict(X_val)
print("Validation set metrics:")
print(classification_report(y_val, val_preds, digits=4))
print("Accuracy (val):", accuracy_score(y_val, val_preds))

# 4) Evaluate on test set
y_test = test_df['label']
test_preds = clf.predict(X_test)
print("\nTest set metrics:")
print(classification_report(y_test, test_preds, digits=4))
print("Accuracy (test):", accuracy_score(y_test, test_preds))

# 5) show top features for toxic class
feature_names = np.array(tfidf.get_feature_names_out())
coefs = clf.coef_[0]
ranked = np.argsort(coefs)   # feature indices, ascending by coefficient
top_pos = ranked[::-1][:20]  # 20 largest coefficients, largest first
top_neg = ranked[:20]        # 20 smallest coefficients, smallest first
print("\nTop tokens predicting TOXIC:")
print(feature_names[top_pos])
print("\nTop tokens predicting NON-TOXIC:")
print(feature_names[top_neg])

# 6) Save the vectorizer and model for later use
for artifact, filename in (
    (tfidf, 'tfidf_vectorizer.joblib'),
    (clf, 'logreg_toxic_baseline.joblib'),
):
    joblib.dump(artifact, filename)
print("\nSaved tfidf_vectorizer.joblib and logreg_toxic_baseline.joblib")


Validation set metrics:
              precision    recall  f1-score   support

           0     0.8108    0.8067    0.8088       595
           1     0.4471    0.4537    0.4504       205

    accuracy                         0.7163       800
   macro avg     0.6290    0.6302    0.6296       800
weighted avg     0.7176    0.7163    0.7169       800

Accuracy (val): 0.71625

Test set metrics:
              precision    recall  f1-score   support

           0     0.8311    0.8185    0.8247       595
           1     0.4953    0.5171    0.5060       205

    accuracy                         0.7412       800
   macro avg     0.6632    0.6678    0.6653       800
weighted avg     0.7450    0.7412    0.7430       800

Accuracy (test): 0.74125

Top tokens predicting TOXIC:
['fucking' 'stupid' 'fuck' 'morons' 'shit' 'idiot' 'trash' 'dumb' 'moron'
 'ass' 'losers' 'idiots' 'bitch' 'hate' 'bunch' 'black' 'rapist' 'little'
 'cock' 'deserves']

Top tokens predicting NON-TOXIC:
['time' 'probably' 'ru

## **BERT MODEL**

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import (BertTokenizer, BertForSequenceClassification,
                          TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Load tokenizer and model
# The same checkpoint name is used for both, so the tokenizer and the
# classifier are loaded from one pretrained source.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# num_labels=2 -> two-way classification head (labels 0 and 1 in this notebook).
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

# 2. PyTorch Dataset wrapper
class ToxicDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: pandas Series (or any object offering ``reset_index`` and
            ``.iloc``) holding the input texts.
        labels: matching Series of values convertible with ``int``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')`` that returns a mapping of tensors.
        max_length: length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Discard the original index so both series are numbered 0..n-1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor."""
        raw_text = str(self.texts.iloc[idx])
        label_id = int(self.labels.iloc[idx])

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop the leading dimension (dim 0) from every returned tensor.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(label_id, dtype=torch.long)
        return sample

# Wrap each split with the shared tokenizer, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    ToxicDataset(part['text'], part['label'], tokenizer)
    for part in (train_df, val_df, test_df)
)

# 3. Metrics function
def compute_metrics(eval_pred):
    """Score predictions given as a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last axis of
    the logits. Returns a dict with the keys ``accuracy``, ``precision``,
    ``recall`` and ``f1``; the last three are computed with
    ``zero_division=0``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    scores = {"accuracy": accuracy_score(gold, predicted)}
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[metric_name] = scorer(gold, predicted, zero_division=0)
    return scores

# 4. Training arguments
# Mixed precision is only requested when a CUDA device is present, so the
# cell also runs on a CPU-only runtime instead of failing on fp16=True.
use_fp16 = torch.cuda.is_available()

# NOTE(review): in the recorded run the Trainer prompted interactively for a
# wandb API key, which blocks "Restart & Run All". If experiment tracking is
# not wanted, consider passing report_to="none" here — confirm before changing.
training_args = TrainingArguments(
    output_dir="./bert_toxic_checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=16,   # reduce if OOM
    per_device_eval_batch_size=32,
    eval_strategy="epoch",            # evaluate on eval_dataset after every epoch
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,      # reload the checkpoint with the best metric below
    metric_for_best_model="f1",
    save_total_limit=2,
    fp16=use_fp16,                    # mixed precision for speed on GPU
    seed=42
)

# 5. Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer
# (the old keyword triggers a deprecation warning and is slated for removal).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# 6. Train
train_result = trainer.train()
trainer.save_model("./bert_toxic_model")
print("Training done.")

# 7. Evaluate on test set
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", metrics)

# 8. Sample predictions
samples = [
    "You are the worst person I've ever met.",
    "Thanks a lot, that helped me so much!",
    "I can't believe you think that, pathetic.",
    "Fantastic explanation — learned a lot."
]
enc = tokenizer(samples, truncation=True, padding=True, max_length=128, return_tensors="pt")
if torch.cuda.is_available():
    model.cuda()
# Put the model in inference mode explicitly (dropout off) rather than
# relying on whatever mode the previous step left it in.
model.eval()
with torch.no_grad():
    # Move every input tensor to the device the model lives on.
    outputs = model(**{k: v.to(model.device) for k, v in enc.items()})
    preds = outputs.logits.argmax(dim=-1).cpu().numpy()

for s, p in zip(samples, preds):
    print(f"TEXT: {s}\nPRED: {p} ({'toxic' if p==1 else 'not toxic'})\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfnfren[0m ([33mahujaar-indiana-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5293,0.435322,0.79625,0.599057,0.619512,0.609113
2,0.341,0.498196,0.795,0.622754,0.507317,0.55914
3,0.2147,0.54115,0.7825,0.583784,0.526829,0.553846


Training done.


Test metrics: {'eval_loss': 0.43805167078971863, 'eval_accuracy': 0.7825, 'eval_precision': 0.5720930232558139, 'eval_recall': 0.6, 'eval_f1': 0.5857142857142857, 'eval_runtime': 2.0769, 'eval_samples_per_second': 385.181, 'eval_steps_per_second': 12.037, 'epoch': 3.0}
TEXT: You are the worst person I've ever met.
PRED: 0 (not toxic)

TEXT: Thanks a lot, that helped me so much!
PRED: 0 (not toxic)

TEXT: I can't believe you think that, pathetic.
PRED: 1 (toxic)

TEXT: Fantastic explanation — learned a lot.
PRED: 0 (not toxic)



## **Prediction CSV**

In [None]:
import json
import pandas as pd

# 1. Load the test JSON (one JSON object per line)
# NOTE: this rebinds `texts`, which the training cells also use for the
# training comments; restart and run from the top before re-running them.
test_path = "z639_assignment1_test.json"
platform_ids = []
texts = []

with open(test_path, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines; json.loads would raise JSONDecodeError on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        platform_ids.append(obj["platform_id"])
        texts.append(obj["text"])

print("Loaded", len(platform_ids), "test comments")

# 2. Run trained model (e.g., BERT) on test texts
enc = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

if torch.cuda.is_available():
    model.cuda()
# Explicit inference mode (dropout off): do not depend on the mode the
# model was left in by an earlier cell.
model.eval()

BATCH_SIZE = 32  # number of comments per forward pass

preds = []
with torch.no_grad():
    for i in range(0, len(texts), BATCH_SIZE):
        # Slice the pre-tokenized tensors and move the slice to the model's device.
        batch = {k: v[i:i+BATCH_SIZE].to(model.device) for k, v in enc.items()}
        logits = model(**batch).logits
        batch_preds = logits.argmax(dim=-1).cpu().numpy()
        preds.extend(batch_preds)

# Every comment must have received exactly one prediction.
assert len(preds) == len(platform_ids), "prediction count does not match number of test comments"

# 3. Convert 0/1 to true/false (lowercase strings in the CSV)
preds_bool = ["true" if p==1 else "false" for p in preds]

# 4. Make submission DataFrame
submission = pd.DataFrame({
    "platform_id": platform_ids,
    "prediction": preds_bool
})

# 5. Save as CSV
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


Loaded 500 test comments
Saved submission.csv
