In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import json
from sklearn.svm import OneClassSVM

In [None]:
def load_json_to_df(file_path):
    """Read a JSON file and return its contents as a DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``pd.DataFrame`` built from the parsed JSON document.
    """
    # Decode explicitly as UTF-8: the dataset contains non-ASCII characters
    # (e.g. typographic apostrophes), and the platform default encoding is
    # not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Locations of the two dataset splits.
train_file_path = '/content/train (1).json'
test_file_path = '/content/test (1).json'

# Load both splits through the same helper so they are parsed identically.
train_df, test_df = (
    load_json_to_df(split_path)
    for split_path in (train_file_path, test_file_path)
)


In [None]:
def preprocess_data(df):
    """Build the model input text from each row's statement and evidence.

    Two columns are added:
      * ``evidence_text`` - the items of the ``evidence`` list joined with
        single spaces, or ``''`` when the value is not a list.
      * ``text`` - ``statement``, one space, then ``evidence_text``.

    The input frame is not modified; a new frame is returned, so re-running
    a cell that calls this never changes a frame displayed earlier.

    Args:
        df: DataFrame with ``statement`` and ``evidence`` columns.

    Returns:
        A new DataFrame with the two derived columns appended.
    """
    def _join_evidence(items):
        # str() each item so a stray non-string entry cannot break the join.
        return ' '.join(map(str, items)) if isinstance(items, list) else ''

    evidence_text = df['evidence'].apply(_join_evidence)
    # A missing statement would otherwise turn the whole concatenation into
    # NaN, which downstream vectorizers/tokenizers reject.
    statement = df['statement'].fillna('').astype(str)
    return df.assign(
        evidence_text=evidence_text,
        text=statement + ' ' + evidence_text,
    )


In [None]:
# Derive the 'evidence_text' and 'text' columns for both splits.
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

In [None]:
# Preview the first rows to check that the derived columns were added.
train_df.head()

Unnamed: 0,id,label,statement,reason,evidence,evidence_text,text
0,2635.json,false,Says the Annies List political group supports ...,Reason: The statement is not supported by the ...,[When we asked Bohac's campaign officials for ...,When we asked Bohac's campaign officials for e...,Says the Annies List political group supports ...
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",Reason: The statement accurately reflects Obam...,"[Among Obama's points: ""It's time for new lead...","Among Obama's points: ""It's time for new leade...","Hillary Clinton agrees with John McCain ""by vo..."
2,1123.json,false,Health care reform legislation is likely to ma...,Reason: The evidence provided clearly refutes ...,[So let’s recap. The release may have a point ...,So let’s recap. The release may have a point t...,Health care reform legislation is likely to ma...
3,12465.json,true,The Chicago Bears have had more starting quart...,Reason: The statement is accurate. According t...,"[""The Chicago Bears have had more starting qua...","""The Chicago Bears have had more starting quar...",The Chicago Bears have had more starting quart...
4,153.json,half-true,"""I'm the only person on this stage who has wor...",Reason: While it is true that Senator Barack O...,"[At a Democratic debate in Philadelphia, Sen. ...","At a Democratic debate in Philadelphia, Sen. B...","""I'm the only person on this stage who has wor..."


In [None]:
# Colab-specific: hand train_df to google.colab's InteractiveSheet for manual
# inspection. NOTE(review): this import fails outside Colab, and nothing
# later in the notebook reads `sheet`.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=train_df)

https://docs.google.com/spreadsheets/d/1fxzxvnhmYFhatYb4vu_VLLrfw0n7q2ICLs-sH3ZefOE/edit#gid=0


In [None]:
# Targets are the raw label strings of each split.
y_train = train_df['label']
y_test = test_df['label']

# TF-IDF features capped at 5000 terms. The vocabulary and IDF weights are
# learned from the training text only and then applied to the test text.
vectorizer = TfidfVectorizer(max_features=5000)
train_text, test_text = train_df['text'], test_df['text']
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [None]:
# Hold out 20% of the training rows for validation (fixed seed).
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# --- Logistic Regression (Baseline) ---
lr_model = LogisticRegression(max_iter=1000).fit(X_train_split, y_train_split)
y_pred_lr = lr_model.predict(X_val)

# Compute the metrics first, then report them.
val_accuracy = accuracy_score(y_val, y_pred_lr)
val_f1 = f1_score(y_val, y_pred_lr, average="weighted")

print("Logistic Regression Results:")
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation F1 Score: {val_f1}\n')

Logistic Regression Results:
Validation Accuracy: 0.25472636815920396
Validation F1 Score: 0.24646487595534483



In [None]:
# Reuse the 80/20 split made in the baseline cell instead of splitting
# X_train a second time. That earlier call used the same inputs, test_size
# and random_state, so the rows are identical. Rebinding from
# X_train_split / y_train_split keeps this cell idempotent: re-running it no
# longer cuts another 20% off the training data each time.
# X_val / y_val from that earlier split are used unchanged.
X_train = X_train_split
# Positional index, so boolean label masks on y_train select matching X_train rows.
y_train = y_train_split.reset_index(drop=True)
# NOTE(review): from here on X_train / y_train hold only the training portion
# of the split; later cells that fit on them do not see the validation rows.

def predict_oc_svm(X, models=None):
    """Assign each sample the label whose one-class model scores it highest.

    Args:
        X: Feature matrix accepted by each model's ``decision_function``.
        models: Optional mapping ``label -> fitted model``. When omitted, the
            notebook-level ``oc_svm_models`` dict is used, which keeps
            existing ``predict_oc_svm(X)`` calls working. Passing it
            explicitly avoids depending on which cell last populated that
            global.

    Returns:
        ``pd.Series`` holding one predicted label per sample.

    Raises:
        ValueError: If there are no models to score with.
    """
    if models is None:
        models = oc_svm_models
    if not models:
        raise ValueError("predict_oc_svm needs at least one fitted model")

    # One column of decision scores per label (rows = samples).
    scores_df = pd.DataFrame({
        label: model.decision_function(X)
        for label, model in models.items()
    })

    # For each sample, the column (label) holding the largest score wins.
    return scores_df.idxmax(axis=1)

In [None]:
# Fit one One-Class SVM per label, each on that label's training rows only.
unique_labels = y_train.unique()
oc_svm_models = {}

for label in unique_labels:
    # y_train was given a fresh 0..n-1 index in the previous cell, so its
    # index values can be used directly as row positions in X_train.
    class_indices = y_train.index[(y_train == label).to_numpy()]
    oc_svm_models[label] = OneClassSVM(
        kernel='rbf', gamma='scale', nu=0.1
    ).fit(X_train[class_indices])

# Score the validation rows against every per-label model.
y_pred_ocsvm = predict_oc_svm(X_val)

# Compute the metrics, then report them.
ocsvm_val_accuracy = accuracy_score(y_val, y_pred_ocsvm)
ocsvm_val_f1 = f1_score(y_val, y_pred_ocsvm, average="weighted")

print("One-Class SVM Results:")
print(f'Validation Accuracy: {ocsvm_val_accuracy}')
print(f'Validation F1 Score: {ocsvm_val_f1}')


One-Class SVM Results:
Validation Accuracy: 0.21393034825870647
Validation F1 Score: 0.20352421455523767


In [None]:
# Refit the baseline and score it on the test split.
# NOTE(review): an earlier cell rebinds X_train / y_train to the training
# portion of the validation split, so this fit does not include the
# validation rows - confirm that is intended for the final test run.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_lr)
test_f1 = f1_score(y_test, y_pred_lr, average="weighted")
test_confusion = confusion_matrix(y_test, y_pred_lr)
test_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression Results:")
print(f'Test Accuracy: {test_accuracy}')
print(f'Test F1 Score: {test_f1}')
print('Confusion Matrix:\n', test_confusion)
print('Classification Report:\n', test_report)

Logistic Regression Results:
Test Accuracy: 0.2938425565081839
Test F1 Score: 0.2769979556917553
Confusion Matrix:
 [[ 33  65  57  41   0  18]
 [ 15 116  49  41   1  28]
 [ 23  66  94  46   0  38]
 [ 17  39  75  59   1  58]
 [ 11  40  14  16   2   9]
 [ 11  43  45  38   1  73]]
Classification Report:
               precision    recall  f1-score   support

 barely-true       0.30      0.15      0.20       214
       false       0.31      0.46      0.37       250
   half-true       0.28      0.35      0.31       267
 mostly-true       0.24      0.24      0.24       249
  pants-fire       0.40      0.02      0.04        92
        true       0.33      0.35      0.34       211

    accuracy                           0.29      1283
   macro avg       0.31      0.26      0.25      1283
weighted avg       0.30      0.29      0.28      1283



In [None]:
# Rebuild the per-label One-Class SVMs from the current X_train / y_train.
oc_svm_models = {}
for label in y_train.unique():
    # Index values of the rows carrying this label, used to slice X_train.
    class_indices = y_train.index[(y_train == label).to_numpy()]
    oc_svm_models[label] = OneClassSVM(kernel='rbf', nu=0.1).fit(
        X_train[class_indices]
    )

# Prediction function (fixed)
def predict_ocsvm(X, models=None):
    """Pick, for every sample, the label whose one-class model scores it highest.

    Args:
        X: Feature matrix accepted by each model's ``decision_function``.
        models: Optional mapping ``label -> fitted model``. Defaults to the
            notebook-level ``oc_svm_models`` dict so existing
            ``predict_ocsvm(X)`` calls keep working. Passing it explicitly
            removes the dependency on which cell last filled that global.

    Returns:
        ``pd.Series`` with one predicted label per sample.

    Raises:
        ValueError: If there are no models to score with.
    """
    if models is None:
        models = oc_svm_models
    if not models:
        raise ValueError("predict_ocsvm needs at least one fitted model")
    # Columns are labels, rows are samples; idxmax returns the winning column.
    scores = pd.DataFrame({label: model.decision_function(X)
                          for label, model in models.items()})
    return scores.idxmax(axis=1)

# Score the test split with the per-label models and report the metrics.
y_pred_ocsvm = predict_ocsvm(X_test)

ocsvm_test_accuracy = accuracy_score(y_test, y_pred_ocsvm)
ocsvm_test_f1 = f1_score(y_test, y_pred_ocsvm, average="weighted")
ocsvm_test_confusion = confusion_matrix(y_test, y_pred_ocsvm)
ocsvm_test_report = classification_report(y_test, y_pred_ocsvm)

print("\nOne-Class SVM Results:")
print(f'Test Accuracy: {ocsvm_test_accuracy}')
print(f'Test F1 Score: {ocsvm_test_f1}')
print('Confusion Matrix:\n', ocsvm_test_confusion)
print('Classification Report:\n', ocsvm_test_report)


One-Class SVM Results:
Test Accuracy: 0.2244738893219018
Test F1 Score: 0.19982543010213663
Confusion Matrix:
 [[22 62 63 55  7  5]
 [18 82 66 64  9 11]
 [13 69 86 80  6 13]
 [12 57 81 87  3  9]
 [ 7 44 19 18  2  2]
 [ 9 48 62 79  4  9]]
Classification Report:
               precision    recall  f1-score   support

 barely-true       0.27      0.10      0.15       214
       false       0.23      0.33      0.27       250
   half-true       0.23      0.32      0.27       267
 mostly-true       0.23      0.35      0.28       249
  pants-fire       0.06      0.02      0.03        92
        true       0.18      0.04      0.07       211

    accuracy                           0.22      1283
   macro avg       0.20      0.19      0.18      1283
weighted avg       0.22      0.22      0.20      1283



In [None]:
# ----------------------------
# 1. Install Dependencies
# ----------------------------
!pip install sentence-transformers transformers datasets evaluate accelerate -Uqq




In [None]:
# ----------------------------
# 2. Load and Prepare Data
# ----------------------------
import pandas as pd
import json
from sklearn.model_selection import train_test_split

def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return it as a DataFrame.

    The file is decoded as UTF-8 explicitly, because the platform default
    encoding is not guaranteed to be UTF-8 and the dataset contains
    non-ASCII characters.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))

# Reload both splits from disk so this section starts from the raw data.
train_df, test_df = (
    load_json(split_path)
    for split_path in ('/content/train (1).json', '/content/test (1).json')
)

def preprocess_data(df):
    """Build the model input text, separating statement and evidence with ' [SEP] '.

    Two columns are added:
      * ``evidence_text`` - the items of the ``evidence`` list joined with
        single spaces, or ``''`` when the value is not a list.
      * ``text`` - ``statement``, the literal ``' [SEP] '``, then
        ``evidence_text``.

    The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with ``statement`` and ``evidence`` columns.

    Returns:
        A new DataFrame with the two derived columns appended.
    """
    def _join_evidence(items):
        # str() each item so a stray non-string entry cannot break the join.
        return ' '.join(map(str, items)) if isinstance(items, list) else ''

    evidence_text = df['evidence'].apply(_join_evidence)
    # A missing statement would otherwise make the concatenated text NaN,
    # which the encoder/tokenizer cannot consume.
    statement = df['statement'].fillna('').astype(str)
    return df.assign(
        evidence_text=evidence_text,
        text=statement + ' [SEP] ' + evidence_text,  # Better separation
    )

# Add the ' [SEP] '-separated 'text' column to both splits.
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

# ----------------------------
# 3. Approach 1: Sentence Transformers + Logistic Regression
# ----------------------------
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Generate embeddings (plain lists of strings are passed to encode rather
# than pandas Series).
# NOTE(review): `model` is rebound to the BERT classifier further down;
# consider a distinct name for the sentence encoder.
model = SentenceTransformer('all-mpnet-base-v2')
X_train = model.encode(train_df['text'].tolist(), show_progress_bar=True)
X_test = model.encode(test_df['text'].tolist(), show_progress_bar=True)
y_train = train_df['label']
y_test = test_df['label']

# Class weights: inverse class frequency, n_samples / (n_classes * count),
# i.e. the same formula as scikit-learn's class_weight='balanced'.
# The previous value_counts(normalize=True) weights were proportional to
# frequency, which up-weighted the majority classes - the opposite of the
# intended re-balancing.
label_counts = train_df['label'].value_counts()
class_weights = (
    label_counts.sum() / (len(label_counts) * label_counts)
).to_dict()
lr_model = LogisticRegression(max_iter=1000, class_weight=class_weights)
lr_model.fit(X_train, y_train)

# Evaluate
y_pred = lr_model.predict(X_test)
print("Sentence Transformer + Logistic Regression Results:")
print(classification_report(y_test, y_pred))

# ----------------------------
# 4. Approach 2: Fine-tune BERT
# ----------------------------
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Convert to HuggingFace dataset
# Both splits are encoded with the label categories taken from the TRAINING
# labels, so a given integer id means the same class in train and test.
# Encoding each split independently would silently shift the ids if the
# test split is missing a class.
label_categories = train_df['label'].astype('category').cat.categories
train_label_ids = pd.Categorical(train_df['label'], categories=label_categories).codes
test_label_ids = pd.Categorical(test_df['label'], categories=label_categories).codes
if (test_label_ids < 0).any():
    # pandas encodes values outside the given categories (or missing) as -1.
    raise ValueError(
        "test_df has missing labels or labels that never occur in train_df"
    )

train_dataset = Dataset.from_dict({
    'text': train_df['text'].tolist(),
    'label': train_label_ids.tolist()
})
test_dataset = Dataset.from_dict({
    'text': test_df['text'].tolist(),
    'label': test_label_ids.tolist()
})

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with truncation and 'max_length' padding."""
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')

# Apply the tokenizer to both splits in batched mode.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Model setup
# label name -> integer id, in the sorted category order of the training labels.
label2id = {label: idx for idx, label in enumerate(train_df['label'].astype('category').cat.categories)}
id2label = {idx: label for label, idx in label2id.items()}
# Pass BOTH directions of the mapping so the model config's id2label and
# label2id agree with each other (previously only id2label was supplied).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. The install cell upgrades to the latest release, so use
# whichever keyword the installed TrainingArguments actually accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    **{_eval_strategy_arg: 'epoch'},  # evaluate once per epoch
)

# Trainer with class weights
from torch import nn
import numpy as np

class WeightedTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    Weights are read from the notebook-level ``class_weights`` dict
    (label name -> weight) and matched to class ids through
    ``model.config.id2label``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy for one batch.

        Args:
            model: The sequence-classification model being trained.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted (and ignored) because newer Trainer
                versions pass it to ``compute_loss``; without the parameter
                those versions fail with a TypeError.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Only `torch.nn` is imported at notebook level, so bring in `torch`
        # here; the original referenced it without importing it.
        import torch

        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        num_labels = model.config.num_labels
        id2label = model.config.id2label
        # Order the weights by class id. The dict's own order follows
        # value_counts() (by frequency), which does not match the ids.
        # Create the tensor on the logits' device to avoid a CPU/GPU mismatch.
        weight = torch.tensor(
            [class_weights[id2label[class_id]] for class_id in range(num_labels)],
            dtype=torch.float,
            device=logits.device,
        )
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Build the trainer around the BERT classifier and the tokenized splits.
# NOTE(review): eval_dataset is the test split, so the evaluation configured
# in training_args runs on the same data used for the final report below;
# consider a separate validation split.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train and evaluate
trainer.train()
predictions = trainer.predict(test_dataset)

# Predicted class id = position of the largest logit in each row.
bert_logits = predictions.predictions
y_pred_bert = np.argmax(bert_logits, axis=1)

bert_report = classification_report(
    test_dataset['label'],
    y_pred_bert,
    target_names=label2id.keys(),
)
print("\nFine-tuned BERT Results:")
print(bert_report)

# ----------------------------
# 5. Visualization
# ----------------------------
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Side-by-side row-normalized confusion matrices for the two approaches.
fig, ax = plt.subplots(1, 2, figsize=(20,8))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax[0], normalize='true')
ax[0].set_title('Sentence Transformer + LR')
# The BERT labels are integer ids. Pass the id order and the matching class
# names so this panel is labelled with names like the left one, not with
# bare integers.
ConfusionMatrixDisplay.from_predictions(
    test_dataset['label'],
    y_pred_bert,
    labels=list(label2id.values()),
    display_labels=list(label2id),
    ax=ax[1],
    normalize='true',
)
ax[1].set_title('Fine-tuned BERT')
plt.show()