In [3]:
import pandas as pd

# --- Main corpus: Fake.csv is labelled 1, True.csv is labelled 0 -------------
df_fake = pd.read_csv("data/Fake.csv").assign(label=1)
df_real = pd.read_csv("data/True.csv").assign(label=0)
df_main = (
    pd.concat([df_fake, df_real], ignore_index=True)
    .loc[:, ["text", "label"]]
    .dropna()
)

# --- ISOT files from the fine-tune folder, same labelling convention ---------
df_fake2 = pd.read_csv("data/fine_tune_data/ISOT_Fake.csv").assign(label=1)
df_real2 = pd.read_csv("data/fine_tune_data/ISOT_True.csv").assign(label=0)
df_isot = (
    pd.concat([df_fake2, df_real2], ignore_index=True)
    .loc[:, ["text", "label"]]
    .dropna()
)

# --- LIAR training split: headerless TSV, so column names are set by hand ----
df_liar = pd.read_csv("data/liar/train.tsv", sep='\t', header=None)
df_liar.columns = [
    "id", "label", "statement", "subject", "speaker", "job_title", "state_info",
    "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context"
]

# The six LIAR ratings are collapsed to binary: the "false" group becomes 1,
# the "true" group becomes 0; rows with any other rating are dropped.
true_labels = ["true", "half-true", "mostly-true"]
false_labels = ["false", "barely-true", "pants-fire"]
df_liar = df_liar[df_liar["label"].isin(true_labels + false_labels)].copy()
df_liar["label"] = [1 if rating in false_labels else 0 for rating in df_liar["label"]]
df_liar = (
    df_liar[["statement", "label"]]
    .rename(columns={"statement": "text"})
    .dropna()
)


In [4]:
import re

# Map every supported character to a positive integer id (1-based), so that
# id 0 is never produced by the vocabulary itself.
char_vocab = {
    ch: position
    for position, ch in enumerate(
        "abcdefghijklmnopqrstuvwxyz0123456789 .,;!?-–()[]{}'\"", start=1
    )
}
# One extra slot on top of the mapped characters accounts for id 0.
vocab_size = 1 + len(char_vocab)
# Fixed length (in characters) that every encoded sequence is cut/padded to.
MAX_LEN = 1014

def clean_text(text):
    """Lowercase ``text`` and remove every character outside the model alphabet.

    Kept characters: ``a-z``, ``0-9``, space and the punctuation
    ``. , ; ! ? ( ) [ ] { } ' " -`` plus the en dash (U+2013).

    Args:
        text: Input string.

    Returns:
        The lowercased string with all other characters deleted.
    """
    text = text.lower()
    # The brackets are escaped with a SINGLE backslash. In a raw string a doubled
    # backslash followed by "]" is an escaped backslash and then a class-closing
    # "]", which ends the character class early and makes the pattern match
    # (almost) nothing, so no filtering would take place.
    # \u2013 (en dash) is kept as well; the trailing "-" is a literal hyphen.
    text = re.sub(r'[^a-z0-9 .,;!?()\[\]{}\'"\u2013-]', '', text)
    return text

def text_to_sequence(text, max_len=MAX_LEN):
    """Encode ``text`` as a list of exactly ``max_len`` integer character ids.

    The text is cleaned with ``clean_text`` and truncated to ``max_len``
    characters. Each character is looked up in ``char_vocab``; characters that
    are not in the vocabulary become 0. Shorter inputs are right-padded with 0.
    """
    truncated = clean_text(text)[:max_len]
    ids = [char_vocab.get(ch, 0) for ch in truncated]
    # Right-pad with zeros up to the requested length.
    ids.extend([0] * (max_len - len(ids)))
    return ids


In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# NOTE(review): `dataset` is not assigned in any visible cell above this one;
# it presumably comes from a cell that is no longer in the notebook - confirm.

# Encode every document as a fixed-length list of character ids.
sequences = [text_to_sequence(doc) for doc in dataset["text"]]
labels = dataset["label"].to_list()

# 90/10 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    labels,
    test_size=0.1,
    random_state=42,
)

# Integer (long) tensors for both the inputs and the targets.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(part, dtype=torch.long)
    for part in (X_train, y_train, X_test, y_test)
)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)


In [9]:
import torch.nn as nn
import torch.nn.functional as F

class CharCNN(nn.Module):
    """Character-level CNN classifier.

    Layout: embedding -> conv(k=7) + pool -> conv(k=7) + pool ->
    3 x conv(k=3) + pool -> three fully connected layers with dropout.

    The convolutions are unpadded. With the default ``max_len`` of 1014 the
    convolutional stack therefore emits 34 time steps, which is what the
    ``256 * 34`` input width of ``fc1`` requires. (With length-preserving
    padding the stack emits 37 steps for 1014 characters and ``fc1`` fails
    with a shape mismatch.) Parameter shapes are unaffected by padding, so
    state dicts saved with a ``256 * 34`` ``fc1`` still load.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each character embedding.
        num_classes: Number of output logits.
        max_len: Sequence length the model will be fed; it determines the
            input width of ``fc1``.

    Raises:
        ValueError: If ``max_len`` is too short to survive the conv/pool stack.
    """

    def __init__(self, vocab_size, embed_dim=128, num_classes=2, max_len=1014):
        super().__init__()

        feature_len = self._feature_length(max_len)
        if feature_len < 1:
            raise ValueError(
                f"max_len={max_len} is too short for the conv/pool stack"
            )
        self.max_len = max_len

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.conv1 = nn.Conv1d(embed_dim, 256, kernel_size=7)
        self.pool1 = nn.MaxPool1d(3)

        self.conv2 = nn.Conv1d(256, 256, kernel_size=7)
        self.pool2 = nn.MaxPool1d(3)

        self.conv3 = nn.Conv1d(256, 256, kernel_size=3)
        self.conv4 = nn.Conv1d(256, 256, kernel_size=3)
        self.conv5 = nn.Conv1d(256, 256, kernel_size=3)
        self.pool3 = nn.MaxPool1d(3)

        # 256 channels x remaining time steps (34 for max_len=1014).
        self.fc1 = nn.Linear(256 * feature_len, 1024)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, 1024)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(1024, num_classes)

    @staticmethod
    def _feature_length(seq_len):
        """Return the number of time steps left after the conv/pool stack."""
        length = seq_len
        # Each stage removes 6 steps through its unpadded convolutions
        # (one k=7 conv, or three k=3 convs) and then MaxPool1d(3) floors
        # the length to a third.
        for _ in range(3):
            length = (length - 6) // 3
        return length

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of character ids to class logits."""
        x = self.embedding(x)      # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)     # Conv1d expects (batch, channels, seq_len)

        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool3(F.relu(self.conv5(x)))

        x = x.view(x.size(0), -1)
        if x.size(1) != self.fc1.in_features:
            # Fail with an actionable message instead of a matmul shape error.
            raise ValueError(
                f"flattened feature size {x.size(1)} does not match fc1 input "
                f"size {self.fc1.in_features}; the model was built for "
                f"sequences of length {self.max_len}"
            )
        x = self.dropout1(F.relu(self.fc1(x)))
        x = self.dropout2(F.relu(self.fc2(x)))
        return self.fc3(x)


In [None]:
# Persist only the learned parameters (the state dict) of the base model.
# NOTE(review): no visible cell above this one creates `model`; on a fresh
# kernel this cell depends on state from a cell that is not in the notebook.
torch.save(obj=model.state_dict(), f="base_model.pt")
# The message reads: "Initial training finished, model saved."
print("✅ İlk eğitim tamamlandı, model kaydedildi.")

In [11]:
import wandb

# Start a W&B run and record the hyperparameters used below.
wandb.init(project="charcnn-finetune", name="charcnn-run-isot", config={
    "vocab_size": vocab_size,
    "max_len": MAX_LEN,
    "batch_size": 64,
    "epochs": 5,
    "learning_rate": 0.001
})

# Handle to the active run's config; read below and by the training cell.
config = wandb.config

# The model is pinned to the CPU, so the checkpoint is mapped onto the CPU as
# well: without map_location, a checkpoint saved from CUDA tensors cannot be
# deserialized on a machine without a GPU.
# NOTE(review): "base_model.pt" is written by the preceding save cell, which
# itself needs an already-trained `model` - confirm the intended cell order.
model = CharCNN(vocab_size=config.vocab_size).to("cpu")
model.load_state_dict(torch.load("base_model.pt", map_location="cpu"))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

def train_model(model, loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``loader`` and log to W&B.

    Args:
        model: Network to optimise; batches are moved to the device its
            parameters live on.
        loader: Iterable of ``(inputs, labels)`` tensor batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of passes over ``loader``.

    Raises:
        ValueError: If ``loader`` yields no samples.

    After every epoch the mean batch loss, the accuracy and the learning rate
    are sent to ``wandb.log`` and printed.
    """
    # Follow the model instead of a global: batches must be on the same device
    # as the parameters, wherever the caller placed the model.
    model_device = next(model.parameters()).device

    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0.0, 0, 0

        for inputs, labels in loader:
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        if total == 0:
            raise ValueError("train_model: loader produced no samples")

        acc = correct / total
        avg_loss = total_loss / len(loader)

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_loss,
            "train_accuracy": acc,
            # Report the rate the optimizer really uses (first parameter
            # group) rather than a notebook-level config that may be stale.
            "learning_rate": optimizer.param_groups[0]["lr"]
        })

        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={acc:.4f}")

[34m[1mwandb[0m: Currently logged in as: [33mdogaozyagci[0m ([33mveyselbayrakci-isik-universitesi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [None]:
# Train for the number of epochs recorded in the W&B run config.
train_model(
    model=model,
    loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=config.epochs,
)

In [None]:
def evaluate_model(model, loader):
    """Evaluate ``model`` on ``loader``; print and log mean loss and accuracy.

    Args:
        model: Network to evaluate; batches are moved to the device its
            parameters live on.
        loader: Iterable of ``(inputs, labels)`` tensor batches.

    The loss is computed with the notebook-level ``criterion``. Results are
    printed and sent to ``wandb.log`` as ``test_loss`` / ``test_accuracy``.
    """
    # Derive the device from the model itself so the batches always match the
    # parameters, independent of any notebook-level `device` variable.
    model_device = next(model.parameters()).device

    model.eval()
    correct, total = 0, 0
    total_loss = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    avg_loss = total_loss / len(loader)
    print(f"🔍 Test Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")

    wandb.log({
        "test_loss": avg_loss,
        "test_accuracy": acc
    })

In [None]:
# Score the current model on the held-out split.
evaluate_model(model=model, loader=test_loader)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Gather predictions and ground-truth labels over the whole test loader.
all_preds = []
all_labels = []

# Run batches on the device the model actually lives on, so this cell does not
# depend on a separately defined `device` variable.
model_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
    # Loop names are kept distinct from the notebook-level `labels` list so the
    # loop does not overwrite it.
    for batch_inputs, batch_labels in test_loader:
        outputs = model(batch_inputs.to(model_device))
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch_labels.cpu().numpy())

cm = confusion_matrix(all_labels, all_preds)
print(classification_report(all_labels, all_preds))

# Heatmap of the confusion matrix: rows are true classes, columns predictions.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Switch the working dataset to a copy of the ISOT frame; the loaders built
# below replace the ones from the earlier data-preparation cell.
dataset = df_isot.copy()

# Fixed-length character-id encoding of every document.
sequences = [text_to_sequence(doc) for doc in dataset["text"]]
labels = dataset["label"].to_list()

# Same 90/10 split and seed as the earlier data-preparation cell.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    labels,
    test_size=0.1,
    random_state=42,
)

X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(part, dtype=torch.long)
    for part in (X_train, y_train, X_test, y_test)
)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Shuffle the training batches only.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [None]:
# Start a new W&B run for the ISOT experiment.
wandb.init(project="charcnn-finetune", name="charcnn-run-isot", config={
    "vocab_size": vocab_size,
    "max_len": MAX_LEN,
    "batch_size": 64,
    "epochs": 5,
    "learning_rate": 0.0001
})

# Re-bind `config` to the run that was just started; otherwise the name keeps
# pointing at the previous run's settings.
config = wandb.config

# NOTE(review): `device` is not assigned in any visible cell - confirm it is
# defined before this cell runs.
# NOTE(review): the model is created from scratch here, so the weights in
# "base_model.pt" are not carried over. If this run is meant to fine-tune the
# base model, load them first - confirm the intent.
model = CharCNN(vocab_size=vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
# Take the learning rate and epoch count from the run config so that what is
# logged to W&B is what is actually used (the config declares 0.0001).
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

train_model(model, train_loader, criterion, optimizer, epochs=config.epochs)
evaluate_model(model, test_loader)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Gather predictions and ground-truth labels over the whole test loader.
all_preds = []
all_labels = []

# Run batches on the device the model actually lives on, so this cell does not
# depend on a separately defined `device` variable.
model_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
    # Loop names are kept distinct from the notebook-level `labels` list so the
    # loop does not overwrite it.
    for batch_inputs, batch_labels in test_loader:
        outputs = model(batch_inputs.to(model_device))
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch_labels.cpu().numpy())

cm = confusion_matrix(all_labels, all_preds)
print(classification_report(all_labels, all_preds))

# Heatmap of the confusion matrix: rows are true classes, columns predictions.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
# Write the figure to disk before plt.show(), so the saved file has the plot.
plt.savefig("confusion_matrix_isot.png")
plt.show()

In [None]:
# Log the PNG written by the previous cell's plt.savefig call to W&B under the
# key "confusion_matrix".
wandb.log({"confusion_matrix": wandb.Image("confusion_matrix_isot.png")})

In [None]:
# Persist the parameters (state dict only) of the model trained on ISOT.
torch.save(obj=model.state_dict(), f="charcnn_finetuned_isot.pt")