In [41]:
import requests
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DistilBertModel
# Tokenizer for the "distilbert-base-uncased" checkpoint.
# NOTE(review): `tokenizer` is loaded again from the same checkpoint in the dataset cell,
# so one of the two loads looks redundant.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# NOTE(review): this bare DistilBERT encoder is bound to `model`, but `model` is rebound to a
# FakeNewsDetector in the training-setup cell before any visible use of this object --
# this load appears to be unused; confirm and drop.
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [42]:
# Google Fact Check API settings
import os
import warnings

FACT_CHECK_API_URL = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

# The API key is read from the FACT_CHECK_API_KEY environment variable so that the
# credential is never stored in (or shared with) the notebook itself.
# SECURITY: a literal key used to be committed on this line; treat that key as leaked
# and revoke/rotate it in the Google Cloud console.
FACT_CHECK_API_KEY = os.environ.get("FACT_CHECK_API_KEY", "")
if not FACT_CHECK_API_KEY:
    warnings.warn(
        "FACT_CHECK_API_KEY is not set; Fact Check API requests will be sent without a key."
    )

In [43]:
def check_fact_with_api(query, timeout=10):
    """Search the Google Fact Check API for claims matching ``query``.

    This is a best-effort lookup: any failure yields an empty list instead of
    raising, so a flaky network cannot abort a long-running training loop.

    Args:
        query: Text to search for.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under the ``"claims"`` key of the JSON response, or an
        empty list when the request fails, the status code is not 200, the body
        is not a JSON object, or no claims are present.
    """
    params = {"query": query, "key": FACT_CHECK_API_KEY}
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(FACT_CHECK_API_URL, params=params, timeout=timeout)
    except requests.RequestException:
        return []

    if response.status_code != 200:
        return []

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON decode errors subclass ValueError).
        return []

    if not isinstance(data, dict):
        return []
    return data.get("claims") or []

In [44]:
# Load the news dataset into a DataFrame; the split cell reads its 'Content' and 'Label' columns.
# NOTE(review): this is an absolute, machine-specific path -- the notebook will not run on
# another machine without editing it.
df = pd.read_csv("C:/FakeNewsProject/FakeNews_py/News_Dataset.csv")

In [45]:
# Tokenizer for the "distilbert-base-uncased" checkpoint; passed to NewsDataset when the datasets are built.
# NOTE(review): the same tokenizer is already loaded in the imports cell, so this re-load looks redundant.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

class NewsDataset(Dataset):
    """News texts paired with labels and a fact-check score.

    Each item is a dict of tensors:
        ``input_ids`` / ``attention_mask``: tokenizer output with the batch
            dimension squeezed away (padded/truncated to ``max_len``).
        ``fact_check_score``: float tensor holding the number of claims the
            fact-check lookup returned for the text (0 when there are none).
        ``label``: long tensor holding the class label.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')``.
        max_len: Length every sample is padded/truncated to.
        fact_check_fn: Optional callable ``text -> list of claims``. When
            omitted, the module-level ``check_fact_with_api`` is used.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512, fact_check_fn=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.fact_check_fn = fact_check_fn
        # index -> score. The lookup is a network request, so it is done at most
        # once per sample instead of once per sample per epoch.
        # NOTE(review): with DataLoader(num_workers > 0) each worker process would
        # presumably hold its own copy of this cache -- confirm before enabling workers.
        self._fact_check_cache = {}

    def __len__(self):
        return len(self.texts)

    def _fact_check_score(self, index, text):
        """Return the (cached) number of fact-check claims found for ``text``."""
        if index not in self._fact_check_cache:
            # Resolve the default lazily so the class can be defined before the helper.
            lookup = self.fact_check_fn if self.fact_check_fn is not None else check_fact_with_api
            # The whole text is sent as the search query, exactly as before.
            results = lookup(text)
            self._fact_check_cache[index] = len(results) if results else 0
        return self._fact_check_cache[index]

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Credibility feature: how many fact-checked claims match this text.
        fact_check_score = self._fact_check_score(index, text)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch dim of 1; drop it so the
            # DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'fact_check_score': torch.tensor(fact_check_score, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [46]:
# Data split: hold out 20% of the rows for testing, with a fixed seed for a repeatable split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Content"].values,
    df["Label"].values,
    random_state=42,
    test_size=0.2,
)

In [47]:
# Wrap each split in a NewsDataset, then batch it: training batches are shuffled,
# test batches keep their original order.
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = NewsDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [48]:
# Model definition
class FakeNewsDetector(nn.Module):
    """DistilBERT encoder + bidirectional LSTM classifier with a fact-check feature.

    The token embeddings produced by DistilBERT are summarised by a
    bidirectional LSTM; the summary is concatenated with the scalar fact-check
    score and projected to ``num_classes`` outputs.

    Args:
        bert_model: Name or path of the DistilBERT checkpoint to load.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output classes.
    """

    def __init__(self, bert_model='distilbert-base-uncased', hidden_dim=128, num_classes=2):
        super(FakeNewsDetector, self).__init__()
        self.bert = DistilBertModel.from_pretrained(bert_model)
        # 768 is the LSTM input size, i.e. the width of the encoder's last_hidden_state.
        self.lstm = nn.LSTM(768, hidden_dim, batch_first=True, bidirectional=True)
        # +1 input feature for the fact-check score appended in forward().
        self.fc = nn.Linear(hidden_dim * 2 + 1, num_classes)
        self.dropout = nn.Dropout(0.3)
        # Kept for predict_proba(); forward() deliberately does NOT apply it.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, fact_check_score):
        """Return raw class logits of shape ``(batch, num_classes)``.

        Logits (not probabilities) are returned because ``nn.CrossEntropyLoss``
        applies log-softmax itself; feeding it softmax output squashes the
        gradients. ``argmax`` over logits gives the same predictions as over
        probabilities.

        Args:
            input_ids: Token ids, ``(batch, seq_len)``.
            attention_mask: 1 for real tokens, 0 for padding, ``(batch, seq_len)``.
                Assumes padding is on the right, as produced by
                ``padding='max_length'`` in NewsDataset -- TODO confirm if the
                tokenizer's padding side is ever changed.
            fact_check_score: Float tensor of shape ``(batch,)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Number of real tokens per sample; pack_padded_sequence needs CPU int64
        # lengths that are all >= 1.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            outputs.last_hidden_state, lengths, batch_first=True, enforce_sorted=False
        )
        # Packing makes the LSTM skip padding positions. The final hidden states
        # are then the forward pass at the last real token (h_n[-2]) and the
        # backward pass after reading the whole sequence (h_n[-1]). Indexing the
        # output at position -1 would read a padding step for short texts and a
        # backward state that has seen only one step.
        _, (h_n, _) = self.lstm(packed)
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        out = self.dropout(summary)
        out = torch.cat((out, fact_check_score.unsqueeze(1)), dim=1)  # append fact-check score
        return self.fc(out)

    def predict_proba(self, input_ids, attention_mask, fact_check_score):
        """Return class probabilities (softmax over the logits from forward())."""
        return self.softmax(self.forward(input_ids, attention_mask, fact_check_score))

In [49]:
# Model and training setup: pick the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = FakeNewsDetector()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [50]:
# Training loop
def train(model, train_loader, criterion, optimizer, device, epochs=3):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, fact_check_score)``.
        train_loader: Re-iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'fact_check_score'`` and ``'label'``.
            It does not need to support ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over the data.

    Returns:
        list[float]: Mean batch loss of each epoch, in order. An epoch that saw
        no batches is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            fact_check_score = batch['fact_check_score'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, fact_check_score)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # Count batches ourselves so loaders without __len__ also work.
            num_batches += 1

        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")
    return epoch_losses

In [51]:
# Evaluation function
def evaluate(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and print the resulting accuracy.

    The model is switched to eval mode and run without gradient tracking. For
    each batch the predicted class is the argmax over the model output along
    dim 1; predictions and labels are gathered across all batches and compared
    with ``accuracy_score``. Nothing is returned.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in test_loader:
            scores = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['fact_check_score'].to(device),
            )
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            expected.extend(batch['label'].cpu().numpy())
    acc = accuracy_score(expected, predicted)
    print(f"Test Accuracy: {acc:.4f}")

In [52]:
# Train the model for 3 epochs on the training loader, then print its accuracy on the test loader
train(model, train_loader, criterion, optimizer, device, epochs=3)
evaluate(model, test_loader, device)

: 