## Data Prep ##


In [None]:
import gdown

# Google Drive folder ID, copied from the shared folder's URL.
folder_id = '1F3bTRe7dZe2gxPom3LzYPzkfyNSk-8VJ'

# Build the folder URL from the ID so the two can never drift apart
# (previously the f-string had no placeholder and folder_id went unused).
url = f'https://drive.google.com/drive/folders/{folder_id}'

# Fetch the folder contents; quiet=False keeps gdown's progress output visible.
gdown.download_folder(url, quiet=False)

In [None]:
#Urdu dataset uploading
import pandas as pd
import os
import zipfile

# Zipped Urdu corpus inside the downloaded project folder.
urdu_zip_path = '/content/NLP Project/Urdu Fake News Dataset.zip'

# Unpack the whole archive; the context manager closes the zip handle afterwards.
with zipfile.ZipFile(urdu_zip_path, mode='r') as urdu_archive:
    urdu_archive.extractall(path='/content/Urdu')



In [None]:
#Urdu dataframe loading
def load_news_data(directory, label):
    """Read every regular file in ``directory`` into a list of labelled records.

    Args:
        directory: Path of a folder whose files each hold one article (UTF-8).
        label: Value stored under the ``'label'`` key of every record.

    Returns:
        A list of ``{'text': <stripped file contents>, 'label': label}`` dicts,
        ordered by filename.
    """
    news_data = []
    # os.listdir yields entries in arbitrary order; sorting makes the row order
    # (and therefore any later seeded split) reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip anything that is not a regular file: open() would raise on a
        # nested directory.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            news_data.append({'text': file.read().strip(), 'label': label})
    return news_data

# Extracted corpus layout: one folder per split (Train/Test) and class (Fake/Real).
urdu_trainfake_path = '/content/Urdu/1.Corpus/Train/Fake'
urdu_trainreal_path = '/content/Urdu/1.Corpus/Train/Real'
urdu_testfake_path = '/content/Urdu/1.Corpus/Test/Fake'
urdu_testreal_path = '/content/Urdu/1.Corpus/Test/Real'

# Load the four folders in a fixed order; files under "Real" get the label
# 'True', files under "Fake" get the label 'Fake'.
(
    urdu_trainfake_texts,
    urdu_trainreal_texts,
    urdu_testfake_texts,
    urdu_testreal_texts,
) = (
    load_news_data(folder, folder_label)
    for folder, folder_label in (
        (urdu_trainfake_path, 'Fake'),
        (urdu_trainreal_path, 'True'),
        (urdu_testfake_path, 'Fake'),
        (urdu_testreal_path, 'True'),
    )
)

# One frame for the whole Urdu corpus (train and test folders merged),
# tagged with its origin.
urdu_records = [
    *urdu_trainfake_texts,
    *urdu_trainreal_texts,
    *urdu_testfake_texts,
    *urdu_testreal_texts,
]
urdu_news_df = pd.DataFrame(urdu_records)
urdu_news_df['source'] = 'Urdu'

urdu_news_df.head()


In [None]:
# Sanity check: (rows, columns) of the assembled Urdu frame.
urdu_news_df.shape

In [None]:
# Read the English spreadsheet and discard rows that have no article text.
english_excel_path = '/content/NLP Project/pakistani_dataset_consolidated (1).xlsx'
english_dataset = pd.read_excel(english_excel_path).dropna(subset=['Text'])

# The label column goes by one of two names; the first one present wins.
label_column = next(
    (name for name in ('cleaned_labels', 'Textual Rating') if name in english_dataset.columns),
    None,
)
if label_column is None:
    raise KeyError("No valid label column found in the English dataset.")
english_dataset = english_dataset.rename(columns={'Text': 'text', label_column: 'label'})

# Tag every row with its origin so the two corpora can be told apart later.
english_dataset['source'] = 'English'


In [None]:
# Sanity check: (rows, columns) of the English frame after dropping empty texts.
english_dataset.shape

In [None]:
# Stack the English rows (restricted to the shared columns) on top of the Urdu
# rows, renumber the index, then drop any row whose text is missing.
shared_columns = ['text', 'label', 'source']
combined_dataset = pd.concat(
    [english_dataset[shared_columns], urdu_news_df],
    ignore_index=True,
).dropna(subset=['text'])

# Mapping labels to just 'True', 'False', 'Unclear'
def simplify_labels(label):
    """Collapse a free-form rating into 'True', 'False' or 'Unclear'.

    Matching is case-insensitive and substring-based on ``str(label)``.

    Args:
        label: Raw rating; any value is accepted and converted with ``str``.

    Returns:
        'True' if the rating contains "true", 'False' if it contains "false",
        "fake", "hoax" or "doctored" (or a negated "true"), else 'Unclear'.
    """
    label = str(label).lower()
    # Negated forms contain the substring "true", so they must be ruled out
    # first or "untrue" / "not true" would be classified as 'True'.
    if 'untrue' in label or 'not true' in label:
        return 'False'
    # "partly true" and "half true" need no separate test: both contain "true".
    if 'true' in label:
        return 'True'
    if any(marker in label for marker in ('false', 'fake', 'hoax', 'doctored')):
        return 'False'
    return 'Unclear'

# Replace every raw rating with its simplified class.
combined_dataset['label'] = combined_dataset['label'].apply(simplify_labels)

# Per-language views of the combined frame, selected through the 'source' tag.
is_english = combined_dataset['source'] == 'English'
is_urdu = combined_dataset['source'] == 'Urdu'
english_data = combined_dataset[is_english]
urdu_data = combined_dataset[is_urdu]


In [None]:
# Keep only rows with a definite class. The explicit .copy() gives an
# independent frame, so the column assignments made in later cells act on this
# frame itself rather than on a slice pandas may flag (SettingWithCopyWarning).
combined_dataset = combined_dataset.loc[combined_dataset['label'] != 'Unclear'].copy()


In [None]:
# Distinct classes present in the frame at this point.
unique_labels = combined_dataset['label'].unique()

# Encode the classes as integers: 'False' -> 0, 'True' -> 1.
label_to_id = {'False': 0, 'True': 1}
combined_dataset['label'] = combined_dataset['label'].map(label_to_id)



In [None]:
# Show the dtype of the encoded label column.
print(combined_dataset['label'].dtypes)


int64


## Fake News BERT ##

In [None]:
from transformers import  AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hub checkpoint that supplies both the classifier weights and the tokenizer.
bert_checkpoint = "jy46604790/Fake-News-Bert-Detect"
model = AutoModelForSequenceClassification.from_pretrained(bert_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)


In [None]:
# Plain Python list of the article texts. A pandas Series always provides
# .tolist(), so the earlier hasattr guard could never reach its else branch.
texts = combined_dataset['text'].tolist()

# Tokenize each article exactly once and keep the full encoding per row.
# padding='max_length' pads every sequence to one common length and
# truncation=True cuts longer ones down, so rows can be stacked into tensors.
combined_dataset['tokenized'] = combined_dataset['text'].apply(
    lambda x: tokenizer(x, padding='max_length', truncation=True)
)

# Take input_ids from the stored encodings instead of running the tokenizer
# over the whole corpus a second time.
combined_dataset['input_ids'] = combined_dataset['tokenized'].apply(
    lambda encoding: encoding['input_ids']
)

# # Print the modified dataset
# print(combined_dataset.head())


In [None]:
# Sanity check: the encodings column and the label column come from the same
# frame, so both printed lengths should be equal.
tokenized = combined_dataset['tokenized']
labels = combined_dataset['label']
print(len(tokenized))
print(len(labels))

In [None]:
# Preview the frame, now including the 'tokenized' and 'input_ids' columns.
combined_dataset.head()

In [None]:
import torch

# Stack the per-row token ids into one tensor.
input_ids = torch.tensor(combined_dataset['input_ids'].tolist())

# Collect each row's attention mask from its stored encoding, then stack them.
mask_rows = []
for encoding in combined_dataset['tokenized']:
    mask_rows.append(encoding['attention_mask'])
attention_mask = torch.tensor(mask_rows)

# Integer class labels as a tensor.
labels = torch.tensor(combined_dataset['label'].tolist())

print(input_ids.shape, attention_mask.shape, labels.shape)


In [None]:
from sklearn.model_selection import train_test_split

X = input_ids
y = labels

# Split ids, masks and labels in ONE call so all three are partitioned with the
# same shuffled indices. Splitting the masks in a second call only stays
# aligned while test_size, random_state and the row counts all happen to match.
(
    X_train, X_test,
    attention_mask_train, attention_mask_test,
    y_train, y_test,
) = train_test_split(X, attention_mask, y, test_size=0.2, random_state=42)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Bundle (input ids, attention mask, label) triples for each partition.
train_tensors = (X_train, attention_mask_train, y_train)
test_tensors = (X_test, attention_mask_test, y_test)
train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)

# Batches of 32; only the training loader shuffles.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:

# Move to GPU if available
# (falls back to the CPU when CUDA is not available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from torch.optim import AdamW

# Loss and optimizer
# NOTE(review): `criterion` is not referenced by the training loop in this
# section, which reads `outputs.loss` from the model instead — confirm whether
# it is still needed.
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)


In [None]:
from tqdm import tqdm

epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Batch-local names: unpacking into input_ids / attention_mask / labels
        # would overwrite the full-dataset tensors of the same name, so
        # re-running the split cell after training would split one batch.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        # Labels are passed in so the model output carries the loss (outputs.loss).
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")


In [None]:
# Predictions and true labels for the whole test set, filled batch by batch.
all_preds = []
all_labels = []

model.eval()
correct = 0
total = 0

with torch.no_grad():  # inference only: no gradients needed
    for batch in test_loader:
        # Batch-local names so the full-dataset tensors input_ids /
        # attention_mask / labels are not overwritten by the last test batch.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)

        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

        # Store the predictions and true labels for later reporting
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

## RoBERTa ##

In [None]:
from transformers import  AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hub checkpoint that supplies both the classifier weights and the tokenizer.
roberta_checkpoint = "Pavan48/fake_news_detection_roberta"
model = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)

In [None]:
# Plain Python list of the article texts. A pandas Series always provides
# .tolist(), so the earlier hasattr guard could never reach its else branch.
texts = combined_dataset['text'].tolist()

# Re-tokenize each article once with the tokenizer currently bound to
# `tokenizer`, overwriting the 'tokenized' column. padding='max_length' pads
# every sequence to one common length; truncation=True cuts longer ones down.
combined_dataset['tokenized'] = combined_dataset['text'].apply(
    lambda x: tokenizer(x, padding='max_length', truncation=True)
)

# Take input_ids from the stored encodings instead of running the tokenizer
# over the whole corpus a second time.
combined_dataset['input_ids'] = combined_dataset['tokenized'].apply(
    lambda encoding: encoding['input_ids']
)

# # Print the modified dataset
# print(combined_dataset.head())


In [None]:
# Sanity check: the encodings column and the label column come from the same
# frame, so both printed lengths should be equal.
tokenized = combined_dataset['tokenized']
labels = combined_dataset['label']
print(len(tokenized))
print(len(labels))

In [None]:
import torch

# Stack the per-row token ids into one tensor.
input_ids = torch.tensor(combined_dataset['input_ids'].tolist())

# Collect each row's attention mask from its stored encoding, then stack them.
mask_rows = []
for encoding in combined_dataset['tokenized']:
    mask_rows.append(encoding['attention_mask'])
attention_mask = torch.tensor(mask_rows)

# Integer class labels as a tensor.
labels = torch.tensor(combined_dataset['label'].tolist())

print(input_ids.shape, attention_mask.shape, labels.shape)

In [None]:
from sklearn.model_selection import train_test_split

X = input_ids
y = labels

# Split ids, masks and labels in ONE call so all three are partitioned with the
# same shuffled indices. Splitting the masks in a second call only stays
# aligned while test_size, random_state and the row counts all happen to match.
(
    X_train, X_test,
    attention_mask_train, attention_mask_test,
    y_train, y_test,
) = train_test_split(X, attention_mask, y, test_size=0.2, random_state=42)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Bundle (input ids, attention mask, label) triples for each partition.
train_tensors = (X_train, attention_mask_train, y_train)
test_tensors = (X_test, attention_mask_test, y_test)
train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)

# Batches of 32; only the training loader shuffles.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
# Number of samples held out for testing.
len(test_dataset)

In [None]:

# Move to GPU if available
# (falls back to the CPU when CUDA is not available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from torch.optim import AdamW

# Loss and optimizer
# NOTE(review): `criterion` is not referenced by the training loop in this
# section, which reads `outputs.loss` from the model instead — confirm whether
# it is still needed.
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)


In [None]:
from tqdm import tqdm

epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Batch-local names: unpacking into input_ids / attention_mask / labels
        # would overwrite the full-dataset tensors of the same name, so
        # re-running the split cell after training would split one batch.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        # Labels are passed in so the model output carries the loss (outputs.loss).
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")


In [None]:
# Debug cell. len(y_test) is evaluated but not displayed, because it is not
# the cell's last expression.
# NOTE(review): `total` is not assigned anywhere in this section before this
# cell; run top-to-bottom it presumably still holds the count from the
# Fake-News-Bert evaluation — confirm this is intended.
len(y_test)
print(total)

In [None]:
# Number of batches the test loader yields.
len(test_loader)

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Loop over each sample in the batch
        for i in range(input_ids.size(0)):
            input_id = input_ids[i].unsqueeze(0)  # Select the i-th input in the batch
            mask = attention_mask[i].unsqueeze(0)  # Select the i-th attention mask
            label = labels[i].unsqueeze(0)  # Select the i-th label

            # Get model predictions for the individual sample
            outputs = model(input_id, attention_mask=mask)
            prediction = torch.argmax(outputs.logits, dim=-1)

            # Update correct and total counters
            correct += (prediction == label).sum().item()
            total += label.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
print(total)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Convert tensors to CPU and NumPy arrays
preds = predictions.cpu().numpy()
true_labels = labels.cpu().numpy()

# Compute confusion matrix
cm = confusion_matrix(true_labels, preds)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
from sklearn.metrics import classification_report
report = classification_report(all_labels, all_predictions, output_dict=True)
print(report)

## XLM-RoBERTa ##


In [None]:
from transformers import XLMRobertaTokenizer
import torch

# Load the tokenizer
# (pretrained "xlm-roberta-base" vocabulary)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Function to tokenize the text data
def tokenize_data(texts, tokenizer, max_length=512):
    """Run ``tokenizer`` over ``texts`` and return whatever it produces.

    The tokenizer is called with padding and truncation switched on, a cap of
    ``max_length``, and ``return_tensors="pt"``.

    Args:
        texts: Input passed to the tokenizer as its only positional argument.
        tokenizer: Callable accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Upper bound forwarded as ``max_length`` (default 512).
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# repeatable.
holdout_fraction = 0.2
train_data, test_data = train_test_split(
    combined_dataset,
    random_state=42,
    test_size=holdout_fraction,
)

# Report both partition sizes as a sanity check.
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer

# Define a custom dataset class
class FakeNewsDataset(Dataset):
    """Dataset that tokenizes one row of a frame each time it is indexed.

    ``data`` is read through ``.iloc[idx]`` and must provide 'text' and
    'label' entries per row. Each item is a dict with 'input_ids',
    'attention_mask' (both flattened) and 'labels' (a ``torch.long`` tensor).
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Pad or truncate every text to exactly max_len tokens.
        encoded = self.tokenizer.encode_plus(
            row['text'],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output is flattened to 1-D so a DataLoader can stack
        # samples into a batch.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(row['label'], dtype=torch.long)
        return sample

# Tokenizer matching the "xlm-roberta-base" checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Every text is padded or truncated to this many tokens.
max_len = 512

# Wrap both partitions in the dataset class with the same tokenizer and length.
train_dataset, test_dataset = (
    FakeNewsDataset(frame, tokenizer, max_len)
    for frame in (train_data, test_data)
)

# Batches of 32; only the training loader shuffles.
loader_batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)


In [None]:
from transformers import XLMRobertaForSequenceClassification

# Pretrained "xlm-roberta-base" with a sequence-classification head sized for
# two labels.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)


In [None]:
import matplotlib.pyplot as plt
# AdamW now comes from torch.optim, as in the earlier sections of this
# notebook: the transformers copy is deprecated in favour of the PyTorch one.
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
import torch

# Define optimizer and scheduler.
# weight_decay=0.0 keeps the default of the transformers optimizer used
# previously (torch.optim.AdamW would otherwise default to 0.01).
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
# NOTE(review): with step_size=3 and epochs=3 the decay first applies after
# the final epoch, so the learning rate is constant for the whole run —
# confirm that is intended.
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)

# Set device for GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training Loop
epochs = 3
losses = []  # Average loss per epoch, for plotting
for epoch in range(epochs):
    model.train()
    total_loss = 0
    # Iterate the training loader. The name `dataloader` used here before is
    # never defined in this notebook and raised a NameError.
    for batch in train_dataloader:
        # Batch-local names, so same-named notebook globals are left untouched.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass; labels are passed in so the output carries the loss.
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    # The scheduler advances once per epoch.
    scheduler.step()

    # Save the average loss for this epoch
    avg_loss = total_loss / len(train_dataloader)
    losses.append(avg_loss)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

# Plotting the loss graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs+1), losses, marker='o', linestyle='-', color='b', label='Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import torch
import matplotlib.pyplot as plt
def evaluate_on_test_set(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and report the results.

    Prints a classification report and shows a confusion-matrix plot, both
    labelled 'False' / 'True'.

    Args:
        model: Classifier called with ``input_ids`` and ``attention_mask``;
            its output must expose ``.logits``.
        test_dataloader: Iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        ``(predictions, true_labels)``: two lists covering every sample seen.
        They are built fresh on each call, so calling the function again does
        not append to the results of an earlier call.
    """
    predictions, true_labels = [], []
    model.eval()

    with torch.no_grad():  # inference only: no gradients needed
        for batch in test_dataloader:
            # Tensors are moved to the notebook-level `device`.
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            # Predicted class = index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(batch_labels.cpu().numpy())

    # Print the classification report for the test set
    print("Classification Report (Test Set):")
    print(classification_report(true_labels, predictions, target_names=['False', 'True']))

    # Generate confusion matrix for the test set
    cm = confusion_matrix(true_labels, predictions)

    # Display the confusion matrix for the test set
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['False', 'True'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix (Test Set)")
    plt.show()

    return predictions, true_labels

# Evaluate the trained model; keep the results under the same notebook-level
# names as before.
predictions, true_labels = evaluate_on_test_set(model, test_dataloader)


## Baseline ##

In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import vstack
import zipfile

# Rebuild the combined frame from the two source frames: English rows
# (restricted to the shared columns) on top of the Urdu rows, index
# renumbered, rows without text dropped.
shared_columns = ['text', 'label', 'source']
combined_dataset = pd.concat(
    [english_dataset[shared_columns], urdu_news_df],
    ignore_index=True,
).dropna(subset=['text'])

# Mapping labels to just 'True', 'False', 'Unclear'
def simplify_labels(label):
    """Collapse a free-form rating into 'True', 'False' or 'Unclear'.

    Matching is case-insensitive and substring-based on ``str(label)``.

    Args:
        label: Raw rating; any value is accepted and converted with ``str``.

    Returns:
        'True' if the rating contains "true", 'False' if it contains "false",
        "fake", "hoax" or "doctored" (or a negated "true"), else 'Unclear'.
    """
    label = str(label).lower()
    # Negated forms contain the substring "true", so they must be ruled out
    # first or "untrue" / "not true" would be classified as 'True'.
    if 'untrue' in label or 'not true' in label:
        return 'False'
    # "partly true" and "half true" need no separate test: both contain "true".
    if 'true' in label:
        return 'True'
    if any(marker in label for marker in ('false', 'fake', 'hoax', 'doctored')):
        return 'False'
    return 'Unclear'

combined_dataset['label'] = combined_dataset['label'].apply(simplify_labels)
# NOTE(review): rows labelled 'Unclear' are kept here, whereas the data-prep
# cells at the top of the notebook drop them before the transformer models are
# trained — confirm whether the baseline is meant to be a three-class problem.

#Separating English and Urdu data for vectorization
english_data = combined_dataset[combined_dataset['source'] == 'English']
urdu_data = combined_dataset[combined_dataset['source'] == 'Urdu']

# Vectorizing the English text using TF-IDF
print("Vectorizing English data...")
tfidf_vectorizer_en = TfidfVectorizer(max_features=5000)
X_english_tfidf = tfidf_vectorizer_en.fit_transform(english_data['text'])

# Vectorizing the Urdu text using TF-IDF
print("Vectorizing Urdu data...")
tfidf_vectorizer_ur = TfidfVectorizer(max_features=5000, token_pattern=r'\w+')
X_urdu_tfidf = tfidf_vectorizer_ur.fit_transform(urdu_data['text'])

# Combining the vectorized data.
# The two vectorizers learn independent vocabularies, so column i of the
# English matrix and column i of the Urdu matrix stand for unrelated words.
# Stacking them vertically would merge those columns (and fails outright when
# the vocabularies differ in size). A block-diagonal layout gives each
# language its own columns while keeping the row order: English, then Urdu.
from scipy.sparse import block_diag
X_combined_tfidf = block_diag([X_english_tfidf, X_urdu_tfidf], format='csr')

# Combining the labels in the same row order as the feature matrix
y_combined = pd.concat([english_data['label'], urdu_data['label']], ignore_index=True)

# Training the Logistic Regression model
print("Training the Logistic Regression model...")
X_train, X_test, y_train, y_test = train_test_split(X_combined_tfidf, y_combined, test_size=0.2, random_state=42)
lr_combined_model = LogisticRegression(max_iter=1000)
lr_combined_model.fit(X_train, y_train)

# Predicting and evaluating
print("Making predictions and evaluating the model...")
y_pred_combined = lr_combined_model.predict(X_test)

# Printing Report
classification_report_combined = classification_report(y_test, y_pred_combined)
print("\nClassification Report:\n", classification_report_combined)
