In [None]:
import pandas as pd

# Read the train and test CSV splits from the local data/ directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
import re

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus before it is read below.
nltk.download('stopwords')

# Kept as a set so preprocess_text can do fast membership checks.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw text value into a lowercase, stopword-free string.

    Runs of non-word characters are collapsed into a single space, the text
    is lowercased, and every whitespace-separated token found in the
    module-level ``stop_words`` set is dropped.

    Args:
        text: The raw text. Missing values (``None``, NaN, ``pd.NA``), which
            pandas produces for empty CSV cells, are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None, so short-circuit here.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Replace special characters with spaces, then lowercase so tokens can
    # be compared against the stopword set.
    normalized = re.sub(r'\W+', ' ', text).lower()
    return ' '.join(
        token for token in normalized.split() if token not in stop_words
    )

# Add a 'cleaned_text' column to both frames by running every 'text'
# entry through preprocess_text.
train_df['cleaned_text'] = train_df['text'].map(preprocess_text)
test_df['cleaned_text'] = test_df['text'].map(preprocess_text)


In [None]:
from transformers import BertTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer once at module level;
# the `tokenize` helper defined later in this notebook uses it via this name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
def tokenize(text, max_len=512):
    """Encode a single text with the module-level ``tokenizer``.

    Args:
        text: The text to encode.
        max_len: Passed as ``max_length``; together with ``truncation=True``
            and ``padding='max_length'`` it asks the tokenizer for a
            fixed-length encoding.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options, with
        PyTorch tensors requested via ``return_tensors='pt'``.
    """
    encode_options = {
        'add_special_tokens': True,
        'max_length': max_len,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': 'pt',
    }
    return tokenizer.encode_plus(text, **encode_options)


In [None]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` may be laid out in either of two ways:

    * a mapping from field name to a per-example sequence, indexed as
      ``encodings[key][idx]``; or
    * a sequence of per-example mappings, indexed as ``encodings[idx][key]``
      (this is what the notebook builds by applying ``tokenize`` to each row
      and calling ``.tolist()``).

    Args:
        encodings: Encoded inputs in one of the layouts above.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor, copying if it already is one."""
        if torch.is_tensor(value):
            # Same copy semantics as torch.tensor(tensor), minus the warning.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, plus ``'labels'``."""
        if hasattr(self.encodings, 'items'):
            # Column layout: one sequence per field, pick row ``idx`` of each.
            item = {
                key: self._as_tensor(val[idx])
                for key, val in self.encodings.items()
            }
        else:
            # Row layout: one mapping per example.
            item = {}
            for key, val in self.encodings[idx].items():
                tensor = self._as_tensor(val)
                # Encodings made one text at a time with return_tensors='pt'
                # carry a leading batch dimension of size 1; drop it so the
                # DataLoader's own batching does not produce an extra axis.
                if tensor.dim() > 1 and tensor.size(0) == 1:
                    tensor = tensor.squeeze(0)
                item[key] = tensor
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified on the label
# column and seeded so the split is repeatable.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
    stratify=train_df['label'],
)

# Encode every cleaned text individually; each result is one tokenize() output.
train_encodings = [tokenize(text) for text in train_df['cleaned_text']]
val_encodings = [tokenize(text) for text in val_df['cleaned_text']]
test_encodings = [tokenize(text) for text in test_df['cleaned_text']]

# Wrap the encodings and labels into datasets.
train_labels = train_df['label'].tolist()
val_labels = val_df['label'].tolist()
placeholder_labels = [0] * len(test_df)  # Dummy labels for test dataset

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, placeholder_labels)

