In [None]:
# import shutil and nltk library
import os
import shutil

from nltk import download, word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

In [None]:
# download 20news-bydate.tar.gz file

!curl 'http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz' >> './20news-bydate.tar.gz'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.7M  100 13.7M    0     0  2131k      0  0:00:06  0:00:06 --:--:-- 2948k


In [None]:
# unpack files

shutil.unpack_archive('/content/20news-bydate.tar.gz', '/content/20news-bydate')
!rm '/content/20news-bydate.tar.gz' # delete file

In [None]:
# Fetch the NLTK resources this notebook asks for (tokenizer, POS tagger, WordNet)
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    download(_nltk_resource)

# Function to stem every token of a text with the Porter stemmer
def stem_text(text):
    """Stem every token of ``text`` with NLTK's Porter stemmer.

    The text is split with ``word_tokenize``, each token is passed through
    ``PorterStemmer.stem`` and the results are joined with single spaces, so
    the original whitespace and line breaks are not preserved.

    Args:
        text: The raw document as a string.

    Returns:
        A single string of space-separated stemmed tokens.
    """
    stemmer = PorterStemmer()

    # PorterStemmer.stem() has no part-of-speech parameter: its optional second
    # argument is ``to_lowercase``. The previous code passed a WordNet POS
    # letter there, which is a truthy string and so selected the default
    # behaviour anyway. The POS tagging pass therefore only cost time, and it
    # raised TypeError on NLTK releases whose stem() accepts a single argument.
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

# Function to stem all documents under a directory tree (files are overwritten in place)
def stem_directory(directory):
    """Replace the content of every file under ``directory`` with its stemmed text.

    The tree is walked recursively. Each file is read as latin1, passed through
    ``stem_text`` and written back to the same path, so the original text is
    lost. Progress is printed per directory.

    Args:
        directory: Root of the tree to process.
    """
    for current_dir, subdirs, file_names in os.walk(directory):
        print(f'Processing {current_dir}... found directories: {subdirs} and {len(file_names)} files...')
        for file_name in file_names:
            target = os.path.join(current_dir, file_name)

            with open(target, 'r', encoding='latin1') as source:
                original = source.read()

            # Stem before reopening the file for writing so that a failure in
            # stem_text cannot leave the file truncated.
            stemmed = stem_text(original)

            with open(target, 'w', encoding='latin1') as sink:
                sink.write(stemmed)
        print(f'Finished {current_dir}.')
    print('Finished lemmatizing all files.')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Root directory of the extracted corpus; it is passed to
# load_and_preprocess_data, which reads every file beneath it.
data_path = '/content/20news-bydate'
# Stemming is currently disabled. Enabling the call below rewrites every file
# under data_path in place with its stemmed text.
#stem_directory(data_path)

In [None]:
# import sklearn/tensorflow and other dependencies

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [None]:
def load_and_preprocess_data(data_path):
    """Read every file under ``data_path`` and build a labelled train/test split.

    Each file's label is the name of the directory that directly contains it.
    Files are read as latin1 text.

    Args:
        data_path: Root directory to walk recursively.

    Returns:
        A 7-tuple ``(train_data, test_data, train_labels, test_labels,
        label_encoder, texts, encoded_labels)``. The first four come from an
        80/20 ``train_test_split`` (``random_state=42``) of the text column and
        the integer-encoded label column. ``label_encoder`` is the fitted
        ``LabelEncoder``. ``texts`` and ``encoded_labels`` are plain lists
        holding all documents and all encoded labels in traversal order.

    Raises:
        ValueError: If no files are found under ``data_path``.
    """
    texts = []
    labels = []

    for root, dirs, files in os.walk(data_path):
        # os.walk yields entries in a filesystem-dependent order. Sorting
        # (dirs in place, which also steers the descent) keeps the row order,
        # and therefore the seeded split below, the same on every machine.
        dirs.sort()

        # basename rather than split("/") so the label is right for any path separator
        label = os.path.basename(root)

        for file in sorted(files):
            with open(os.path.join(root, file), 'r', encoding='latin1') as f:
                texts.append(f.read())
            labels.append(label)

    if not texts:
        # Fail here with a clear message instead of deep inside the encoder/split
        raise ValueError(f'No files found under {data_path!r}')

    # Create a DataFrame for better handling
    df = pd.DataFrame({'text': texts, 'label': labels})

    # Encode the directory names as integer class ids
    label_encoder = LabelEncoder()
    df['encoded_label'] = label_encoder.fit_transform(df['label'])

    # Split the data into training and testing sets
    train_data, test_data, train_labels, test_labels = train_test_split(
        df['text'], df['encoded_label'], test_size=0.2, random_state=42
    )

    return train_data, test_data, train_labels, test_labels, label_encoder, texts, df['encoded_label'].tolist()

In [None]:
# Read the 20 Newsgroups documents from data_path and split them into train/test sets
(train_data, test_data, train_labels, test_labels,
 label_encoder, texts, labels) = load_and_preprocess_data(data_path)

# Build the word index from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

vocab_size = len(tokenizer.word_index) + 1
max_length = 200  # Adjust this based on your dataset and available resources

# Convert each document to a sequence of word indices ...
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# ... and pad (at the end) to max_length so every row has the same length
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Baseline network: embedding -> flatten -> one hidden layer -> softmax over the classes
model = Sequential([
    Embedding(vocab_size, 32, input_length=max_length),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit, holding out 20% of the training rows for validation
model.fit(
    x=X_train,
    y=train_labels,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

# Report accuracy on the held-out test set
test_loss, test_accuracy = model.evaluate(x=X_test, y=test_labels)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 81.62%


In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item on demand.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)`` whose
            result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` tensors and ``label``."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Both tokenizer outputs are flattened to one dimension
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(label)
        return sample

In [None]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced per example.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The linear head maps the encoder's hidden size to one logit per class
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's ``pooler_output``."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        token_ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        logits = model(input_ids=token_ids, attention_mask=mask)
        criterion(logits, targets).backward()
        optimizer.step()
        scheduler.step()

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(accuracy, report)`` holding the results of
        ``accuracy_score`` and ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=token_ids, attention_mask=mask)
            # The predicted class is the position of the largest logit in each row
            best = torch.max(logits, dim=1).indices
            predicted += best.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128, class_names=None):
    """Classify a single text and return a label for the highest-scoring class.

    Without ``class_names`` the original two-way mapping is used: class index
    1 gives ``"positive"`` and every other index gives ``"negative"``. That
    mapping is only meaningful for a two-class model, so for a multi-class
    model pass ``class_names`` to get the name of the predicted class instead.

    Args:
        text: The raw string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of logits.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        class_names: Optional sequence indexed by predicted class id. When
            given, ``class_names[predicted_id]`` is returned.

    Returns:
        ``class_names[predicted_id]`` if ``class_names`` is given, otherwise
        ``"positive"`` or ``"negative"``.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_id = torch.max(logits, dim=1).indices.item()

    if class_names is not None:
        return class_names[predicted_id]
    return "positive" if predicted_id == 1 else "negative"

In [None]:
# Hyper-parameters for the BERT fine-tuning run
bert_model_name = 'bert-base-uncased'
# One output logit per class found by label_encoder. This was hard-coded to 2,
# but the encoded labels cover every newsgroup. CrossEntropyLoss needs each
# target id to be smaller than the number of logits; on GPU a violation
# surfaces as "CUDA error: device-side assert triggered".
num_classes = len(label_encoder.classes_)
max_length = 128  # forwarded to the BERT tokenizer; replaces the value used by the Keras cell
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [None]:
# Hold out 20% of all documents for validation (fixed seed for a repeatable split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=max_length
)
val_dataset = TextClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=max_length
)

# Only the training batches are shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

In [None]:
# train() steps the scheduler once per batch, so the schedule spans every batch of every epoch
total_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# Fine-tune for num_epochs passes, reporting validation metrics after each one
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
