<h1> MODEL IMPLEMENTATIONS <h1>

<h2> Imports <h2>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import cupy as cp
import sklearn

<h2> Import and Split <h2>

In [3]:
path = r"Dataset\articles.csv"
data = pd.read_csv(path)
# print(data.head())

In [4]:
X = data['content']
Y = data['gold_label']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")


category_to_label = {
    "entertainment": 0,
    "business": 1,
    "sports": 2,
    "science-technology": 3,
    "international": 4
}

Y_train = [category_to_label[label.strip()] for label in Y_train if label.strip() in category_to_label]
Y_test = [category_to_label[label.strip()] for label in Y_test if label.strip() in category_to_label]


X_train shape: (1594,)
X_test shape: (399,)
Y_train shape: (1594,)
Y_test shape: (399,)


<h2> Bag of Words <h2>


In [None]:
class TextProcessor:
    def __init__(self, dataset):
        self.dataset = dataset
        self.vocabulary = {}
    
    def build_vocabulary(self):
        unique_words = set()
        for sentence in self.dataset:
            unique_words.update(sentence.split())
        self.vocabulary = {word: idx for idx, word in enumerate(sorted(unique_words))}
        return self.vocabulary
    
    def sentence_to_bow(self, sentence):
        words = sentence.split()
        vector = np.zeros(len(self.vocabulary))
        for word in words:
            if word in self.vocabulary:
                index = self.vocabulary[word]
                # print(index)
                vector[index] += 1
        return vector
    
    def vectorize_sentences(self, X):
        return [self.sentence_to_bow(sentence) for sentence in X]


In [25]:
bag = TextProcessor(X_train)
vocab = bag.build_vocabulary()
train_x = bag.vectorize_sentences(X_train)
test_x = bag.vectorize_sentences(X_test)
# print("trainx", train_x[0][12242])
# print(vocab)

16973


<h2> Neural Network Using Pytorch <h2>

In [26]:
import torch
from torch.utils.data import DataLoader, TensorDataset
print(set(Y_train))

train_x = torch.tensor(train_x, dtype=torch.float32)
test_x = torch.tensor(test_x, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_test = torch.tensor(Y_test, dtype=torch.long)

batch_size = 64
train_dataset = TensorDataset(train_x, Y_train)
test_dataset = TensorDataset(test_x, Y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

{tensor(0), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(3), tensor(4), tensor(1), tensor(4), tensor(1), tensor(2), tensor(4), tensor(4), tensor(3), tensor(4), tensor(0), tensor(4), tensor(1), tensor(0), tensor(2), tensor(4), tensor(4), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(4), tensor(4), tensor(0), tensor(2), tensor(1), tensor(1), tensor(1), tensor(0), tensor(4), tensor(1), tensor(1), tensor(0), tensor(4), tensor(3), tensor(2), tensor(1), tensor(1), tensor(2), tensor(4), tensor(1), tensor(3), tensor(2), tensor(4), tensor(0), tensor(1), tensor(0), tensor(3), tensor(1), tensor(4), tensor(2), tensor(4), tensor(1), tensor(4), tensor(2), tensor(0), tensor(4), tensor(4), tensor(1), tensor(1), tensor(2), tensor(4), tensor(0), tensor(2), tensor(2), tensor(4), tensor(0), tensor(0), tensor(3), tensor(0), tensor(1), tensor(0), tensor(2), tensor(1), tensor(3), tensor(4), tensor(4), tensor(1), tensor(4), tensor(0), tensor(3), tensor(4)

  Y_train = torch.tensor(Y_train, dtype=torch.long)
  Y_test = torch.tensor(Y_test, dtype=torch.long)


In [43]:
import torch.nn as nn
import torch.nn.functional as F
class NewsClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NewsClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.dropout = nn.Dropout(0.5) 
        self.fc4 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)  
        x = self.fc4(x)
        return x


In [48]:
input_size = train_x.shape[1]
num_classes = len(Y_train.unique())
model = NewsClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)


In [57]:
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_x, batch_y in train_loader:
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


Epoch [1/15], Loss: 0.0185
Epoch [2/15], Loss: 0.0181
Epoch [3/15], Loss: 0.0167
Epoch [4/15], Loss: 0.0177
Epoch [5/15], Loss: 0.0165
Epoch [6/15], Loss: 0.0163
Epoch [7/15], Loss: 0.0155
Epoch [8/15], Loss: 0.0139
Epoch [9/15], Loss: 0.0138
Epoch [10/15], Loss: 0.0148
Epoch [11/15], Loss: 0.0140
Epoch [12/15], Loss: 0.0133
Epoch [13/15], Loss: 0.0141
Epoch [14/15], Loss: 0.0125
Epoch [15/15], Loss: 0.0129


In [None]:
def evaluate_model(loader, model):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch_x, batch_y in loader:
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, 1)  
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    accuracy = correct / total
    return accuracy

train_accuracy = evaluate_model(train_loader, model)
test_accuracy = evaluate_model(test_loader, model)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")


Training Accuracy: 1.0000
Test Accuracy: 0.97


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Assuming X_train, X_test, Y_train, Y_test are already defined and preprocessed

# Standardizing the features (Logistic Regression usually benefits from feature scaling)
scaler = StandardScaler()
X_train_scaled = train_x   # Fit and transform on training data
X_test_scaled = test_x       # Only transform on test data

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, multi_class='ovr', solver='liblinear')  # Solver 'liblinear' is good for smaller datasets
model.fit(X_train_scaled, Y_train)

# Make predictions
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

# Calculate accuracy
train_accuracy = accuracy_score(Y_train, train_predictions)
test_accuracy = accuracy_score(Y_test, test_predictions)

# Print accuracies
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Training Accuracy: 1.0000
Test Accuracy: 0.9649


In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
model_nb = MultinomialNB()

# Fit the model on the training data
model_nb.fit(X_train_scaled, Y_train)

# Make predictions on both train and test datasets
train_predictions_nb = model_nb.predict(X_train_scaled)
test_predictions_nb = model_nb.predict(X_test_scaled)

# Calculate accuracy
train_accuracy_nb = accuracy_score(Y_train, train_predictions_nb)
test_accuracy_nb = accuracy_score(Y_test, test_predictions_nb)

# Print accuracies
print(f"Training Accuracy (Naive Bayes): {train_accuracy_nb:.4f}")
print(f"Test Accuracy (Naive Bayes): {test_accuracy_nb:.4f}")

Training Accuracy (Naive Bayes): 0.9806
Test Accuracy (Naive Bayes): 0.9674
