In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import Counter

In [2]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Define directories
DATA_DIR = Path('/home/abdellah-ennajari/Desktop/AI-Powered-Academic-Research-Assistant')
PROCESSED_DIR = DATA_DIR / 'Data/processed'

In [5]:
#Load cleaned data
papers_df = pd.read_csv(PROCESSED_DIR / 'cleaned_papers.csv')

# ====================================================
# Data Preparation
# ====================================================


In [6]:
# Create a simple target variable (e.g., categorize papers based on keywords)
def categorize_paper(title):
    if 'machine learning' in title.lower():
        return 'ML'
    elif 'deep learning' in title.lower():
        return 'DL'
    elif 'natural language processing' in title.lower():
        return 'NLP'
    else:
        return 'Other'

papers_df['category'] = papers_df['title'].apply(categorize_paper)


In [7]:
# Encode labels
label_encoder = LabelEncoder()
papers_df['category_encoded'] = label_encoder.fit_transform(papers_df['category'])


In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    papers_df['title'], papers_df['category_encoded'], test_size=0.2, random_state=42
)

# ====================================================
# Text Preprocessing
# ====================================================


In [10]:
# Build vocabulary
def build_vocab(texts, max_vocab_size=10000):
    words = []
    for text in texts:
        words.extend(re.findall(r'\b\w+\b', text.lower()))
    word_counts = Counter(words)
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.most_common(max_vocab_size))}
    vocab['<PAD>'] = 0  # Padding token
    return vocab

vocab = build_vocab(X_train)

In [11]:
# Convert text to sequences of integers
def text_to_sequence(text, vocab):
    return [vocab.get(word, 0) for word in re.findall(r'\b\w+\b', text.lower())]

X_train_seq = [text_to_sequence(text, vocab) for text in X_train]
X_test_seq = [text_to_sequence(text, vocab) for text in X_test]

In [12]:
# Pad sequences to a fixed length
max_len = 20  # Maximum sequence length
X_train_padded = np.array([seq[:max_len] + [0] * (max_len - len(seq)) for seq in X_train_seq])
X_test_padded = np.array([seq[:max_len] + [0] * (max_len - len(seq)) for seq in X_test_seq])


In [13]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long).to(device)




# ====================================================
# Define Dataset and DataLoader
# ====================================================


In [14]:
class PaperDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = PaperDataset(X_train_tensor, y_train_tensor)
test_dataset = PaperDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# ====================================================
# Define the Model
# ====================================================

In [15]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim * max_len, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x


In [16]:
# Hyperparameters
vocab_size = len(vocab)
embed_dim = 100
num_classes = len(label_encoder.classes_)

# Initialize model, loss function, and optimizer
model = TextClassifier(vocab_size, embed_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ====================================================
# Training Loop
# ====================================================

In [17]:

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/5], Loss: 0.0332
Epoch [2/5], Loss: 0.0451
Epoch [3/5], Loss: 0.0000
Epoch [4/5], Loss: 0.0001
Epoch [5/5], Loss: 0.0110


# ====================================================
# Evaluation
# ====================================================

In [18]:
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(y_batch.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 99.37%


In [19]:
# ====================================================
# Save the Model
# ====================================================


In [20]:
torch.save(model.state_dict(), PROCESSED_DIR / 'text_classifier.pth')
print("\nModel saved to 'processed' directory.")


Model saved to 'processed' directory.
