In [None]:
import inspect
import os
import re
import string #
from collections import Counter

import joblib
import nltk #
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords #
from nltk.tokenize import word_tokenize #
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer #
from sklearn.linear_model import SGDClassifier, LogisticRegression #
from sklearn.metrics import accuracy_score, classification_report # 
from sklearn.model_selection import train_test_split, GridSearchCV #
from sklearn.multiclass import OneVsRestClassifier #
from sklearn.naive_bayes import MultinomialNB #
from sklearn.preprocessing import LabelEncoder #
from sklearn.svm import SVC #
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, Trainer, TrainingArguments

In [2]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\dimit\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# Fetch the NLTK data packages: the stop-word lists and the 'punkt' tokenizer
# data. Each call returns a bool; the cell displays the value of the last one.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Location of the DBLP v10 CSV export. Set the DBLP_CSV_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
data_path = os.environ.get(
    "DBLP_CSV_PATH", r'C:\Users\dimit\Desktop\nlp_data\dblp-v10.csv'
)

In [4]:
# Fail fast if the CSV is missing or unreadable, before the expensive load.
with open(data_path, mode="r"):
    print("File opened successfully!")


File opened successfully!


In [None]:
#drive.mount('/content/drive')
#data_path = r'/content/drive/My Drive/dblp-v10.csv'

Data load

In [None]:
# Load the DBLP export, keep only rows that have both an abstract (the model
# input) and a venue (the target), and cap the sample at 10,000 rows.
# Written as a single non-mutating chain (no inplace=True). The abstracts are
# lower-cased after the cap so only the retained rows are processed; the
# resulting frame is the same as lower-casing first.
df = (
    pd.read_csv(data_path)
    .dropna(subset=["abstract", "venue"])
    .head(10000)
    .assign(abstract=lambda frame: frame["abstract"].str.lower())
)
print(df.shape)

(10000, 8)


In [6]:
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,"In this paper, a robust 3D triangular mesh wat...","['S. Ben Jabra', 'Ezzeddine Zagrouba']",50,"['09cb2d7d-47d1-4a85-bfe5-faa8221e644b', '10aa...",A new approach of 3D watermarking based on ima...,international symposium on computers and commu...,2008,4ab3735c-80f1-472d-b953-fa0557fed28b
1,We studied an autoassociative neural network w...,"['Joaquín J. Torres', 'Jesús M. Cortés', 'Joaq...",50,"['4017c9d2-9845-4ad2-ad5b-ba65523727c5', 'b118...",Attractor neural networks with activity-depend...,Neurocomputing,2007,4ab39729-af77-46f7-a662-16984fb9c1db
2,It is well-known that Sturmian sequences are t...,"['Genevi eve Paquin', 'Laurent Vuillon']",50,"['1c655ee2-067d-4bc4-b8cc-bc779e9a7f10', '2e4e...",A characterization of balanced episturmian seq...,Electronic Journal of Combinatorics,2007,4ab3a4cf-1d96-4ce5-ab6f-b3e19fc260de
3,One of the fundamental challenges of recognizi...,"['Yaser Sheikh', 'Mumtaz Sheikh', 'Mubarak Shah']",221,"['056116c1-9e7a-4f9b-a918-44eb199e67d6', '05ac...",Exploring the space of a human action,international conference on computer vision,2005,4ab3a98c-3620-47ec-b578-884ecf4a6206
4,This paper generalizes previous optimal upper ...,"['Efraim Laksman', 'Håkan Lennerstad', 'Magnus...",0,"['01a765b8-0cb3-495c-996f-29c36756b435', '5dbc...",Generalized upper bounds on the minimum distan...,Ima Journal of Mathematical Control and Inform...,2015,4ab3b585-82b4-4207-91dd-b6bce7e27c4e


In [None]:

# Map every venue name to an integer class id. The fitted encoder is kept so
# ids can be translated back to venue names through label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(df["venue"])
df["venue_label"] = label_encoder.transform(df["venue"])

{'ACM Communications in Computer Algebra': 0, 'ACM Computing Surveys': 1, 'ACM Crossroads Student Magazine': 2, 'ACM Queue': 3, 'ACM Sigapl Apl Quote Quad': 4, 'ACM Sigarch Computer Architecture News': 5, 'ACM Sigois Bulletin': 6, 'ACM Sigsoft Software Engineering Notes': 7, 'ACM Transactions in Embedded Computing Systems': 8, 'ACM Transactions on Architecture and Code Optimization': 9, 'ACM Transactions on Autonomous and Adaptive Systems': 10, 'ACM Transactions on Computational Logic': 11, 'ACM Transactions on Computer Systems': 12, 'ACM Transactions on Computer-Human Interaction': 13, 'ACM Transactions on Database Systems': 14, 'ACM Transactions on Design Automation of Electronic Systems': 15, 'ACM Transactions on Graphics': 16, 'ACM Transactions on Information Systems': 17, 'ACM Transactions on Mathematical Software': 18, 'ACM Transactions on Multimedia Computing, Communications, and Applications': 19, 'ACM Transactions on Programming Languages and Systems': 20, 'ACM Transactions on

In [None]:
# Hold out 20% of the abstracts for validation; the fixed seed keeps the split
# reproducible between runs.
abstract_texts = df["abstract"].tolist()
venue_ids = df["venue_label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    abstract_texts,
    venue_ids,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Learn the TF-IDF vocabulary (capped at 10,000 terms) on the training
# abstracts only, then project both splits into that feature space.
# The tuple is evaluated left to right, so the vectorizer is fitted before the
# validation texts are transformed.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf, X_val_tfidf = (
    vectorizer.fit_transform(train_texts),
    vectorizer.transform(val_texts),
)


In [None]:
# Baseline classifiers, all trained on the TF-IDF features.
models = {
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SGD Classifier": SGDClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Fit each model on the vectorised training abstracts and score it on the
# held-out validation split. The estimators need numeric features, so they
# receive the TF-IDF matrices rather than the raw text lists, and accuracy is
# computed from validation predictions so it lines up with val_labels.
results = {}
val_predictions = {}
for name, model in models.items():
    model.fit(X_train_tfidf, train_labels)
    y_pred = model.predict(X_val_tfidf)
    val_predictions[name] = y_pred
    acc = accuracy_score(val_labels, y_pred)
    results[name] = acc
    print(f"{name}: Accuracy = {acc:.4f}")

# Reuse the stored predictions instead of predicting a second time.
for name, y_pred in val_predictions.items():
    print(f"Classification Report for {name}:")
    # zero_division=0 reports 0 for classes that are never predicted or have
    # no validation samples, instead of emitting a warning for each one.
    print(classification_report(val_labels, y_pred, zero_division=0))


# Hyper-parameter grid for the one-vs-rest SGD classifier. The "estimator__"
# prefix routes each setting to the SGDClassifier wrapped by OneVsRestClassifier.
param_grid_sgd = {
    "estimator__loss": ["hinge", "log_loss"],
    "estimator__alpha": [1e-4, 1e-3, 1e-2],
    "estimator__max_iter": [1000, 2000],
    "estimator__tol": [1e-3, 1e-4],
}

grid_search_sgd = GridSearchCV(
    OneVsRestClassifier(SGDClassifier()),
    param_grid_sgd,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
# Search on the TF-IDF training features against the training labels. The
# previous call passed the raw training texts as X and the validation texts
# as y, which are neither numeric features nor labels.
grid_search_sgd.fit(X_train_tfidf, train_labels)

print("Best param:", grid_search_sgd.best_params_)
print("Best score:", grid_search_sgd.best_score_)

### BERT

In [None]:

# Tokenise both splits with identical settings: truncate to at most 512
# tokens and pad within each call.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)


In [None]:
class VenueDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer venue labels.

    `Dataset` is `torch.utils.data.Dataset`; it must be imported in the
    notebook's import cell, otherwise this class definition raises NameError.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence;
            ``encodings[key][idx]`` must be convertible by ``torch.tensor``.
        labels: Indexable sequence of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [None]:

# Wrap each split's encodings and labels in a dataset for the Trainer.
train_dataset = VenueDataset(encodings=train_encodings, labels=train_labels)
val_dataset = VenueDataset(encodings=val_encodings, labels=val_labels)

In [9]:

# Full-corpus TF-IDF matrix (dense) and label vector.
# This cell read df['processed_abstract'] and df['encoded venue'], which no
# cell in this notebook creates, so it raised KeyError on a fresh run. Those
# columns are still used when present; otherwise the cell falls back to the
# lower-cased 'abstract' and the 'venue_label' column created earlier.
# NOTE(review): the fallback assumes these are the intended equivalents - confirm.
# NOTE: this rebinds `vectorizer` (5,000 features), replacing the instance
# that produced X_train_tfidf / X_val_tfidf.
text_column = 'processed_abstract' if 'processed_abstract' in df.columns else 'abstract'
label_column = 'encoded venue' if 'encoded venue' in df.columns else 'venue_label'

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df[text_column]).toarray()
y = df[label_column]


In [None]:
# Pre-trained BERT encoder with a new classification head sized to the number
# of distinct venue ids.
# `use_fast` was removed: it is a tokenizer-loading option, not a model or
# config argument, and unknown keyword arguments are forwarded to the model
# constructor, which does not accept it.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=int(df["venue_label"].nunique()),
)


In [None]:
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. Use whichever
# name the installed TrainingArguments accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; three epochs of fine-tuning with a
# batch size of 8 per device.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_key: "epoch"},
)

In [None]:
def _validation_accuracy(eval_pred):
    """Return the accuracy of arg-max class predictions against the reference labels."""
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_ids)}


# Fine-tune with the Hugging Face Trainer API, reporting accuracy at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_validation_accuracy,
)
trainer.train()

In [None]:

# Predict on the validation split and take the highest-scoring class per sample.
predictions = trainer.predict(val_dataset)
y_preds = np.argmax(predictions.predictions, axis=1)

# Report only the classes that occur in the validation labels or predictions.
# Passing every encoder class as target_names makes classification_report
# raise whenever the split does not contain all of them, so the names are
# selected with the same ids that are passed as `labels`.
report_labels = np.unique(np.concatenate([np.asarray(val_labels), y_preds]))
print(
    classification_report(
        val_labels,
        y_preds,
        labels=report_labels,
        target_names=label_encoder.classes_[report_labels],
        zero_division=0,
    )
)

In [None]:
# Save trained model & tokenizer for future inference
model.save_pretrained("bert_venue_prediction")
tokenizer.save_pretrained("bert_venue_prediction")

print("Model and tokenizer saved successfully!")