# **Author prediction**

Through this notebook we will focus on predicting authors, our goal is to recommend the most likely authors for a given research paper based on its title, abstract, and references.
Type of task is similar to multi-label classification.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from scipy.sparse import hstack
import nltk
import ast
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import joblib

In [2]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# prompt: import a dataset from a zipfile in my drive

import zipfile
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Replace 'your_zip_file.zip' and 'your_dataset.csv' with your actual file names
zip_file_path = '/content/drive/My Drive/dblp-v10.zip'
csv_file_name = 'dblp-v10.csv'

try:
  with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/temp_data') # Extract to a temporary directory

  df = pd.read_csv(f'/content/temp_data/{csv_file_name}')

except FileNotFoundError:
  print(f"Error: File not found at {zip_file_path}")
except KeyError:
    print(f"Error: File {csv_file_name} not found inside the zip archive.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


Mounted at /content/drive


In [10]:
from multiprocessing import process
# Preprocess the authors column
def preprocess_authors(df):
    # Convert string lists to actual lists
    df['authors'] = df['authors'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    # Normalize names (lowercase, strip whitespace)
    df['authors'] = df['authors'].apply(lambda x: [i.lower() for i in x] if isinstance(x, list) else x)
    return df

# Filter frequent authors
def filter_frequent_authors(df, min_papers=5):
    all_authors = [author for sublist in df['authors'] for author in sublist]
    author_counts = Counter(all_authors)
    frequent_authors = {author for author, count in author_counts.items() if count >= min_papers}
    df['authors'] = df['authors'].apply(lambda lst: [a for a in lst if a in frequent_authors])
    return df

def create_author_matrix(df):
    mlb = MultiLabelBinarizer()
    author_matrix = mlb.fit_transform(df['authors'])
    return author_matrix, mlb

# Preprocess text (abstract + title)
def preprocess_text(df):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean(text):
        text = text.lower() # lowercase the text
        text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
        tokens =  word_tokenize(text)# tokenize
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return ' '.join(tokens)

    # Cleaning the title and abstract columns
    df['clean_title'] = df['title'].astype(str).apply(clean)
    df['clean_abstract'] = df['abstract'].astype(str).apply(clean)

    # Combine title and abstract for feature extraction
    df['combined_text'] = df['clean_title'] + ' ' + df['clean_abstract']

    return df

# Feature engineering
def create_features(df):
    # TF-IDF for text
    tfidf = TfidfVectorizer(max_features=5000) # limitation of features for speed
    X_text = tfidf.fit_transform(df['combined_text'])

    # One-hot encode venue
    venue_encoder = OneHotEncoder(handle_unknown='ignore')
    X_venue = venue_encoder.fit_transform(df[['venue']])

    # Normalize year
    df['year'] = (df['year'] - df['year'].min()) / (df['year'].max() - df['year'].min())
    X_year = df['year'].values.reshape(-1, 1)

    # Combine all features
    X = hstack([X_text, X_venue, X_year])
    return X, tfidf

# Train and evaluate the model
def train_and_evaluate(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    # Train a classifier
    # Creating the OneVsRestClassifier using SGDClassifier
    sgd = SGDClassifier(loss='log_loss', max_iter=5, tol=None)
    model = OneVsRestClassifier(sgd)

    # fiting
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    print("F1 Score (Micro):", f1_score(y_test, y_pred, average='micro'))
    print("Precision (Micro):", precision_score(y_test, y_pred, average='micro'))
    print("Recall (Micro):", recall_score(y_test, y_pred, average='micro'))

    return model


def get_top_k_predictions(model, X, mlb, k=5):
    """
    Returns a list of top-k predicted authors for each sample.
    Uses decision_function if available, otherwise predict_proba.
    """
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X)
    elif hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)
    else:
        raise AttributeError("Model does not support decision_function or predict_proba.")

    top_k_authors = []
    for score in scores:
        top_indices = score.argsort()[-k:][::-1]
        top_k_authors.append([mlb.classes_[i] for i in top_indices])
    return top_k_authors

# Example usage:
# Assuming you have a test set X_test and your MultiLabelBinarizer (mlb) from create_author_matrix
# top_k_authors = get_top_k_predictions(model, X_test, mlb, k=5)
# print("Top-5 predicted authors for first test sample:", top_k_authors[0])


In [5]:
df.shape

(1000000, 8)

In [4]:
print(df.head()) # Display first few rows of the DataFrame

                                            abstract  \
0  In this paper, a robust 3D triangular mesh wat...   
1  We studied an autoassociative neural network w...   
2  It is well-known that Sturmian sequences are t...   
3  One of the fundamental challenges of recognizi...   
4  This paper generalizes previous optimal upper ...   

                                             authors  n_citation  \
0             ['S. Ben Jabra', 'Ezzeddine Zagrouba']          50   
1  ['Joaquín J. Torres', 'Jesús M. Cortés', 'Joaq...          50   
2           ['Genevi eve Paquin', 'Laurent Vuillon']          50   
3  ['Yaser Sheikh', 'Mumtaz Sheikh', 'Mubarak Shah']         221   
4  ['Efraim Laksman', 'Håkan Lennerstad', 'Magnus...           0   

                                          references  \
0  ['09cb2d7d-47d1-4a85-bfe5-faa8221e644b', '10aa...   
1  ['4017c9d2-9845-4ad2-ad5b-ba65523727c5', 'b118...   
2  ['1c655ee2-067d-4bc4-b8cc-bc779e9a7f10', '2e4e...   
3  ['056116c1-9e7a-4f9b-a918-4

In [6]:
df["authors"]

Unnamed: 0,authors
0,"['S. Ben Jabra', 'Ezzeddine Zagrouba']"
1,"['Joaquín J. Torres', 'Jesús M. Cortés', 'Joaq..."
2,"['Genevi eve Paquin', 'Laurent Vuillon']"
3,"['Yaser Sheikh', 'Mumtaz Sheikh', 'Mubarak Shah']"
4,"['Efraim Laksman', 'Håkan Lennerstad', 'Magnus..."
...,...
999995,"['Jeril Kuriakose', 'Sandeep Joshi']"
999996,"['Anh Khoa Bui', 'ZheKai Xiao', 'Liter Siek']"
999997,"['Sarah E. Ballinger', 'Thomas A. Adams']"
999998,"['Ben London', 'Bert Huang', 'Lise Getoor']"


In [7]:
# keeping a copy of the original datafram
df_original = df.copy()

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(693622, 8)

In [11]:
# Step 2: Preprocess authors
df = preprocess_authors(df)

In [12]:
# Step 3: delete instances with empty list of authors
dfNoEmpty = df[df['authors'].apply(lambda x: isinstance(x, list) and len(x) > 0)]


In [13]:
# Step 4: Filter frequent authors ( a assumption that we keep only authors with at least 5 papers)
df_filtered = filter_frequent_authors(dfNoEmpty, min_papers=5)

In [14]:
#  delete from df_filtered the lines with authors is an emplty list

df_filtered = df_filtered[df_filtered['authors'].apply(lambda x: len(x) > 0)]


In [15]:
dfNoEmpty.shape, df_filtered.shape

((693622, 8), (590211, 8))

In [None]:
# Step 4: Create author matrix
author_matrix, mlb = create_author_matrix(df_filtered)

In [None]:
author_matrix, mlb

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 MultiLabelBinarizer())

In [None]:
author_matrix.shape

In [16]:
all_authors = [author for sublist in df_filtered['authors'] for author in sublist]
print("Unique authors after filtering:", len(set(all_authors)))

Unique authors after filtering: 99521


In [18]:
# Step 5: Preprocess text
df_filtered = preprocess_text(df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = df['title'].astype(str).apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_abstract'] = df['abstract'].astype(str).apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_text'] = df['clean_title'] + ' ' + df['clean_abstract']


In [19]:
# save the preproced dataframe to a new csv file
df_filtered.to_csv('preprocessed_data.csv', index=False)

In [20]:
# Step 6: Create features
X, tfidf = create_features(df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = (df['year'] - df['year'].min()) / (df['year'].max() - df['year'].min())


In [None]:
# Verify shapes
print("Shape of X:", X.shape)
print("Shape of author_matrix:", author_matrix.shape)
print("Rows in df_filtered:", df_filtered.shape[0])

Shape of X: (499268, 8337)
Shape of author_matrix: (499268, 40773)
Rows in df_filtered: 499268


In [None]:
import joblib

# Cache the TF-IDF vectorizer, the feature matrix, and the preprocessed DataFrame.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(X, 'features_X.pkl')
joblib.dump(df_filtered, 'df_filtered.pkl')


['df_filtered.pkl']

### Training method 1: Full batch
the shapes are too huge so it's not realy possible to do a full batch traininh

In [None]:
def train_and_evaluate_dense(X, y):
    # Split data into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Use a baseline classifier (e.g., Logistic Regression)
    model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))

    # Train the model on the full dataset
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate using common metrics
    print("F1 Score (Micro):", f1_score(y_test, y_pred, average='micro'))
    print("Precision (Micro):", precision_score(y_test, y_pred, average='micro'))
    print("Recall (Micro):", recall_score(y_test, y_pred, average='micro'))

    return model

# Train using dense arrays
model = train_and_evaluate_dense(X, author_matrix)


### Methode 2: with batch learning
but i have essues with this method and i couldn't to fixe them

In [None]:
def train_and_evaluate_batch(X, y, batch_size=1000, test_batch_size=5000, checkpoint_interval=100):
    # Split data into train/test indices
    indices = np.arange(X.shape[0])
    train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=42)

    # Hyperparameter tuning: You can adjust batch_size, max_iter, or learning rate here.
    sgd = SGDClassifier(
        loss='log_loss',          # Logistic regression
        penalty='l2',
        max_iter=1,               # One epoch per batch
        learning_rate='adaptive',
        eta0=0.1,
        random_state=42
    )
    model = OneVsRestClassifier(sgd)

    # Initialize MultiLabelBinarizer for multi-label classification
    mlb = MultiLabelBinarizer()
    # Convert the sparse matrix to a dense array before fitting MultiLabelBinarizer.
    # Then, convert each row to a list of author indices.
    y_dense = y.toarray()
    y_iterable = [np.where(row)[0].tolist() for row in y_dense]
    mlb.fit(y_iterable)  # Fit to get all possible classes
    classes = mlb.classes_  # Get all unique classes

    # Training loop with progress logging and checkpointing
    for i in tqdm(range(0, len(train_idx), batch_size), desc="Training batches"):
        batch_indices = train_idx[i:i+batch_size]
        X_batch = X[batch_indices].tocsr()
        y_batch = y[batch_indices].toarray()  # Convert sparse to dense array for training

        # Use the fitted MultiLabelBinarizer to transform labels for partial_fit
        y_batch_transformed = mlb.transform( [np.where(row)[0].tolist() for row in y_batch] )

        model.partial_fit(X_batch, y_batch_transformed, classes=classes) # Pass transformed labels


        # Save a checkpoint every checkpoint_interval batches
        if ((i // batch_size) + 1) % checkpoint_interval == 0:
            checkpoint_filename = f'model_checkpoint_batch_{(i // batch_size) + 1}.pkl'
            joblib.dump(model, checkpoint_filename)
            print(f"Checkpoint saved: {checkpoint_filename}")

    # Evaluation loop with progress logging
    y_true_batches = []
    y_pred_batches = []
    for i in tqdm(range(0, len(test_idx), test_batch_size), desc="Evaluating batches"):
        batch_indices = test_idx[i:i+test_batch_size]
        X_batch = X[batch_indices].tocsr()
        y_batch_true = y[batch_indices].toarray()

        # Transform true labels for evaluation using the same MultiLabelBinarizer
        y_batch_true_transformed = mlb.transform([np.where(row)[0].tolist() for row in y_batch_true])

        y_batch_pred = model.predict(X_batch)
        y_true_batches.append(y_batch_true_transformed) # Append transformed true labels
        y_pred_batches.append(y_batch_pred)

    y_true = np.vstack(y_true_batches)
    y_pred = np.vstack(y_pred_batches)

    print("F1 Score (Micro):", f1_score(y_true, y_pred, average='micro'))
    print("Precision (Micro):", precision_score(y_true, y_pred, average='micro'))
    print("Recall (Micro):", recall_score(y_true, y_pred, average='micro'))

    return model

In [None]:
# Step 1: Filter authors
author_counts = np.array(author_matrix_sparse.sum(axis=0)).ravel()
top_author_indices = np.argsort(-author_counts)[:1000]
author_matrix_filtered = author_matrix_sparse[:, top_author_indices].tocsc()

# Step 2: Remove empty papers
non_empty_mask = author_matrix_filtered.sum(axis=1) > 0
non_empty_indices = np.where(non_empty_mask)[0]  # Get 1D array of indices
author_matrix_filtered = author_matrix_filtered[non_empty_indices]
X_filtered = X_sparse[non_empty_indices]

# Step 3: Convert labels to dense
author_matrix_dense = author_matrix_filtered.toarray()

# Step 4: Train with classes initialized
classes = np.arange(author_matrix_dense.shape[1])
model = train_and_evaluate_batch3(X_filtered, author_matrix_dense, classes)

In [None]:
# Example prediction
example_idx = 0
example_text = df.iloc[example_idx]['text']
example_venue = df.iloc[example_idx]['venue']
example_year = df.iloc[example_idx]['year']

# TF-IDF for text
tfidf = TfidfVectorizer(max_features=5000)
# One-hot encode venue
venue_encoder = OneHotEncoder(handle_unknown='ignore')

example_features = hstack([
    tfidf.transform([example_text]),
    venue_encoder.transform([[example_venue]]),
    np.array([[example_year]])
])
predicted_authors = model.predict(example_features)
predicted_author_names = mlb.inverse_transform(predicted_authors)
print(f"Predicted authors for example paper: {predicted_author_names}")

## An advanced model

For advanced models we can try the methods:
- specter
- SCIBERT

In [None]:
! pip install sentence-transformers

In [23]:
df_filtered.columns

Index(['abstract', 'authors', 'n_citation', 'references', 'title', 'venue',
       'year', 'id', 'clean_title', 'clean_abstract', 'combined_text'],
      dtype='object')

In [None]:
from sentence_transformers import SentenceTransformer

# Load the SPECTER model (suitable for scientific texts)
model = SentenceTransformer('allenai-specter')

# Generate embeddings for the combined text
# Note: For large datasets, you might want to process in batches.
embeddings = model.encode(df_filtered['combined_text'].tolist(), show_progress_bar=True)

# Convert embeddings to a NumPy array
import numpy as np
X_embeddings = np.array(embeddings)

print("Shape of Transformer Embeddings:", X_embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/18445 [00:00<?, ?it/s]