# Citation Prediction with Unsupervised Feature Discovery

**Goal**: Predict paper citation counts using supervised learning + unsupervised feature engineering

**Pipeline**:
1. Load data (papers from MIT, Stanford, Berkeley, Michigan, Georgia Tech, Toronto)
2. Exploratory Data Analysis
3. Unsupervised Feature Discovery:
   - Topic modeling (LDA)
   - University and venue clustering
   - Abstract embeddings + PCA
4. Feature Engineering
5. Supervised Models (Random Forest, XGBoost)
6. Evaluation

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn matplotlib seaborn
!pip install sentence-transformers transformers
!pip install xgboost lightgbm
!pip install umap-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sentence_transformers import SentenceTransformer
import xgboost as xgb
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid theme and a 12x6 inch default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

Expected columns:
- `title`: Paper title
- `abstract`: Paper abstract
- `authors`: Author names (comma-separated or list)
- `university`: University name
- `venue`: Conference/journal name
- `year`: Publication year
- `citations`: Citation count (target variable)

In [None]:
# Load your data here
# df = pd.read_csv('papers_dataset.csv')

# Until real data is available, build a synthetic dataset that exercises the
# whole pipeline end to end.
np.random.seed(42)
n_samples = 1000

universities = ['MIT', 'Stanford', 'UC Berkeley', 'University of Michigan',
                'Georgia Tech', 'University of Toronto']
venues = ['NeurIPS', 'ICML', 'CVPR', 'ICCV', 'ACL', 'EMNLP', 'SIGIR', 'KDD']

# Abstracts are sampled from a few topical vocabularies. If every abstract
# were identical, each term would occur in 100% of documents and
# CountVectorizer(min_df=2, max_df=0.8) would prune the whole vocabulary,
# aborting the topic-modeling step.
# A dedicated generator is used so the global NumPy random stream (and with
# it the other synthetic columns) is not affected by abstract generation.
abstract_rng = np.random.default_rng(42)
abstract_vocab = [
    ['image', 'segmentation', 'convolutional', 'detection',
     'pixel', 'visual', 'object', 'camera'],
    ['language', 'translation', 'transformer', 'syntax',
     'corpus', 'token', 'semantic', 'dialogue'],
    ['reinforcement', 'policy', 'reward', 'agent',
     'exploration', 'bandit', 'control', 'planning'],
    ['retrieval', 'ranking', 'query', 'recommendation',
     'graph', 'embedding', 'search', 'click'],
]
dummy_abstracts = []
for _ in range(n_samples):
    # One vocabulary per paper, then 12 keywords drawn with replacement.
    vocab = abstract_vocab[abstract_rng.integers(len(abstract_vocab))]
    keywords = ' '.join(abstract_rng.choice(vocab, size=12))
    dummy_abstracts.append(f'This paper presents a novel approach to {keywords}.')

df = pd.DataFrame({
    'title': [f'Paper {i}' for i in range(n_samples)],
    'abstract': dummy_abstracts,
    'authors': [f'Author{i}, Author{i+1}' for i in range(n_samples)],
    'university': np.random.choice(universities, n_samples),
    'venue': np.random.choice(venues, n_samples),
    'year': np.random.randint(2015, 2021, n_samples),
    # Log-normal draw truncated to integers: non-negative citation counts.
    'citations': np.random.lognormal(3, 1.5, n_samples).astype(int)
})

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Basic statistics
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nCitation statistics:")
print(df['citations'].describe())

## 2. Exploratory Data Analysis

In [None]:
# Histogram of the target on the raw scale and after a log1p transform
citation_counts = df['citations']
fig, (ax_raw, ax_log) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: raw citation counts
ax_raw.hist(citation_counts, bins=50, edgecolor='black')
ax_raw.set_xlabel('Citations')
ax_raw.set_ylabel('Frequency')
ax_raw.set_title('Citation Distribution')

# Right panel: log(1 + x), which stays defined for zero-citation papers
ax_log.hist(np.log1p(citation_counts), bins=50, edgecolor='black', color='orange')
ax_log.set_xlabel('Log(Citations + 1)')
ax_log.set_ylabel('Frequency')
ax_log.set_title('Log-Transformed Citation Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Mean citation count per university, drawn as horizontal bars in ascending order
university_means = df.groupby('university')['citations'].mean()
plt.figure(figsize=(12, 6))
ax = university_means.sort_values().plot(kind='barh')
ax.set_xlabel('Average Citations')
ax.set_title('Average Citations by University')
plt.tight_layout()
plt.show()

In [None]:
# Mean citation count per venue, drawn as horizontal bars in ascending order
venue_means = df.groupby('venue')['citations'].mean()
plt.figure(figsize=(12, 6))
ax = venue_means.sort_values().plot(kind='barh', color='green')
ax.set_xlabel('Average Citations')
ax.set_title('Average Citations by Venue')
plt.tight_layout()
plt.show()

In [None]:
# Mean citation count per publication year as a line chart
yearly_mean_citations = df.groupby('year')['citations'].mean()
plt.figure(figsize=(10, 6))
yearly_mean_citations.plot(marker='o', linewidth=2)
# Labels are set after plotting so they replace the index name pandas applies
plt.xlabel('Year')
plt.ylabel('Average Citations')
plt.title('Average Citations by Publication Year')
plt.grid(True)
plt.show()

## 3. Unsupervised Feature Discovery

### 3.1 Topic Modeling (LDA)

In [None]:
# Fit an LDA topic model on bag-of-words counts of the abstracts
print("Running LDA topic modeling...")

# Bag-of-words settings: English stop words removed; a term is kept only if it
# occurs in at least 2 documents and in at most 80% of them; vocabulary is
# limited to 1000 terms.
vectorizer_params = dict(
    max_features=1000,
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
vectorizer = CountVectorizer(**vectorizer_params)
X_counts = vectorizer.fit_transform(df['abstract'])

# 15 topics; each row of topic_distributions is one document's topic mixture
n_topics = 15
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    n_jobs=-1,
    random_state=42,
)
topic_distributions = lda.fit_transform(X_counts)

print(f"Topic distributions shape: {topic_distributions.shape}")
print("\nTop words per topic:")

# For every topic, list its 10 highest-weighted vocabulary terms
feature_names = vectorizer.get_feature_names_out()
for topic_idx, word_weights in enumerate(lda.components_):
    strongest_first = np.argsort(word_weights)[::-1][:10]
    top_words = [feature_names[term_id] for term_id in strongest_first]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")

In [None]:
# Attach the per-document topic mixture as columns topic_0 .. topic_{n_topics-1}
topic_cols = ['topic_' + str(k) for k in range(n_topics)]
df_topics = pd.DataFrame(data=topic_distributions, columns=topic_cols)
# df gets a fresh 0..n-1 index so its rows line up with df_topics on concat
df = pd.concat([df.reset_index(drop=True), df_topics], axis=1)

# Index of the highest-weight topic for each document
df['dominant_topic'] = np.argmax(topic_distributions, axis=1)

print("Topic features added!")
print(f"New shape: {df.shape}")

In [None]:
# Bar chart of the mean citation count for each dominant topic
topic_mean_citations = df.groupby('dominant_topic')['citations'].mean()
plt.figure(figsize=(12, 6))
topic_mean_citations.plot(kind='bar')
# Labels are set after plotting so they replace the index name pandas applies
plt.xlabel('Topic')
plt.ylabel('Average Citations')
plt.title('Average Citations by Dominant Topic')
plt.tight_layout()
plt.show()

### 3.2 University Clustering

In [None]:
# Cluster universities based on citation patterns
print("Clustering universities...")

# NOTE(review): these statistics are computed from the target column over the
# whole dataframe, i.e. before the train/test split, so the resulting tier
# carries label information from rows that later land in the test set.
# Consider deriving the tiers from training rows only.
university_stats = df.groupby('university')['citations'].agg(['mean', 'std', 'median', 'count'])
print("\nUniversity statistics:")
print(university_stats)

# K-means clustering into at most 3 tiers
n_university_clusters = 3
# A university with a single paper has an undefined (NaN) std, which KMeans
# rejects; treat it as zero spread.
university_cluster_inputs = university_stats[['mean', 'std']].fillna(0.0)
kmeans_uni = KMeans(
    # Never request more clusters than there are universities.
    n_clusters=min(n_university_clusters, len(university_cluster_inputs)),
    random_state=42,
    # Explicit so results do not depend on the installed scikit-learn default.
    n_init=10,
)
raw_university_labels = pd.Series(
    kmeans_uni.fit_predict(university_cluster_inputs),
    index=university_stats.index,
)

# KMeans label ids are arbitrary. Renumber them by the clusters' average of
# per-university mean citations so that tier 0 is the lowest-cited group and
# the largest tier id is the highest-cited one.
university_cluster_means = university_stats['mean'].groupby(raw_university_labels).mean()
university_label_to_tier = {
    label: tier
    for tier, label in enumerate(university_cluster_means.sort_values().index)
}
university_stats['cluster'] = raw_university_labels.map(university_label_to_tier)

# Map back to dataframe
university_to_cluster = university_stats['cluster'].to_dict()
df['university_tier'] = df['university'].map(university_to_cluster)

print("\nUniversity clusters:")
print(university_stats[['mean', 'cluster']].sort_values('cluster'))

### 3.3 Venue Clustering

In [None]:
# Cluster venues based on citation patterns
print("Clustering venues...")

# NOTE(review): these statistics are computed from the target column over the
# whole dataframe, i.e. before the train/test split, so the resulting tier
# carries label information from rows that later land in the test set.
# Consider deriving the tiers from training rows only.
venue_stats = df.groupby('venue')['citations'].agg(['mean', 'std', 'median', 'count'])
print("\nVenue statistics:")
print(venue_stats)

# K-means clustering into at most 3 tiers
n_venue_clusters = 3
# A venue with a single paper has an undefined (NaN) std, which KMeans
# rejects; treat it as zero spread.
venue_cluster_inputs = venue_stats[['mean', 'std']].fillna(0.0)
kmeans_venue = KMeans(
    # Never request more clusters than there are venues.
    n_clusters=min(n_venue_clusters, len(venue_cluster_inputs)),
    random_state=42,
    # Explicit so results do not depend on the installed scikit-learn default.
    n_init=10,
)
raw_venue_labels = pd.Series(
    kmeans_venue.fit_predict(venue_cluster_inputs),
    index=venue_stats.index,
)

# KMeans label ids are arbitrary. Renumber them by the clusters' average of
# per-venue mean citations so that tier 0 is the lowest-cited group and the
# largest tier id is the highest-cited one.
venue_cluster_means = venue_stats['mean'].groupby(raw_venue_labels).mean()
venue_label_to_tier = {
    label: tier
    for tier, label in enumerate(venue_cluster_means.sort_values().index)
}
venue_stats['cluster'] = raw_venue_labels.map(venue_label_to_tier)

# Map back to dataframe
venue_to_cluster = venue_stats['cluster'].to_dict()
df['venue_tier'] = df['venue'].map(venue_to_cluster)

print("\nVenue clusters:")
print(venue_stats[['mean', 'cluster']].sort_values('cluster'))

### 3.4 Abstract Embeddings + Dimensionality Reduction

In [None]:
# Encode every abstract into a dense vector with the SciBERT checkpoint
print("Generating abstract embeddings with SciBERT...")
print("(This may take a few minutes)")

model = SentenceTransformer('allenai/scibert_scivocab_uncased')

# Abstracts are passed as a plain Python list and encoded 32 at a time
abstract_texts = df['abstract'].tolist()
embeddings = model.encode(abstract_texts, batch_size=32, show_progress_bar=True)

print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# Project the abstract embeddings onto their first 50 principal components
print("Reducing dimensionality with PCA...")

n_pca_components = 50
pca = PCA(random_state=42, n_components=n_pca_components)
embeddings_pca = pca.fit_transform(embeddings)

# Share of variance captured by each retained component
variance_ratio = pca.explained_variance_ratio_

print(f"Reduced embeddings shape: {embeddings_pca.shape}")
print(f"Explained variance ratio (first 10 components): {variance_ratio[:10]}")
print(f"Total explained variance: {variance_ratio.sum():.3f}")

In [None]:
# Attach the reduced embeddings as columns pca_0 .. pca_{n_pca_components-1}
pca_cols = ['pca_' + str(k) for k in range(n_pca_components)]
df_pca = pd.DataFrame(data=embeddings_pca, columns=pca_cols)
# df gets a fresh 0..n-1 index so its rows line up with df_pca on concat
df = pd.concat([df.reset_index(drop=True), df_pca], axis=1)

print("PCA features added!")
print(f"Final shape: {df.shape}")

## 4. Feature Engineering

In [None]:
# Create additional features
print("Creating additional features...")

# Text-based features: character lengths and whitespace-separated word count
df['abstract_length'] = df['abstract'].str.len()
df['abstract_word_count'] = df['abstract'].str.split().str.len()
df['title_length'] = df['title'].str.len()


# Author features
def _count_authors(authors):
    """Return the number of authors in one `authors` cell.

    Accepts either a comma-separated string or a list/tuple of names, the two
    formats the dataset description allows. Blank entries in a string (empty
    string, trailing comma) are not counted. Any other value, such as a
    missing entry, counts as zero authors.
    """
    if isinstance(authors, str):
        return sum(1 for name in authors.split(',') if name.strip())
    if isinstance(authors, (list, tuple)):
        return len(authors)
    return 0


df['num_authors'] = df['authors'].apply(_count_authors)

# Time features
# Year the paper age is measured against; update when the data is refreshed.
# years_since_pub is an exact linear function of `year`.
reference_year = 2026
df['years_since_pub'] = reference_year - df['year']

print("Features created!")
df[['abstract_length', 'abstract_word_count', 'num_authors', 'years_since_pub']].head()

In [None]:
# Integer-encode the two categorical columns, one fitted encoder per column
le_university = LabelEncoder()
le_venue = LabelEncoder()

for source_col, encoder in (('university', le_university), ('venue', le_venue)):
    # Writes 'university_encoded' and 'venue_encoded' respectively
    df[source_col + '_encoded'] = encoder.fit_transform(df[source_col])

print("Categorical encoding complete!")

## 5. Model Training

### 5.1 Prepare Features

In [None]:
# Assemble the model inputs group by group; the final column order is
# raw -> encoded categoricals -> cluster/topic ids -> topic mixture -> PCA.
raw_feature_names = [
    'abstract_length', 'abstract_word_count', 'title_length',
    'num_authors', 'years_since_pub', 'year',
]
encoded_feature_names = ['university_encoded', 'venue_encoded']
cluster_feature_names = ['university_tier', 'venue_tier', 'dominant_topic']

feature_cols = [
    *raw_feature_names,
    *encoded_feature_names,
    *cluster_feature_names,
    *topic_cols,
    *pca_cols,
]

# Feature matrix and regression target (raw citation counts)
X = df[feature_cols]
y = df['citations']

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTotal features: {len(feature_cols)}")

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Standardize features: statistics are learned from the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled!")

### 5.2 Baseline: Random Forest

In [None]:
# Fit the Random Forest regressor on the unscaled training features and
# report RMSE, MAE and R² on both the training and the test split.
print("Training Random Forest...")

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf.fit(X_train, y_train)

# Predictions
y_pred_rf_train = rf.predict(X_train)
y_pred_rf_test = rf.predict(X_test)

# Evaluation (train vs. test side by side to expose overfitting)
print("\n=== Random Forest Results ===")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_pred_rf_train)):.2f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf_test)):.2f}")
print(f"Train MAE: {mean_absolute_error(y_train, y_pred_rf_train):.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred_rf_test):.2f}")
print(f"Train R²: {r2_score(y_train, y_pred_rf_train):.3f}")
print(f"Test R²: {r2_score(y_test, y_pred_rf_test):.3f}")

In [None]:
# Rank features by the Random Forest's impurity-based importances
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

top_features = feature_importance.head(20)
print("\nTop 20 Most Important Features:")
print(top_features)

# Plot
# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# would leave a separately created figure empty; so the Axes is passed in.
fig, ax = plt.subplots(figsize=(12, 8))
top_features.plot(x='feature', y='importance', kind='barh', ax=ax)
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances (Random Forest)')
# Put the most important feature at the top of the chart
ax.invert_yaxis()
plt.tight_layout()
plt.show()

### 5.3 XGBoost

In [None]:
# Fit the XGBoost regressor on the unscaled training features and report
# RMSE, MAE and R² on both the training and the test split.
print("Training XGBoost...")

xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb_train = xgb_model.predict(X_train)
y_pred_xgb_test = xgb_model.predict(X_test)

# Evaluation (train vs. test side by side to expose overfitting)
print("\n=== XGBoost Results ===")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_pred_xgb_train)):.2f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_xgb_test)):.2f}")
print(f"Train MAE: {mean_absolute_error(y_train, y_pred_xgb_train):.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred_xgb_test):.2f}")
print(f"Train R²: {r2_score(y_train, y_pred_xgb_train):.3f}")
print(f"Test R²: {r2_score(y_test, y_pred_xgb_test):.3f}")

## 6. Model Evaluation

In [None]:
# Predicted vs. actual citations on the test split, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# End points of the dashed "perfect prediction" diagonal
actual_range = [y_test.min(), y_test.max()]

# Random Forest
axes[0].scatter(y_test, y_pred_rf_test, alpha=0.5)
axes[0].plot(actual_range, actual_range, 'r--', lw=2)
axes[0].set_xlabel('Actual Citations')
axes[0].set_ylabel('Predicted Citations')
axes[0].set_title(f'Random Forest (R² = {r2_score(y_test, y_pred_rf_test):.3f})')

# XGBoost
axes[1].scatter(y_test, y_pred_xgb_test, alpha=0.5, color='green')
axes[1].plot(actual_range, actual_range, 'r--', lw=2)
axes[1].set_xlabel('Actual Citations')
axes[1].set_ylabel('Predicted Citations')
axes[1].set_title(f'XGBoost (R² = {r2_score(y_test, y_pred_xgb_test):.3f})')

plt.tight_layout()
plt.show()

In [None]:
# Residuals (actual minus predicted) against predictions, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

residuals_rf = y_test - y_pred_rf_test
residuals_xgb = y_test - y_pred_xgb_test

# (axes, predictions, residuals, title, extra scatter options) per panel
residual_panels = (
    (axes[0], y_pred_rf_test, residuals_rf, 'Random Forest Residuals', {}),
    (axes[1], y_pred_xgb_test, residuals_xgb, 'XGBoost Residuals', {'color': 'green'}),
)
for panel_ax, predicted, residuals, panel_title, scatter_opts in residual_panels:
    panel_ax.scatter(predicted, residuals, alpha=0.5, **scatter_opts)
    # Dashed zero line: points above it are under-predictions
    panel_ax.axhline(y=0, color='r', linestyle='--', lw=2)
    panel_ax.set_xlabel('Predicted Citations')
    panel_ax.set_ylabel('Residuals')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Model comparison summary: one row per model with its test-split metrics
comparison = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost'],
    'Test RMSE': [
        np.sqrt(mean_squared_error(y_test, y_pred_rf_test)),
        np.sqrt(mean_squared_error(y_test, y_pred_xgb_test))
    ],
    'Test MAE': [
        mean_absolute_error(y_test, y_pred_rf_test),
        mean_absolute_error(y_test, y_pred_xgb_test)
    ],
    'Test R²': [
        r2_score(y_test, y_pred_rf_test),
        r2_score(y_test, y_pred_xgb_test)
    ]
})

print("\n=== Model Comparison ===")
print(comparison)

# Determine best model: the row with the highest test R²
best_model_idx = comparison['Test R²'].idxmax()
best_model = comparison.loc[best_model_idx, 'Model']
print(f"\n🏆 Best Model: {best_model}")

## 7. Feature Group Analysis

Analyze the contribution of unsupervised features

In [None]:
# Ablation: retrain the Random Forest on the hand-crafted features only and
# compare it with the full-feature Random Forest on the same test split.
basic_features = ['abstract_length', 'abstract_word_count', 'title_length',
                  'num_authors', 'years_since_pub', 'year',
                  'university_encoded', 'venue_encoded']

X_train_basic = X_train[basic_features]
X_test_basic = X_test[basic_features]

# Same hyperparameters as the full Random Forest, so the two runs differ only
# in their feature set and the R² gap is attributable to the features.
rf_basic = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_basic.fit(X_train_basic, y_train)
y_pred_basic = rf_basic.predict(X_test_basic)

print("=== Baseline (No Unsupervised Features) ===")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_basic)):.2f}")
print(f"Test R²: {r2_score(y_test, y_pred_basic):.3f}")

print("\n=== Full Model (With Unsupervised Features) ===")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf_test)):.2f}")
print(f"Test R²: {r2_score(y_test, y_pred_rf_test):.3f}")

# Positive value: the unsupervised features helped on the test split
improvement = r2_score(y_test, y_pred_rf_test) - r2_score(y_test, y_pred_basic)
print(f"\n✅ R² Improvement from Unsupervised Features: {improvement:.3f}")

## 8. Save Models

In [None]:
import pickle

# Save best model and preprocessing objects
with open('best_model.pkl', 'wb') as f:
    pickle.dump(rf if best_model == 'Random Forest' else xgb_model, f)

# NOTE(review): both models were fit on the unscaled X_train, so this scaler
# must not be applied to their inputs at inference time.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Everything needed to rebuild the feature matrix for new papers: fitted
# encoders/transformers, the category -> tier lookups, and the column order
# the models were trained with.
with open('encoders.pkl', 'wb') as f:
    pickle.dump({
        'university': le_university,
        'venue': le_venue,
        'vectorizer': vectorizer,
        'lda': lda,
        'pca': pca,
        'sentence_model': model,
        'university_to_cluster': university_to_cluster,
        'venue_to_cluster': venue_to_cluster,
        'feature_cols': feature_cols,
    }, f)

print("Models and preprocessors saved!")

## Summary

This notebook demonstrates:
1. ✅ **Unsupervised feature discovery** using LDA, clustering, and embeddings
2. ✅ **Feature engineering** from discovered patterns
3. ✅ **Supervised prediction** using Random Forest and XGBoost
4. ✅ **Evaluation** showing improvement from unsupervised features

**Next steps**:
- Load real data from your professor
- Tune hyperparameters with grid search
- Try additional models (LightGBM, neural networks)
- Feature selection to reduce dimensionality
- Cross-validation for robust evaluation