# Unsupervised Learning on AUB Dataset

**Goal**: Apply unsupervised learning techniques to discover patterns, clusters, and latent features in the AUB publication dataset

**Techniques**:
1. Topic Modeling (LDA) - Discover research themes in abstracts
2. Clustering - Group similar papers by abstract content (author and venue clustering are not covered in this notebook)
3. Dimensionality Reduction - PCA and t-SNE for visualization (UMAP is a possible alternative but is not used in this notebook)
4. Anomaly Detection - Identify outliers and unusual papers
5. Feature Discovery - Extract new features for downstream models

In [None]:
import sys
# Add the parent directory to the module search path.
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA, TruncatedSVD, NMF
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. convergence messages
# from the models fitted below) -- confirm that is intended.
warnings.filterwarnings('ignore')

# Display settings: show all DataFrame columns, set plot style and palette.
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

## 1. Load Cleaned AUB Data

In [None]:
# Read the cleaned dataset from the processed-data directory.
# NOTE: unpickling executes arbitrary code; only load files you trust.
df = pd.read_pickle('../data/processed/cleaned_data.pkl')

# Quick size report: dimensions, deep memory footprint in MB, column names.
report_lines = (
    f"Dataset shape: {df.shape}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    f"\nColumns: {list(df.columns)}",
)
for report_line in report_lines:
    print(report_line)

In [None]:
# Basic info
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nFirst few rows:")
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

## 2. Topic Modeling with LDA

Discover latent research topics in paper abstracts using Latent Dirichlet Allocation

In [None]:
# Text corpus for topic modeling: the Abstract column with missing values
# replaced by empty strings and every entry coerced to str.
abstracts = (
    df['Abstract']
    .fillna('')
    .astype(str)
)

# Corpus size, how many entries actually contain text, and a 300-character
# preview of the first abstract.
print(f"Total abstracts: {len(abstracts)}")
print(f"Non-empty abstracts: {(abstracts.str.len() > 0).sum()}")
print(f"\nSample abstract:\n{abstracts.iloc[0][:300]}...")

In [None]:
# Build the document-term count matrix that the LDA cells consume.
print("Vectorizing abstracts...")

# Vocabulary settings: at most 2000 terms, English stop words removed,
# unigrams and bigrams, each kept only if it occurs in at least 5 documents
# and in no more than 70% of them.
count_vectorizer_params = {
    'max_features': 2000,
    'stop_words': 'english',
    'min_df': 5,
    'max_df': 0.7,
    'ngram_range': (1, 2),
}
vectorizer = CountVectorizer(**count_vectorizer_params)

X_counts = vectorizer.fit_transform(abstracts)

# Report vocabulary size, matrix shape, and the share of zero entries.
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
print(f"Document-term matrix shape: {X_counts.shape}")
print(f"Sparsity: {(1 - X_counts.nnz / (X_counts.shape[0] * X_counts.shape[1])) * 100:.2f}%")

In [None]:
# Sweep over candidate topic counts and record two fit metrics per model.
print("Training LDA models with different topic counts...\n")

topic_counts = [5, 10, 15, 20, 25]
perplexities, log_likelihoods = [], []

# Settings shared by every candidate model; only n_components varies.
lda_sweep_params = {
    'random_state': 42,
    'n_jobs': -1,
    'max_iter': 20,
    'learning_method': 'batch',
}

for n_topics in topic_counts:
    print(f"Training LDA with {n_topics} topics...")
    lda = LatentDirichletAllocation(n_components=n_topics, **lda_sweep_params)
    lda.fit(X_counts)

    # Both metrics are computed on the same matrix the model was fitted on.
    perplexity = lda.perplexity(X_counts)
    log_likelihood = lda.score(X_counts)

    perplexities.append(perplexity)
    log_likelihoods.append(log_likelihood)

    print(f"  Perplexity: {perplexity:.2f}")
    print(f"  Log-likelihood: {log_likelihood:.2f}\n")

In [None]:
# Side-by-side line plots of the two LDA metrics against the topic count.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (values, y-label, title, extra plot kwargs).
metric_panels = [
    (perplexities, 'Perplexity',
     'LDA Perplexity vs Number of Topics\n(Lower is better)', {}),
    (log_likelihoods, 'Log-Likelihood',
     'LDA Log-Likelihood vs Number of Topics\n(Higher is better)', {'color': 'orange'}),
]

for panel_ax, (metric_values, y_label, panel_title, plot_extra) in zip(axes, metric_panels):
    panel_ax.plot(topic_counts, metric_values, marker='o', linewidth=2, **plot_extra)
    panel_ax.set_xlabel('Number of Topics')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Fit the final LDA model with the chosen number of topics.
optimal_topics = 15  # Adjust based on the plot above

print(f"Training final LDA model with {optimal_topics} topics...")

final_lda_params = {
    'n_components': optimal_topics,
    'random_state': 42,
    'n_jobs': -1,
    'max_iter': 30,
    'learning_method': 'batch',
    'verbose': 1,
}
lda_final = LatentDirichletAllocation(**final_lda_params)

# Fit and transform in one step; the result is reused by later cells as the
# per-document topic weights.
topic_distributions = lda_final.fit_transform(X_counts)

print(f"\nTopic distribution matrix shape: {topic_distributions.shape}")

In [None]:
# List the highest-weighted vocabulary terms of every LDA topic.
feature_names = vectorizer.get_feature_names_out()
n_top_words = 15

banner = "="*80
print(banner)
print("TOP WORDS PER TOPIC")
print(banner)

# topic_keywords maps topic index -> its top terms, strongest first.
topic_keywords = {}
for topic_idx, topic in enumerate(lda_final.components_):
    # Ascending argsort, reversed, then truncated: indices of the largest
    # weights in descending order.
    top_indices = topic.argsort()[::-1][:n_top_words]
    topic_keywords[topic_idx] = [feature_names[i] for i in top_indices]

    print(f"\nTopic {topic_idx}:")
    print(f"  {', '.join(topic_keywords[topic_idx])}")

In [None]:
# Add topic features to dataframe
topic_cols = [f'topic_{i}' for i in range(optimal_topics)]
df_topics = pd.DataFrame(topic_distributions, columns=topic_cols, index=df.index)

# Dominant topic for each paper: index and weight of the largest topic share.
df['dominant_topic'] = topic_distributions.argmax(axis=1)
df['dominant_topic_weight'] = topic_distributions.max(axis=1)

# Add all topic distributions. Any topic_* columns left over from a previous
# run of this cell are dropped first so that re-running the cell replaces them
# instead of appending duplicate column names (which would break later
# column selections such as the correlation analysis).
df = pd.concat([df.drop(columns=topic_cols, errors='ignore'), df_topics], axis=1)

print("Topic features added to dataframe!")
print(f"New shape: {df.shape}")

In [None]:
# Summarise how papers and citations are distributed over dominant topics.
papers_per_topic = df['dominant_topic'].value_counts().sort_index()
mean_citations_per_topic = df.groupby('dominant_topic')['Citations'].mean()

print("Topic Distribution:")
print(papers_per_topic)

fig, (ax_counts, ax_citations) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: number of papers whose dominant topic is each topic ID.
papers_per_topic.plot(kind='bar', ax=ax_counts)
ax_counts.set_xlabel('Topic ID')
ax_counts.set_ylabel('Number of Papers')
ax_counts.set_title('Distribution of Papers Across Topics')
ax_counts.grid(axis='y')

# Right panel: mean citation count of the papers in each dominant topic.
mean_citations_per_topic.plot(kind='bar', ax=ax_citations, color='orange')
ax_citations.set_xlabel('Topic ID')
ax_citations.set_ylabel('Average Citations')
ax_citations.set_title('Average Citations by Dominant Topic')
ax_citations.grid(axis='y')

plt.tight_layout()
plt.show()

## 3. TF-IDF and NMF Topic Modeling

Alternative topic modeling using Non-negative Matrix Factorization

In [None]:
# Build the TF-IDF matrix used by the NMF and PCA cells.
print("Creating TF-IDF representation...")

# Same vocabulary limits as the count vectorizer above: up to 2000 terms,
# English stop words removed, unigrams + bigrams, document frequency between
# 5 documents and 70% of the corpus.
tfidf_params = {
    'max_features': 2000,
    'stop_words': 'english',
    'min_df': 5,
    'max_df': 0.7,
    'ngram_range': (1, 2),
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

X_tfidf = tfidf_vectorizer.fit_transform(abstracts)
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

In [None]:
# Factorise the TF-IDF matrix with NMF, using the same number of components
# as the final LDA model.
print("Training NMF topic model...")

nmf_params = {
    'n_components': optimal_topics,
    'random_state': 42,
    'max_iter': 200,
    'init': 'nndsvda',
}
nmf = NMF(**nmf_params)

# Document-by-component weights, reused by the feature-building cell below.
nmf_topics = nmf.fit_transform(X_tfidf)
print(f"NMF topic matrix shape: {nmf_topics.shape}")

In [None]:
# Print the highest-weighted TF-IDF terms of every NMF component.
# n_top_words comes from the LDA top-words cell.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

banner = "="*80
print(banner)
print("NMF TOPICS")
print(banner)

for topic_idx, topic in enumerate(nmf.components_):
    # Ascending argsort, reversed, then truncated: strongest terms first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_words = [tfidf_feature_names[i] for i in top_indices]

    print(f"\nNMF Topic {topic_idx}:")
    print(f"  {', '.join(top_words)}")

In [None]:
# Add NMF topics to dataframe
nmf_cols = [f'nmf_topic_{i}' for i in range(optimal_topics)]
df_nmf = pd.DataFrame(nmf_topics, columns=nmf_cols, index=df.index)

# Index of the NMF component with the largest weight for each paper.
df['nmf_dominant_topic'] = nmf_topics.argmax(axis=1)

# Drop nmf_topic_* columns left over from a previous run of this cell before
# concatenating, so re-running replaces them instead of creating duplicate
# column names.
df = pd.concat([df.drop(columns=nmf_cols, errors='ignore'), df_nmf], axis=1)

print("NMF features added!")
print(f"New shape: {df.shape}")

## 4. Dimensionality Reduction & Visualization

Use PCA and t-SNE to visualize the high-dimensional abstract space

In [None]:
# Project the TF-IDF features onto their leading principal components.
print("Running PCA...")

n_pca_components = 50
pca = PCA(n_components=n_pca_components, random_state=42)

# The sparse TF-IDF matrix is converted to a dense array before fitting.
tfidf_dense = X_tfidf.toarray()
X_pca = pca.fit_transform(tfidf_dense)

print(f"PCA shape: {X_pca.shape}")
print(f"Explained variance (first 10 components): {pca.explained_variance_ratio_[:10]}")
print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.3f}")

In [None]:
# Explained-variance diagnostics for the PCA fit.
fig, (ax_individual, ax_cumulative) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: variance ratio of each of the first 20 components.
ax_individual.bar(range(1, 21), pca.explained_variance_ratio_[:20])
ax_individual.set_xlabel('Principal Component')
ax_individual.set_ylabel('Explained Variance Ratio')
ax_individual.set_title('PCA Explained Variance (First 20 Components)')
ax_individual.grid(axis='y')

# Right panel: running total over all components, with 80% / 90% guide lines.
component_numbers = range(1, n_pca_components + 1)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
ax_cumulative.plot(component_numbers, cumulative_variance, marker='o', linewidth=2)
ax_cumulative.axhline(y=0.8, color='r', linestyle='--', label='80% variance')
ax_cumulative.axhline(y=0.9, color='orange', linestyle='--', label='90% variance')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.legend()
ax_cumulative.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Add PCA features to dataframe
pca_cols = [f'pca_{i}' for i in range(n_pca_components)]
df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df.index)

# Drop pca_* columns left over from a previous run of this cell before
# concatenating, so re-running replaces them instead of creating duplicate
# column names.
df = pd.concat([df.drop(columns=pca_cols, errors='ignore'), df_pca], axis=1)

print("PCA features added!")
print(f"New shape: {df.shape}")

In [None]:
# t-SNE for 2D visualization (sample subset for speed)
print("Running t-SNE (this may take a few minutes)...")

# Use a sample for t-SNE if dataset is large.
# The row sample is drawn from a dedicated, seeded generator so the same rows
# are selected on every run; every estimator in this notebook is seeded with
# 42, and an unseeded sample would make the embedding and the plots built on
# it change between runs regardless.
sample_size = min(5000, len(df))
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(df), size=sample_size, replace=False)

tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=30,
    max_iter=1000,
    verbose=1
)

# Embed the sampled rows of the PCA features; later cells use sample_idx to
# look up the matching dataframe rows.
X_tsne = tsne.fit_transform(X_pca[sample_idx])
print(f"t-SNE shape: {X_tsne.shape}")

In [None]:
# Two views of the same t-SNE embedding, coloured by different attributes.
fig, (ax_by_topic, ax_by_citations) = plt.subplots(1, 2, figsize=(16, 6))

# Dataframe rows corresponding to the embedded sample, in embedding order.
sampled_rows = df.iloc[sample_idx]
tsne_x, tsne_y = X_tsne[:, 0], X_tsne[:, 1]

# One entry per panel: (axes, colour values, colormap, title, colorbar label).
tsne_panels = [
    (ax_by_topic, sampled_rows['dominant_topic'], 'tab20',
     't-SNE Visualization Colored by Dominant Topic', 'Topic ID'),
    # log1p compresses the citation range for colouring.
    (ax_by_citations, np.log1p(sampled_rows['Citations']), 'viridis',
     't-SNE Visualization Colored by Citations (log scale)', 'Log(Citations + 1)'),
]

for panel_ax, colour_values, colormap, panel_title, colorbar_label in tsne_panels:
    points = panel_ax.scatter(
        tsne_x,
        tsne_y,
        c=colour_values,
        cmap=colormap,
        alpha=0.6,
        s=10
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('t-SNE 1')
    panel_ax.set_ylabel('t-SNE 2')
    plt.colorbar(points, ax=panel_ax, label=colorbar_label)

plt.tight_layout()
plt.show()

## 5. Clustering Analysis

Apply K-Means clustering to discover paper groups

In [None]:
# Score K-Means for a range of k using inertia, silhouette and
# Calinski-Harabasz; the next cell plots the results and picks k.
print("Finding optimal number of clusters...\n")

# Clustering input: the first 20 PCA components.
X_cluster = X_pca[:, :20]

k_range = range(3, 16)
inertias, silhouette_scores_list, ch_scores = [], [], []

for k in k_range:
    print(f"Testing k={k}...")
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_cluster)

    # Record the three quality measures for this k.
    inertias.append(kmeans.inertia_)
    silhouette_scores_list.append(silhouette_score(X_cluster, labels))
    ch_scores.append(calinski_harabasz_score(X_cluster, labels))

print("Done!")

In [None]:
# Plot the three k-selection metrics side by side, then report the best k.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One entry per panel: (values, y-label, title, extra plot kwargs).
k_metric_panels = [
    (inertias, 'Inertia', 'Elbow Method', {}),
    (silhouette_scores_list, 'Silhouette Score',
     'Silhouette Score (Higher is better)', {'color': 'orange'}),
    (ch_scores, 'Calinski-Harabasz Score',
     'Calinski-Harabasz Score (Higher is better)', {'color': 'green'}),
]

for panel_ax, (metric_values, y_label, panel_title, plot_extra) in zip(axes, k_metric_panels):
    panel_ax.plot(k_range, metric_values, marker='o', linewidth=2, **plot_extra)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# Best k is the one with the highest silhouette score (first one on ties,
# as returned by argmax).
best_k_idx = np.argmax(silhouette_scores_list)
best_k = k_range[int(best_k_idx)]
print(f"\nOptimal k based on silhouette score: {best_k}")

In [None]:
# Fit the final K-Means model with the k selected by silhouette score and
# store each paper's cluster label on the dataframe.
optimal_k = best_k  # Use the optimal k from above

print(f"Training K-Means with k={optimal_k}...")
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=20)
final_cluster_labels = kmeans_final.fit_predict(X_cluster)
df['kmeans_cluster'] = final_cluster_labels

# Number of papers assigned to each cluster, ordered by cluster ID.
print(f"\nCluster distribution:")
print(df['kmeans_cluster'].value_counts().sort_index())

In [None]:
# Per-cluster summary table: citation statistics and publication-year range,
# rounded to two decimals.
cluster_aggregations = {
    'Citations': ['mean', 'median', 'std', 'count'],
    'Year': ['min', 'max', 'mean'],
}
papers_by_cluster = df.groupby('kmeans_cluster')
cluster_analysis = papers_by_cluster.agg(cluster_aggregations).round(2)

print("Cluster Characteristics:")
print(cluster_analysis)

In [None]:
# 2x2 overview of the K-Means result: cluster sizes, mean citations,
# clusters in t-SNE space, and publication-year spread.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_sizes, ax_citations), (ax_tsne, ax_years) = axes

# Top-left: number of papers in each cluster.
cluster_sizes = df['kmeans_cluster'].value_counts().sort_index()
cluster_sizes.plot(kind='bar', ax=ax_sizes)
ax_sizes.set_xlabel('Cluster ID')
ax_sizes.set_ylabel('Number of Papers')
ax_sizes.set_title('Papers per Cluster')
ax_sizes.grid(axis='y')

# Top-right: mean citation count per cluster.
cluster_mean_citations = df.groupby('kmeans_cluster')['Citations'].mean()
cluster_mean_citations.plot(kind='bar', ax=ax_citations, color='orange')
ax_citations.set_xlabel('Cluster ID')
ax_citations.set_ylabel('Average Citations')
ax_citations.set_title('Average Citations by Cluster')
ax_citations.grid(axis='y')

# Bottom-left: only drawn when the t-SNE cell has been run in this session;
# otherwise the panel is left empty.
if 'X_tsne' in locals():
    sampled_cluster_labels = df.iloc[sample_idx]['kmeans_cluster']
    scatter = ax_tsne.scatter(
        X_tsne[:, 0],
        X_tsne[:, 1],
        c=sampled_cluster_labels,
        cmap='tab10',
        alpha=0.6,
        s=10
    )
    ax_tsne.set_title('K-Means Clusters on t-SNE Space')
    ax_tsne.set_xlabel('t-SNE 1')
    ax_tsne.set_ylabel('t-SNE 2')
    plt.colorbar(scatter, ax=ax_tsne, label='Cluster ID')

# Bottom-right: box plot of publication year for each cluster.
df.boxplot(column='Year', by='kmeans_cluster', ax=ax_years)
ax_years.set_xlabel('Cluster ID')
ax_years.set_ylabel('Publication Year')
ax_years.set_title('Year Distribution by Cluster')
plt.suptitle('')  # Remove default boxplot title

plt.tight_layout()
plt.show()

## 6. Anomaly Detection

Identify outlier papers using DBSCAN and statistical methods

In [None]:
# DBSCAN for outlier detection
print("Running DBSCAN for anomaly detection...")

# Imported here, in the cell that needs it (as done for scipy.stats below).
from sklearn.neighbors import NearestNeighbors

dbscan_min_samples = 10

# eps has to be on the scale of the data. X_cluster is a PCA projection of
# TF-IDF rows, which are non-negative and L2-normalised by default, so no two
# points can be further apart than sqrt(2). A fixed eps=3 therefore puts every
# point in every other point's neighbourhood: DBSCAN returns one cluster and
# never flags an outlier.
# Instead, eps is derived from the data with the k-distance heuristic: take
# each point's distance to its min_samples-th nearest neighbour (the point
# itself counts, matching DBSCAN's core-point rule) and use a high percentile
# of those distances. Points in regions sparser than that become candidates
# for noise. The 95th percentile is a tunable starting point.
n_neighbors = min(dbscan_min_samples, len(X_cluster))
neighbor_distances, _ = (
    NearestNeighbors(n_neighbors=n_neighbors)
    .fit(X_cluster)
    .kneighbors(X_cluster)
)
kth_neighbor_distance = neighbor_distances[:, -1]
# DBSCAN requires eps > 0; guard against a corpus dominated by duplicates.
dbscan_eps = max(float(np.percentile(kth_neighbor_distance, 95)), np.finfo(float).eps)
print(f"  Using eps={dbscan_eps:.4f} (95th percentile of {n_neighbors}-NN distances)")

dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
df['dbscan_cluster'] = dbscan.fit_predict(X_cluster)

# -1 indicates outliers
n_outliers = (df['dbscan_cluster'] == -1).sum()
# Number of real clusters: distinct labels minus the noise label if present.
n_clusters_dbscan = len(set(df['dbscan_cluster'])) - (1 if -1 in df['dbscan_cluster'].values else 0)

print(f"\nDBSCAN found:")
print(f"  Clusters: {n_clusters_dbscan}")
print(f"  Outliers: {n_outliers} ({n_outliers/len(df)*100:.2f}%)")

In [None]:
# Analyze outliers
outliers = df[df['dbscan_cluster'] == -1]

if outliers.empty:
    # With no outliers the mean/median below would be NaN and the comparison
    # lines would be meaningless, so say so explicitly instead.
    print("\nNo DBSCAN outliers found; nothing to analyze.")
else:
    # Compare citation statistics of the outliers with the whole dataset.
    print("\nOutlier Statistics:")
    print(f"Mean citations: {outliers['Citations'].mean():.2f} vs {df['Citations'].mean():.2f} (overall)")
    print(f"Median citations: {outliers['Citations'].median():.0f} vs {df['Citations'].median():.0f} (overall)")

    print("\nTop 10 outliers by citations:")
    print(outliers.nlargest(10, 'Citations')[['Year', 'Citations', 'dominant_topic']])

In [None]:
# Statistical outlier detection based on citations
from scipy import stats

# Z-score method.
# nan_policy='omit' computes mean and standard deviation over the non-missing
# values only. With the default ('propagate') a single missing citation count
# turns every z-score into NaN, and the `> 3` test below would then silently
# report zero outliers. Rows with a missing count keep a NaN z-score and are
# not flagged. When no value is missing the result is unchanged.
citation_values = df['Citations'].astype(float).to_numpy()
df['citation_zscore'] = np.abs(stats.zscore(citation_values, nan_policy='omit'))
df['is_citation_outlier'] = df['citation_zscore'] > 3

print(f"\nCitation outliers (|z| > 3): {df['is_citation_outlier'].sum()}")
print("\nTop citation outliers:")
print(df[df['is_citation_outlier']].nlargest(10, 'Citations')[['Year', 'Citations', 'citation_zscore']])

## 7. Feature Correlation Analysis

Analyze relationships between unsupervised features and citations

In [None]:
# Rank the unsupervised features by absolute correlation with Citations.
# The candidate list is assembled from: all LDA topic weights, all NMF topic
# weights, the first 10 PCA components, and the three label columns.
# NOTE(review): the label columns hold integer IDs; their correlation values
# depend on how the IDs happen to be numbered -- confirm they should be
# ranked alongside the continuous features.
feature_groups = [
    topic_cols,
    nmf_cols,
    pca_cols[:10],
    ['dominant_topic', 'nmf_dominant_topic', 'kmeans_cluster'],
]
unsupervised_features = [name for group in feature_groups for name in group]

# Correlation matrix over features + target; keep the Citations column
# without its self-correlation, then sort by magnitude.
correlation_matrix = df[unsupervised_features + ['Citations']].corr()
correlations = correlation_matrix['Citations'].drop('Citations')
correlations = correlations.abs().sort_values(ascending=False)

print("Top 20 correlations with Citations:")
print(correlations.head(20))

In [None]:
# Horizontal bar chart of the 20 features most correlated with Citations.
top_correlations = correlations.head(20)

plt.figure(figsize=(10, 8))
top_correlations.plot(kind='barh')
plt.xlabel('Absolute Correlation with Citations')
plt.title('Top 20 Unsupervised Features by Correlation with Citations')
# Flip the y-axis so the strongest correlation is at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 8. Save Results

In [None]:
# Report how many columns this notebook added, broken down by category.
# The original column set is recovered by re-reading the cleaned pickle.
original_cols = pd.read_pickle('../data/processed/cleaned_data.pkl').columns
new_cols = [col for col in df.columns if col not in original_cols]

banner = "="*80
print(banner)
print("UNSUPERVISED LEARNING SUMMARY")
print(banner)
print(f"\nOriginal dataset: {len(original_cols)} columns")
print(f"New dataset: {len(df.columns)} columns")
print(f"Features added: {len(new_cols)}")
print(f"\nNew feature categories:")
print(f"  - LDA topics: {len(topic_cols)}")
print(f"  - NMF topics: {len(nmf_cols)}")
print(f"  - PCA components: {len(pca_cols)}")
# Cluster labels are the new columns whose name contains 'cluster'; "Other"
# is whatever remains after the four named categories.
print(f"  - Cluster labels: {len([c for c in new_cols if 'cluster' in c])}")
print(f"  - Other: {len(new_cols) - len(topic_cols) - len(nmf_cols) - len(pca_cols) - len([c for c in new_cols if 'cluster' in c])}")

In [None]:
# Persist the dataframe, including every feature column added above, as a
# pickle in the processed-data directory.
output_path = Path('../data/processed/data_with_unsupervised_features.pkl')
df.to_pickle(output_path)

# Confirm the location and report the on-disk size in MB.
print(f"\nEnhanced dataset saved to: {output_path}")
print(f"File size: {output_path.stat().st_size / 1024**2:.2f} MB")

In [None]:
# Save feature list for documentation
def _feature_type(column_name):
    """Return the category label for a generated feature column.

    Tags are tested in order and the first one contained in the name wins.
    'nmf' is tested before 'topic' because every NMF column name
    ('nmf_topic_<i>', 'nmf_dominant_topic') also contains 'topic'; testing
    'topic' first would label all NMF features as 'topic' and make the
    'nmf' category unreachable.
    """
    for tag in ('nmf', 'topic', 'pca', 'cluster'):
        if tag in column_name:
            return tag
    return 'other'


# One row per new column: its name and its category.
feature_summary = pd.DataFrame({
    'feature': new_cols,
    'type': [_feature_type(c) for c in new_cols]
})

feature_summary.to_csv('../data/processed/unsupervised_features_list.csv', index=False)
print("\nFeature list saved to: ../data/processed/unsupervised_features_list.csv")

## Summary

This notebook applied multiple unsupervised learning techniques to the AUB dataset:

1. ✅ **Topic Modeling** - LDA and NMF discovered research themes in abstracts
2. ✅ **Dimensionality Reduction** - PCA, t-SNE for feature extraction and visualization
3. ✅ **Clustering** - K-Means and DBSCAN grouped similar papers
4. ✅ **Anomaly Detection** - Identified outlier papers
5. ✅ **Feature Engineering** - Created new features for downstream models

**Next steps:**
- Use these unsupervised features in your supervised models
- Compare model performance with/without unsupervised features
- Analyze which unsupervised features are most predictive
- Consider additional techniques (autoencoders, word embeddings)