In [None]:
# Install dependencies for Python libraries
!pip install -r https://raw.githubusercontent.com/dkharazi/bert-news/main/requirements.txt

In [None]:
# Import libraries for generic data preprocessing
import os
import json
import shap
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
from imblearn.over_sampling import SMOTE, SMOTEN

# Import libraries for preprocessing embeddings
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer

# Import libraries for classifying clusters based on embeddings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, DMatrix

# Import libraries for clustering and topic classification
import hdbscan
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

### Set Random Seed

In [None]:
# Seed NumPy's global RNG; the np.random.choice sampling cells further down draw from it
np.random.seed(10)
# Set the TOKENIZERS_PARALLELISM environment variable to 'false'
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism warning -- confirm
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

### Load Preprocessed Data

In [None]:
# Read the preprocessed articles from the project's GitHub repository and
# renumber the rows 0..n-1 so the index matches positional order.
articles = (
    pd.read_parquet('https://github.com/dkharazi/bert-news/blob/main/data/proc_articles.gzip?raw=true')
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the loaded articles
articles.head()

### Remove Stop Words and Punctuation

In [None]:
# English stop words, held in a set so each membership test is O(1)
# instead of a scan over the whole stop-word list for every token.
stop = set(stopwords.words('english'))
# Tokenizer that keeps only runs of word characters, i.e. drops punctuation
punc = RegexpTokenizer(r'\w+')


def reformat_articles(w):
    """Lowercase an article, drop English stop words and strip punctuation.

    Steps, in order:
      1. split on whitespace and lowercase every token;
      2. discard tokens that are in the NLTK English stop-word set;
      3. split each remaining token into its ``\\w+`` pieces (removes punctuation);
      4. join the pieces back together with single spaces.

    No stemming or lemmatization is applied.

    NOTE(review): the stop-word filter runs *before* punctuation is stripped, so a
    stop word with punctuation attached (e.g. "the,") is kept and comes out as
    "the". The order is left unchanged here because the embeddings, the clusters
    and the cluster-to-topic mapping later in the notebook were produced from
    text cleaned this way.

    Parameters
    ----------
    w : str
        Raw article text.

    Returns
    -------
    str
        The cleaned article as one space-separated string.
    """
    # lowercase once per token, then filter out stop words
    kept = [token for token in (raw.lower() for raw in w.split()) if token not in stop]
    # strip punctuation: each token becomes a list of word pieces, flattened lazily
    pieces = chain.from_iterable(punc.tokenize(token) for token in kept)
    # convert from pieces back to a single string
    return ' '.join(pieces)


# Apply lowercasing, stop-word removal and punctuation stripping to every article
articles['content'] = articles['content'].apply(reformat_articles)
articles.head()

### Embed Articles

In [None]:
# Load the pre-trained DistilBERT sentence encoder onto the GPU.
# DistilBERT is a small, fast, cheap and light Transformer model trained by
# distilling BERT base. It has 40% fewer parameters than bert-base-uncased and
# runs 60% faster while preserving over 95% of BERT's performance as measured
# on the GLUE language understanding benchmark.
# NOTE: the name `model` is re-bound to the trained XGBoost classifier further
# down, so re-run this cell before encoding again in the same kernel session.
model = SentenceTransformer(
    'distilbert-base-nli-mean-tokens',
    device='cuda',
)

In [None]:
# Encode every cleaned article into a sentence embedding.
# encode() is documented to take a str or a list of str; handing it a plain list
# (instead of the pandas Series) means the call no longer relies on the Series
# index being exactly 0..n-1.
embeddings = model.encode(articles['content'].tolist(), show_progress_bar=True)

### Standardize Embeddings

In [None]:
# Standardize each embedding dimension (mean-centered, scaled by StandardScaler)
embedding_scaler = StandardScaler(with_mean=True)
std_embeddings = embedding_scaler.fit_transform(embeddings)

### Neighborhood-Based Dimensionality Reduction

In [None]:
# Non-linear dimensionality reduction of the standardized embeddings with UMAP.
# n_neighbors: the smaller the value, the more localized the reduction;
#              the larger the value, the more globalized the reduction.
# Settings used: n_neighbors=15, n_components=5, min_dist=0.01, metric='cosine'.
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.01,
    metric='cosine',
    random_state=42,
)
umap_embeddings = reducer.fit_transform(std_embeddings)

# Rescale every reduced dimension with min-max scaling before clustering
umap_scaler = MinMaxScaler()
std_umap_embeddings = umap_scaler.fit_transform(umap_embeddings)

### Density Based Clustering

In [None]:
# Density-based clustering of the reduced, rescaled embeddings.
# min_cluster_size: the smaller the value, the more clusters are returned;
#                   the larger the value, the fewer clusters are returned.
# Settings used: min_cluster_size=100, metric='euclidean'.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
)
cluster = clusterer.fit(std_umap_embeddings)

In [None]:
# Two-column table: cluster label (including -1) and the number of articles assigned to it
unique, counts = np.unique(cluster.labels_, return_counts=True)
np.column_stack((unique, counts))

### Plot Clusters

In [None]:
# Project the embeddings to 2-D for plotting only (clustering used the 5-D projection).
# NOTE(review): this projection is fit on the raw `embeddings`, whereas the
# clustering pipeline used `std_embeddings` -- confirm that is intentional.
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.01, metric='cosine', random_state=27).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Rows labelled -1 are drawn in grey; all other rows are coloured by cluster label
is_noise = result['labels'] == -1
outliers = result.loc[is_noise, :]
clustered = result.loc[~is_noise, :]

# Draw on the explicit Axes so the figure is self-describing
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=10)
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=10, cmap='hsv_r')
fig.colorbar(points, ax=ax, label='Cluster label')
ax.set(
    title='Article clusters in 2-D UMAP space (grey = label -1)',
    xlabel='UMAP dimension 1',
    ylabel='UMAP dimension 2',
)
plt.show()

### Define Mapping of Topics

In [None]:
# Mapping from numeric cluster label (0-24) to a human-readable topic name.
# The label -1 has no entry here; cells below filter it out with `!= -1`.
# NOTE(review): these names look hand-assigned for one specific clustering run --
# re-check them whenever the clustering inputs or parameters change.
clust_map = {
    0: 'Baseball',
    1: 'Nuclear',
    2: 'Cuba Relations',
    3: 'Space',
    4: 'Terrorism and War',
    5: 'Crime',
    6: 'Brexit',
    7: 'Football',
    8: 'Olympics',
    9: 'Basketball',
    10: 'Protesting and Activism',
    11: 'LGBTQ Discrimination',
    12: 'Segregation and Racial Discrimination',
    13: 'Politics',
    14: 'U.S. Illegal Immigration',
    15: 'Nazism and Syrian Refugee Crisis',
    16: 'Global Warming',
    17: 'Technology and Data Privacy',
    18: 'Food',
    19: 'Medical Research',
    20: 'Widespread Disease',
    21: 'Real Estate',
    22: 'Business and Finance',
    23: 'Music',
    24: 'Pop Culture and Entertainment'
    }

### Build Topics JSON

In [None]:
# Attach the cluster label and the 2-D UMAP coordinates to every article
articles['cluster'] = cluster.labels_
# Translate numeric labels to topic names in the `cluster` column ONLY.
# A frame-wide `articles.replace(clust_map)` would also rewrite any value 0-24
# found in every other column. Labels without a mapping (-1) are left as-is.
articles['cluster'] = articles['cluster'].replace(clust_map)
articles['umap_embed1'] = umap_data[:,0]
articles['umap_embed2'] = umap_data[:,1]

# Keep only articles with a cluster other than -1 and number them 0..n-1;
# the second reset_index() turns that fresh numbering into an 'index' column
topics = articles[articles['cluster'] != -1].reset_index(drop=True)
topics = topics.reset_index()

# Prepare dataframe for JSON formatting
topics = topics.rename(columns={'index': 'article_id'})
topics = topics[['article_id', 'publication', 'party', 'cluster', 'umap_embed1', 'umap_embed2']]

# Convert dataframe to JSON (round-trip through json to get an indented string)
topics_str = topics.to_json(orient="records")
topics_json = json.loads(topics_str)
topics_str = json.dumps(topics_json, indent=4)

# Take a glimpse at JSON! (first few records only; the full string stays in topics_str)
print(json.dumps(topics_json[:3], indent=4))

### Get Top Words for each Topic

In [None]:
# Pair every article's text with its cluster label, drop the -1 rows and
# number the remaining documents 0..n-1.
# Building the frame in one chain avoids assigning a new column into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
docs_df = (
    pd.DataFrame({'Doc': articles['content'], 'Topic': cluster.labels_})
    .loc[lambda frame: frame['Topic'] != -1]
    .assign(Doc_ID=lambda frame: range(len(frame)))
)

# Concatenate all documents of a topic into one long string per topic
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})
docs_per_topic.head()

In [None]:
# Compute c-TF-IDF scores for each word
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores, treating each input string as one class.

    Parameters
    ----------
    documents : iterable of str
        One concatenated document per class/topic.
    m : int
        Numerator of the IDF ratio (the caller passes the number of articles).
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(tf_idf, count)`` where ``tf_idf`` has one row per term and one column
        per class, and ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english")
    # dense (n_classes, n_terms) matrix of raw term counts
    term_counts = count.fit_transform(documents).toarray()

    # term frequency: counts divided by the total number of counted words in the class
    words_per_class = term_counts.sum(axis=1)
    term_freq = term_counts.T / words_per_class

    # inverse frequency: log(m / total occurrences of the term over all classes)
    occurrences_per_term = term_counts.sum(axis=0)
    inverse_freq = np.log(m / occurrences_per_term).reshape(-1, 1)

    return term_freq * inverse_freq, count
  
# Score every term per topic; m is the total number of articles (label -1 included)
n_articles = len(articles['content'])
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=n_articles)
tf_idf[:2]

In [None]:
# Extract top words for each topic
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words for every topic.

    Parameters
    ----------
    tf_idf : numpy.ndarray
        Score matrix with one row per term and one column per topic.
    count : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (or, on older scikit-learn
        releases, ``get_feature_names()``) whose order matches ``tf_idf`` rows.
    docs_per_topic : pandas.DataFrame
        Frame with a ``Topic`` column, one row per topic, in the same order as
        the columns of ``tf_idf``.
    n : int, default 20
        Number of words to keep per topic.

    Returns
    -------
    list of dict
        One ``{'topic': <name from clust_map>, 'top20_words': [...]}`` entry per
        topic; each word entry is ``{'word': str, 'tfidf': float}``, ordered from
        highest to lowest score. The key stays ``'top20_words'`` whatever ``n``
        is, so the JSON layout does not change.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name for older releases.
    if hasattr(count, 'get_feature_names_out'):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T
    # Sort ascending, reverse to descending, then take the first n columns.
    # Slicing after the reversal also behaves for n=0 (a `[-0:]` slice would
    # return every column instead of none).
    top_idx = scores_by_topic.argsort()[:, ::-1][:, :n]

    return [
        {
            'topic': clust_map[label],
            'top20_words': [
                # plain str/float keep the result JSON-serializable
                {'word': str(words[j]), 'tfidf': float(scores_by_topic[i][j])}
                for j in top_idx[i]
            ],
        }
        for i, label in enumerate(labels)
    ]

def extract_topic_sizes(df):
    """Count the documents in each topic, largest topic first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Topic`` column and a ``Doc`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic`` and ``Size``, sorted by ``Size`` in descending order.
    """
    docs_per_topic_count = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_topic_count.reset_index(name='Size')
    return sizes.sort_values('Size', ascending=False)

# Number of documents per topic (largest first)
topic_sizes = extract_topic_sizes(docs_df)
# Twenty highest-scoring words per topic, displayed as the cell output
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
top_n_words

### Build TF-IDF JSON

In [None]:
# Serialize the per-topic top words to an indented JSON string
tfidf_str = json.dumps(top_n_words, indent=4)

# Take a glimpse at JSON! (first two topics only; the full string stays in tfidf_str)
print(json.dumps(top_n_words[:2], indent=4))

### Classify Topics based on Embeddings

In [None]:
# Training data for the classifier: raw embeddings as features, cluster labels
# as targets, restricted to articles whose label is not -1
idx = cluster.labels_ != -1
X, y = embeddings[idx], cluster.labels_[idx]

In [None]:
# Hold out 15% of the clustered articles, stratified by cluster label
split_options = dict(test_size=0.15, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Balance the test split: draw, for every label, as many rows (without
# replacement) as the rarest label has
min_lab = min(Counter(y_test).values())
idx = []
for label in np.unique(y_test):
    candidates = np.flatnonzero(y_test == label)
    idx.extend(np.random.choice(candidates, min_lab, replace=False))

# Keep only the sampled rows so every label is equally frequent in the test data
X_test, y_test = X_test[idx], y_test[idx]

In [None]:
# Cap every cluster at 1000 training articles: smaller clusters are kept whole,
# larger ones are randomly undersampled (without replacement)
max_per_cluster = 1000
train_counts = Counter(y_train.tolist())
large_c = [c for c, size in train_counts.items() if size >= max_per_cluster]
small_c = [c for c, size in train_counts.items() if size < max_per_cluster]

# Start from all rows of the small clusters, then add the sampled rows of the large ones
idx = np.flatnonzero(np.isin(y_train, small_c)).tolist()
for label in large_c:
    members = np.flatnonzero(y_train == label)
    idx.extend(np.random.choice(members, max_per_cluster, replace=False))

# Enforce similar frequency of labels in training data
X_train, y_train = X_train[idx], y_train[idx]

In [None]:
# Oversample the minority classes with SMOTE.
# The features are continuous embedding values, so the interpolating SMOTE
# variant is the appropriate one; SMOTEN is documented for data made only of
# categorical (nominal) features.
sampler = SMOTE(random_state=12)

# Simulate oversampled data for imbalanced classes
X_res, y_res = sampler.fit_resample(X_train, y_train)

# Check previous work! Every class should now have the same count
print(Counter(y_res))

In [None]:
%%time
# Build xgboost model to predict topics based on embeddings.
# eval_metric is given to the constructor: passing it to fit() was deprecated in
# XGBoost 1.6 and removed in 2.0, while the constructor form works on both.
xgb = XGBClassifier(
    n_estimators=100,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
    eval_metric=['mlogloss']
    )

# Train model on the resampled training data.
# NOTE: this re-binds `model`, which earlier held the SentenceTransformer encoder.
model = xgb.fit(X_res, y_res)

### Generate Important Features

In [None]:
# Test model accuracy (in percent) on the held-out, class-balanced test split.
# Scoring on X_res/y_res would only measure fit to the (resampled) training data.
preds = model.predict(X_test)
round(float((preds == y_test).mean()) * 100, 2)

In [None]:
# Generate list of embedding names.
# The names are 1-based ('Embedding 1' is column 0 of the feature matrix).
importances = model.feature_importances_
feature_names = np.array(['Embedding {}'.format(str(i)) for i in range(1, len(importances) + 1)])

# Get top-20 most important features.
# argsort() is ascending, so the 20 largest importances are the LAST 20 positions;
# keeping them in ascending order puts the most important bar at the top of barh.
sorted_idx = importances.argsort()[-20:]

# Plot feature importances
plt.barh(feature_names[sorted_idx], importances[sorted_idx])
plt.xlabel("XGBoost Feature Importance")

### Assign Top BERT Embeddings to Articles

In [None]:
# Copy three embedding dimensions onto the articles frame as bert_embed1..3.
# NOTE(review): the column indices are hard-coded; presumably they were read off
# an earlier feature-importance run -- confirm they still match the trained model.
for rank, dim in enumerate((694, 193, 162), start=1):
    articles['bert_embed{}'.format(rank)] = embeddings[:, dim]
articles.head()

In [None]:
# Keep only the articles whose cluster is not -1
has_topic = articles['cluster'] != -1
embed_df = articles.loc[has_topic].reset_index(drop=True)

# 3D scatter of the three selected embedding dimensions, coloured by topic
fig = px.scatter_3d(
    embed_df,
    x='bert_embed1',
    y='bert_embed2',
    z='bert_embed3',
    color='cluster',
)
fig.show()

### Save Final File as JSON

In [None]:
# Reformat strings as dictionaries
topics_json = json.loads(topics_str)
tfidf_json = json.loads(tfidf_str)

# Combine dictionaries together
merged_dict = {'topics_data': topics_json, 'tfidf_data': tfidf_json}

# Dump dictionary to JSON file.
# Create the target directory first so open() does not fail with
# FileNotFoundError when it does not exist yet.
out_path = '/data/news.json'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(merged_dict, f, ensure_ascii=False, indent=4)

### References

- [Plotting XGBoost and Shap Values](https://github.com/slundberg/shap#tree-ensemble-example-xgboostlightgbmcatboostscikit-learnpyspark-models)
- [Text plots with Shap values](https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/plots/text.html#text-plot)
- [Partitioning data with Shap values](https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/explainers/Exact.html#Tabular-data-with-partition-(Owen-value)-masking)