<a href="https://colab.research.google.com/github/darinkist/bertopic_arxiv_data_ml_2022/blob/main/BERTopic_ArxivData_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing needed packages
!pip install bertopic

In [None]:
import nltk
# NLTK resources used by the preprocessing cells (tokenizer, POS tagger,
# WordNet lemmatizer, stop word list).
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# NLTK >= 3.9 looks up for word_tokenize / pos_tag; the older names are kept so
# the notebook also works with earlier NLTK releases.
for resource in ('punkt',
                 'punkt_tab',
                 'averaged_perceptron_tagger',
                 'averaged_perceptron_tagger_eng',
                 'wordnet',
                 'omw-1.4',
                 'stopwords'):
    nltk.download(resource)

In [None]:
# Assuming you have downloaded the zip file from
# https://www.kaggle.com/Cornell-University/arxiv
# Run unzip
!unzip archive.zip

In [10]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Register tqdm's progress_* methods on pandas objects
# (progress_apply is used for the lemmatization step further down).
tqdm.pandas()
import re

In [None]:
# Read the arXiv metadata snapshot (JSON Lines) in chunks of 50,000 records so
# the whole file is never loaded at once.
chunks = pd.read_json('arxiv-metadata-oai-snapshot.json',
                      lines=True,
                      chunksize=50000)

ml_topics = []
for chunk in tqdm(chunks):
    # Keep papers tagged with at least one ML-related category.
    # The dots are escaped so they only match a literal '.', and na=False
    # treats records without a category string as non-matching instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    is_ml = chunk['categories'].str.contains(r'cs\.LG|cs\.AI|stat\.ML',
                                             regex=True, na=False)
    chunk_df = chunk[is_ml].copy()

    if not chunk_df.empty:
        # `versions` is a list of dicts, one per submitted version; take the
        # submission date of the last one by key rather than by position so
        # the lookup does not depend on the key order inside the dict.
        chunk_df['latest_version'] = pd.to_datetime(
            chunk_df['versions'].apply(lambda versions: versions[-1]['created'])
        )

        # Only papers whose latest version was submitted in 2022 are kept.
        ml_topics.append(
            chunk_df.loc[chunk_df['latest_version'].dt.year == 2022,
                         ['title', 'abstract', 'latest_version']]
        )

if not ml_topics:
    # pd.concat([]) would fail with the unhelpful "No objects to concatenate".
    raise ValueError('No cs.LG / cs.AI / stat.ML records found in '
                     'arxiv-metadata-oai-snapshot.json')

ml_topics_df = pd.concat(ml_topics)

# One document per paper: title followed by the abstract.
ml_topics_df['doc_raw'] = ml_topics_df['title'] + '. ' + ml_topics_df['abstract']

In [None]:
# Number of rows in the filtered frame
len(ml_topics_df)

In [None]:
# Preview the first three rows
ml_topics_df.head(3)

In [None]:
def cleaner(text):
    """Reduce every string of a pandas Series to its word-like tokens.

    A token is a run of 3 to 50 characters drawn from ASCII letters and the
    punctuation ``- . , ? !``. Everything else (digits, other symbols, shorter
    runs) is dropped, and the surviving tokens are re-joined with single
    spaces.

    Parameters
    ----------
    text : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        Series of cleaned strings, aligned with ``text``.
    """
    token_re = re.compile(r"[A-Za-z\-.,?!]{3,50}")
    tokens = text.str.findall(token_re)
    return tokens.str.join(' ')


# Flatten line breaks inside titles/abstracts (plain literal replacement).
ml_topics_df['doc_clean'] = ml_topics_df['doc_raw'].str.replace('\n', ' ', regex=False)
# Strip URLs. regex=True must be passed explicitly: since pandas 2.0 the
# default is regex=False, which would treat the pattern as literal text and
# leave every URL in place. The raw string avoids the invalid '\S' escape.
ml_topics_df['doc_clean'] = ml_topics_df['doc_clean'].str.replace(r'https?://\S+', '',
                                                                  case=False, regex=True)
# Keep only the word-like tokens.
ml_topics_df['doc_clean'] = cleaner(ml_topics_df['doc_clean'])

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag (as returned by pos_tag) -> WordNet POS
# code expected by WordNetLemmatizer.lemmatize. Adjective tags are JJ/JJR/JJS,
# i.e. they start with 'J', not 'A', so they need an explicit mapping.
_PENN_TO_WORDNET = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}


def lemmatize_it(sent):
    """Lemmatize every token of a sentence using its part-of-speech tag.

    The sentence is tokenized and POS-tagged; adjectives, adverbs, nouns and
    verbs are passed to the WordNet lemmatizer with the matching POS, all
    other tokens are kept unchanged.

    Parameters
    ----------
    sent : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lemmas = []
    for word, tag in pos_tag(word_tokenize(sent)):
        wntag = _PENN_TO_WORDNET.get(tag[:1].upper())
        lemmas.append(lemmatizer.lemmatize(word, wntag) if wntag else word)
    return ' '.join(lemmas)

# Lemmatize every cleaned document, with a progress bar.
ml_topics_df['doc_lem'] = ml_topics_df['doc_clean'].progress_apply(lemmatize_it)

In [43]:
# Persist the preprocessed frame (including the doc_raw, doc_clean and doc_lem
# columns) as JSON so the steps above do not have to be repeated.
ml_topics_df.to_json("ml_txt_2022_cleaned_lem.json")

In [44]:
# Reload the preprocessed data. to_json writes dates as Unix timestamps in
# milliseconds, so latest_version is converted back to datetimes here.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html
ml_txt_2022_cleaned = pd.read_json("ml_txt_2022_cleaned_lem.json").assign(
    latest_version=lambda frame: pd.to_datetime(frame["latest_version"], unit="ms")
)

In [None]:
%%time
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Encoding all documents might take more than an hour.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(
    ml_txt_2022_cleaned['doc_lem'].values,
    show_progress_bar=True,
)
# np.save appends the ".npy" extension to the file name.
np.save("ml_2022_embeddings", embeddings)

In [50]:
# Load the embeddings saved above. Keeping them on disk is recommended so the
# documents do not have to be encoded again when trying out different
# hyperparameters.
embeddings_saved = np.load("ml_2022_embeddings.npy")

In [51]:
from nltk.corpus import stopwords

# Dimensionality reduction. UMAP is stochastic: without a fixed random_state
# the resulting topics differ from run to run. (Fixing the seed makes UMAP run
# single-threaded, which is slower but reproducible.)
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05, random_state=42)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=60, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

# NLTK's English stop words plus generic ML-paper vocabulary and URL debris.
# CountVectorizer lowercases the text and, with its default token pattern,
# splits on '.', so stop words must be given as lowercase single tokens:
# 'url' instead of 'URL', and 'github' / 'com' / 'www' instead of
# 'github.com' / 'www.github.com' (which could never match a token).
stopwords_list = list(stopwords.words('english')) + [
    'data', 'model', 'models', 'approach',
    'approaches', 'problem', 'problems', 'training',
    'methods', 'method', 'algorithm', 'algorithms',
    'use', 'result', 'results', 'show', 'propose',
    'provide', 'system', 'accuracy', 'href', 'https',
    'url', 'github', 'com', 'www',
]

# Topic representations are built from unigrams and bigrams.
vectorizer_model = CountVectorizer(ngram_range=(1, 2),
                                   stop_words=stopwords_list
                                  )

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language='english',
    calculate_probabilities=False,
    verbose=True,
)

In [None]:
# Fit the topic model on the lemmatized documents, passing the embeddings
# loaded from disk (they must be in the same order as the documents).
topics, probs = model.fit_transform(ml_txt_2022_cleaned['doc_lem'].values, embeddings_saved)

In [None]:
# Bar charts for the top 4 topics, 10 words each
model.visualize_barchart(top_n_topics=4, n_words=10)

In [None]:
# Export the same bar chart figure to an HTML file
model.visualize_barchart(top_n_topics=4, n_words=10).write_html("ml_topics_22.html")