In [1]:
!pip install contextualized-topic-models==2.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import General Utility Libraries 

In [2]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
import nltk

Where to store the data file. If you want, you can adjust the path.

In [3]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load 


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.
    
    Divide the papers into 3 time periods. 
    
    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title: 
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:  
                    # only include titles with at least four words
                    continue
                got_title = True
        
        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010
    
    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

Mounted at /content/drive


14922037it [00:31, 481205.67it/s]


Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LDA

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 5

### Before the 1990s:

In [6]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Let's perform some simple preprocessing:

In [7]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

titles_before_1990 = [preprocess_text(title) for title in titles]

In [8]:
titles_before_1990[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [9]:
num_features = 10000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(titles_before_1990)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [10]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)  


In [11]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: design using digital circuits applications der number implementation von software und class
Topic 1: algorithm problem method sequential recognition time machines dynamic pattern computing solution use
Topic 2: data networks theory systems approach distributed programming graphs model language chemical structure
Topic 3: note logic functions network algorithms application memory sets models languages machine development
Topic 4: systems computer control analysis information linear new problems parallel optimal finite performance


Topics:
0. Graph/networks algorithms (seems to be mostly about algorithms that (maybe) operate on graphs/networks)
1. pattern recognition (and maybe robotics)
2. ...

### From 1990 to 2009:

Add your code for topic modelling the period from 1990 to 2009 here...

I will use the same preprocessing but a different vectorizer.

Note, we look at titles as single documents. Each of these documents consists of words. We want to assign each document (= title) to a topic, but we choose the number of topics.

In [12]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

titles_1990_2009 = [preprocess_text(title) for title in titles]

In [13]:
len(titles_1990_2009)

330317

In [14]:
num_features = 10000
tf_vectorizer = TfidfVectorizer(
    max_df=0.95, 
    min_df=2, 
    max_features=num_features, 
    stop_words='english',
    token_pattern='[a-zA-Z0-9]{3,}'
)
tf = tf_vectorizer.fit_transform(titles_1990_2009)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [15]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)  


In [16]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: graphs stability modeling identification systems design generalized performance special analysis estimation molecular
Topic 1: information networks systems linear problem mobile models algorithm model algorithms design efficient
Topic 2: control systems nonlinear methods dynamic equations method parallel study computing research equation
Topic 3: adaptive systems management detection software theory development fuzzy robust multiple structure approach
Topic 4: data network using neural evaluation analysis finite class digital codes recognition internet


### From 2010 onwards:

Add your code for topic modelling the period from 2010 onwards here...

Let us again use the tf-idf vectorizer but let's change some parameters in the LDA and maybe change the maximum number of features. We will increase the number of features, as the length of the dataset is much larger!

In [17]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

titles_2010 = [preprocess_text(title) for title in titles]

In [18]:
len(titles_2010)

825940

In [19]:
num_features = 15000
tf_vectorizer = TfidfVectorizer(
    max_df=0.95, 
    min_df=2, 
    max_features=num_features, 
    stop_words='english',
    token_pattern='[a-zA-Z0-9]{3,}'
)
tf = tf_vectorizer.fit_transform(titles_2010)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [20]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)  


In [21]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: nonlinear using dynamic detection equations learning model applications application based problem deep
Topic 1: control systems estimation power classification method distributed finite tracking optimal optimization algorithms
Topic 2: networks time sensor network data systems social wireless based mobile image evaluation
Topic 3: adaptive algorithm method computing functions distribution theory graphs visual random global allocation
Topic 4: multiple data state machine learning order graph feature cognitive virtual service effects


# Combined Topic Models

New method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/). 

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs). 

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [22]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

num_ctm_topics = 5  # you can also choose a higher number of topics

### Before the 1990s:

In [23]:
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')

with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]


stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


We initialize the model, and pass the unpreprocessed corpus (which is used for contextual embeddings) as well as the preprocessed documents (used for bow). This function creates the BoW and obtains the contextualized word embeddings (BERT representations).

I am using a smaller BERT (sentence transformer) model as training takes too long otherwise.

In [24]:
tp = TopicModelDataPreparation("all-MiniLM-L6-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/198 [00:00<?, ?it/s]

The next step consists of actually fitting the topic model. We will keep all the parameters as in the tutorial document on Combined TM. The only parameter we needed to change is the contextual_size, as we used a different, smaller sentence Tranformer base model.

In [25]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=384, n_components=5, num_epochs=5)
ctm.fit(training_dataset) # run the model

Epoch: [5/5]	 Seen Samples: [197640/197640]	Train Loss: 35.38233664257279	Time: 0:00:12.042552: : 5it [00:51, 10.37s/it]
Sampling: [20/20]: : 20it [02:42,  8.12s/it]


Finally we can get all our topics, or as many as we want.

In [26]:
ctm.get_topic_lists(5)

[['algorithm', 'algorithms', 'parallel', 'problem', 'networks'],
 ['computer', 'system', 'information', 'design', 'data'],
 ['semantics', 'logic', 'modal', 'symbolic', 'sets'],
 ['control', 'analysis', 'time', 'systems', 'optimal'],
 ['und', 'computers', 'uuml', 'ring', 'der']]

In [27]:
len(tp.vocab)

2000

### From 1990 to 2009

From here on, the code is uncommented as the methods used are the same as for the dataset before 1990.

In [28]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]


stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [29]:
tp = TopicModelDataPreparation("all-MiniLM-L6-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/1634 [00:00<?, ?it/s]

In [30]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=384, n_components=5, num_epochs=5)
ctm.fit(training_dataset) # run the model

Epoch: [5/5]	 Seen Samples: [1633610/1633610]	Train Loss: 41.811606007513284	Time: 0:01:24.774775: : 5it [06:52, 82.56s/it]
Sampling: [20/20]: : 20it [22:12, 66.63s/it]


In [31]:
ctm.get_topic_lists(5)

[['networks', 'wireless', 'management', 'information', 'mobile'],
 ['using', 'based', 'data', 'image', 'model'],
 ['problems', 'problem', 'equations', 'solutions', 'equation'],
 ['net', 'plant', 'checking', 'mechanical', 'descriptor'],
 ['control', 'systems', 'time', 'sub', 'sup']]

### From 2010 onwards

In [32]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]


stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [33]:
tp = TopicModelDataPreparation("all-MiniLM-L6-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)



Batches:   0%|          | 0/4104 [00:00<?, ?it/s]



In [34]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=384, n_components=5, num_epochs=5)
ctm.fit(training_dataset) # run the model

Epoch: [5/5]	 Seen Samples: [4103675/4103675]	Train Loss: 50.88480006708212	Time: 0:03:27.735492: : 5it [17:20, 208.11s/it]
Sampling: [20/20]: : 20it [55:46, 167.35s/it]


In [35]:
ctm.get_topic_lists(5)

[['using', 'image', 'learning', 'detection', 'classification'],
 ['networks', 'energy', 'wireless', 'sensor', 'power'],
 ['time', 'systems', 'control', 'nonlinear', 'sub'],
 ['cascade', 'cascaded', 'pso', 'sparsity', 'weighting'],
 ['study', 'review', 'social', 'research', 'case']]