The input file contains unlabeled news-article data about United Airlines. This notebook generates topics week by week using biterm topic modeling (BTM), which is well suited to short texts, and then groups similar topics that appear in different weeks.

In [None]:
!pip install biterm

In [None]:
import nltk
import nltk.corpus
import ast
import numpy as np

import re
import string
# Punctuation characters that text_processing deletes from the text.
PUNCT_TO_REMOVE = string.punctuation

from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop words that text_processing filters out (a set, for fast lookup).
STOPWORDS = set(stopwords.words("english"))

from nltk.stem import WordNetLemmatizer, SnowballStemmer
# English Snowball stemmer that text_processing applies to every lemmatized word.
stemmer = SnowballStemmer('english')

# Built once instead of on every call (regex, translation table) or on every
# word (lemmatizer).
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)
_LEMMATIZER = WordNetLemmatizer()


def text_processing(text):
    """Normalize a piece of short text for topic modeling.

    The steps run in this order: lowercase, remove URLs, delete punctuation,
    drop English stop words, then lemmatize each remaining word as a verb and
    stem the result with the module-level Snowball stemmer.

    Args:
        text: The raw text. A value that is not a string (for example the NaN
            pandas uses for a missing cell) is treated as empty text.

    Returns:
        The processed words joined by single spaces; an empty string when
        nothing remains or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = _URL_PATTERN.sub('', text.lower())
    text = text.translate(_PUNCT_TABLE)
    kept_words = [word for word in text.split() if word not in STOPWORDS]
    return " ".join(
        stemmer.stem(_LEMMATIZER.lemmatize(word, pos='v')) for word in kept_words
    )

In [None]:
# Load the article export, clean every 'title', derive a calendar date from
# the 'created_at' timestamp, and order the rows by that date.
import datetime
import pandas as pd


def _created_at_to_date(stamp):
    """Parse a 'created_at' string and keep only its date part."""
    return datetime.datetime.strptime(stamp, '%Y/%m/%d %H:%M:%S').date()


df_train = pd.read_csv(
    'news-articles-united-airline_social-animal_201601-201905.txt',
    sep='|',
)
df_train['title'] = df_train['title'].apply(text_processing)
df_train['date'] = df_train['created_at'].apply(_created_at_to_date)
# reset_index() keeps the pre-sort row labels in an 'index' column.
df_sorted = df_train.sort_values('date').reset_index()

In [None]:
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary

In [None]:
# Generate topics week by week using online BTM.
# For every 7-day window, starting at the earliest article date, the timeline
# records the first day of the window, its last day, and the top words of each
# of the NUM_TOPICS topics fitted on that window's titles.
# Windows that cannot be vectorized (no articles, or no term left after stop
# word removal) are reported and skipped instead of aborting the whole run.
NUM_TOPICS = 10       # topics fitted per week
TOP_WORDS = 10        # 4th argument of topic_summuary; presumably words kept per topic
CHUNK_SIZE = 100      # documents passed to each online fit() call
FIT_ITERATIONS = 50   # iterations passed to each fit() call
WEEK = datetime.timedelta(days=7)

date_started = df_sorted['date'].iloc[0]
date_end = df_sorted['date'].iloc[-1] + datetime.timedelta(days=1)
dict_timeline = {'startdate': [], 'enddate': [], 'topics': []}

while date_started < date_end:
    week_start = date_started
    week_last_day = week_start + datetime.timedelta(days=6)
    # Advance the cursor first so that `continue` below can never stall the loop.
    date_started = week_start + WEEK

    in_week = (df_sorted['date'] >= week_start) & (df_sorted['date'] < date_started)
    df_topic = df_sorted[in_week]

    vec = CountVectorizer(stop_words='english')
    try:
        X = vec.fit_transform(df_topic['title']).toarray()
    except ValueError as err:
        # Raised e.g. for an empty vocabulary; report the reason, do not hide it.
        print('start date:{}, end date:{} skipped: {}'.format(week_start, week_last_day, err))
        continue

    # Newer scikit-learn releases only provide get_feature_names_out();
    # fall back to the older accessor when it is not available.
    if hasattr(vec, 'get_feature_names_out'):
        vocab = np.array(vec.get_feature_names_out(), dtype=str)
    else:
        vocab = np.array(vec.get_feature_names(), dtype=str)
    biterms = vec_to_biterms(X)

    btm = oBTM(num_topics=NUM_TOPICS, V=vocab)
    for i in range(0, len(biterms), CHUNK_SIZE):
        btm.fit(biterms[i:i + CHUNK_SIZE], iterations=FIT_ITERATIONS)
    # Not read anywhere in this cell; kept so the name stays defined afterwards.
    topics = btm.transform(biterms)
    topwords = topic_summuary(btm.phi_wz.T, X, vocab, TOP_WORDS, verbose=False)['top_words']
    print('start date:{}, end date:{}'.format(week_start, week_last_day))
    print(topwords)

    dict_timeline['startdate'].append(week_start)
    dict_timeline['enddate'].append(week_last_day)
    # Store plain lists so every entry of 'topics' is a list of word lists.
    dict_timeline['topics'].append([t.tolist() for t in topwords])

In [None]:
# Turn the weekly timeline into a table with one row per week, print its
# schema, and save it as CSV.
timeline_by_key = pd.DataFrame.from_dict(dict_timeline, orient='index')
df_topics = timeline_by_key.transpose()
df_topics.info()
df_topics.to_csv('topic_all.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   startdate  73 non-null     object
 1   enddate    73 non-null     object
 2   topics     73 non-null     object
dtypes: object(3)
memory usage: 1.8+ KB


In [None]:
# Flatten the weekly table so that every topic of every week gets its own row
# (start date, end date, topic). An 'in_timeline' column, initially 0, is
# added for the grouping step that follows.
dict_new = {'startdate': [], 'enddate': [], 'topic': []}
weekly_rows = zip(df_topics['startdate'], df_topics['enddate'], df_topics['topics'])
for week_start, week_end, week_topics in weekly_rows:
    # Repeat the week's dates once per topic so the three lists stay aligned.
    dict_new['startdate'].extend([week_start] * len(week_topics))
    dict_new['enddate'].extend([week_end] * len(week_topics))
    dict_new['topic'].extend(week_topics)

df_toptl = pd.DataFrame.from_dict(dict_new, orient='index').transpose()
df_toptl['in_timeline'] = 0

In [None]:
# Group similar topics from different weeks into numbered timelines.
#
# Rows of df_toptl are visited in order. A row whose 'in_timeline' flag is
# still 0 becomes the seed of a new timeline (ids start at 1). Every topic
# from a strictly later week that shares at least MIN_SHARED_WORDS
# non-ignored words with the seed is appended to that timeline and flagged,
# so it will not seed a timeline of its own when the outer loop reaches it.
#
# Note: candidates are always compared with the seed only, and already
# flagged topics are still compared, so one topic can appear in more than
# one timeline.
dict_topic = {'startdate': [], 'enddate': [], 'words': [], 'topic': []}

i_t = 0
# Words excluded from the overlap count.
IGNORE = ['unit','airlin','flight','plane','man','woman']
MIN_SHARED_WORDS = 3


def _shared_word_count(seed_words, other_words, ignored):
    """Count the words of seed_words that are not ignored and occur in other_words."""
    return sum(
        1 for word in seed_words
        if word not in ignored and word in other_words
    )


# Pull the columns out once; per-row DataFrame lookups inside the nested loop
# are far slower than plain list indexing.
_ignored_words = frozenset(IGNORE)
_startdates = df_toptl['startdate'].tolist()
_enddates = df_toptl['enddate'].tolist()
_topic_words = df_toptl['topic'].tolist()
_topic_sets = [set(words) for words in _topic_words]
_flags = df_toptl['in_timeline'].tolist()

for index, topic in enumerate(_topic_words):
    if _flags[index] != 0:
        continue
    i_t += 1
    dict_topic['startdate'].append(_startdates[index])
    dict_topic['enddate'].append(_enddates[index])
    dict_topic['words'].append(topic)
    dict_topic['topic'].append(i_t)
    for i_2, topic_2 in enumerate(_topic_words):
        # Only topics from a later week than the seed are candidates.
        if not _startdates[i_2] > _startdates[index]:
            continue
        if _shared_word_count(topic, _topic_sets[i_2], _ignored_words) >= MIN_SHARED_WORDS:
            dict_topic['startdate'].append(_startdates[i_2])
            dict_topic['enddate'].append(_enddates[i_2])
            dict_topic['words'].append(topic_2)
            dict_topic['topic'].append(i_t)
            _flags[i_2] = 1

# Write the flags back so df_toptl ends up in the same state as before.
df_toptl['in_timeline'] = _flags
      

In [None]:
# Keep only the timelines that contain more than 2 topic entries and save
# them as CSV.
df_similartopics = pd.DataFrame.from_dict(dict_topic, orient='index').transpose()
entries_per_timeline = df_similartopics.groupby(['topic'])['words'].transform('count')
df_final = df_similartopics[entries_per_timeline > 2]
df_final.to_csv('final_topics.csv')