In [2]:
import re
import pandas as pd
from bertopic import BERTopic
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load scopus.csv (resolved relative to the working directory) into a DataFrame.
df = pd.read_csv("scopus.csv")

In [5]:
# Inspect the available columns; run_bert_topic reads the 'Abstract' column.
df.columns

Index(['Authors', 'Title', 'Year', 'Cited by', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords'],
      dtype='object')

In [38]:
import string
from nltk.corpus import stopwords
# Regex matching one or more whitespace/punctuation characters; used as the
# delimiter when splitting text into words. The punctuation is escaped so that
# characters with a special meaning inside a character class (backslash, ']',
# '^', '-') are all matched literally instead of being interpreted by `re`.
match = f"[\\s{re.escape(string.punctuation)}]+"


def run_bert_topic(df):
    """Fit a BERTopic model on the abstracts in ``df`` and tabulate the result.

    Each abstract is stripped of URLs, lower-cased, reduced to ASCII letters,
    and filtered against a stop-word set (NLTK English stop words plus
    journal boilerplate terms) before being passed to BERTopic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Abstract`` column of strings. Rows whose abstract is
        missing, or empty after cleaning, are skipped. ``df`` is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        1. Document/topic indicator matrix: one row per document that was
           assigned to a topic (documents assigned to topic ``-1`` are
           dropped), one column per topic, ``1`` where the document belongs
           to the topic and ``0`` elsewhere. Rows are labelled with the
           document's index label in ``df``.
        2. Topic definitions: long table with columns ``topic``, ``word`` and
           ``weight``, covering every topic returned by
           ``topic_model.get_topics()``, ordered by topic id and then by the
           word's rank within the topic.

    Raises
    ------
    ValueError
        If no abstract is left after cleaning.

    Notes
    -----
    Relies on the module-level ``match`` pattern and on the NLTK ``stopwords``
    corpus being available locally.
    """
    # Publisher boilerplate that appears in the abstracts and would otherwise
    # dominate the topics.
    boilerplate = "The Authors. Journal of the Association for Information Science and Technology published by Wiley Periodicals, Inc. on behalf of ASIS&T."
    custom_stops = set(re.split(match, boilerplate.lower()))
    custom_stops.update(["asist", "article", "abstract", "manuscript", "paper", "publication", "book", "review", "reviewer", "www"])
    custom_stops = custom_stops.union(stopwords.words('english'))

    def _clean(abstract):
        """Return the abstract lower-cased, without URLs, non-letters or stop words."""
        without_urls = re.sub(r"http\S+", "", abstract).lower()
        words = re.sub("[^a-zA-Z]+", " ", without_urls).split()
        return " ".join(word for word in words if word not in custom_stops)

    # Work on a derived Series so the caller's frame is never written to
    # (avoids SettingWithCopyWarning from assigning into a filtered slice).
    cleaned = df["Abstract"].dropna().map(_clean)
    cleaned = cleaned[cleaned != ""]
    if cleaned.empty:
        raise ValueError("No non-empty abstracts left after cleaning; nothing to model.")
    documents = cleaned.to_list()

    topic_model = BERTopic(verbose=True)
    topics, _ = topic_model.fit_transform(documents)

    # Label every assignment with the document's ORIGINAL index label, so the
    # rows of the result can be joined back to ``df`` even though rows were
    # dropped during cleaning.
    assignments = pd.DataFrame({
        "index": cleaned.index.to_numpy(),
        "topic": topics,
        "weight": 1,
    })
    assignments = assignments.loc[assignments.topic != -1, :]
    doc_topics = pd.pivot_table(assignments, values='weight', fill_value=0, columns='topic', index='index')

    # get_topics() maps topic id -> list of (word, weight) pairs; flatten it
    # into one row per (topic, word), keeping the per-topic word order.
    topic_rows = [
        (topic_id, word, weight)
        for topic_id, terms in sorted(topic_model.get_topics().items())
        for word, weight in terms
    ]
    topic_defs = pd.DataFrame(topic_rows, columns=["topic", "word", "weight"])
    return doc_topics, topic_defs

In [39]:
# Fit the topic model on the loaded data; `result` is the 2-tuple returned by
# run_bert_topic: (document/topic matrix, topic definitions).
result = run_bert_topic(df)

Batches: 100%|██████████| 43/43 [00:01<00:00, 36.96it/s]
2022-08-08 16:03:12,333 - BERTopic - Transformed documents to Embeddings
2022-08-08 16:03:17,932 - BERTopic - Reduced dimensionality
2022-08-08 16:03:18,003 - BERTopic - Clustered reduced embeddings


In [41]:
# Display the document/topic matrix (first element of the returned tuple).
result[0]

topic,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1359,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1360,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1363,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Scratch check: set.union() merges two sets and drops duplicates.
set([1,2]).union(set([1,3]))

{1, 2, 3}

In [42]:
# Write the document/topic matrix to jasist_topics.csv in the working directory.
result[0].to_csv("jasist_topics.csv")

In [43]:
# Write the topic definitions (topic, word, weight) to jasist_topic_defs.csv.
result[1].to_csv("jasist_topic_defs.csv")