# Topic Modelling on `data_schoolofinf`

Using the scraped metadata and the downloaded PDFs, we create the topic models.


0. using `gensim`, create the corpus, vocabulary
1. use `LDAtuning` (in R) to find the best number of topics available
2. create topic model using `gensim`
3. visualise the results using `lda2vis`

In [1]:
# Configure root logging at INFO level with a "time : level : message" layout,
# so library log records show up in the cell outputs.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pickle as pkl
import pandas as pd
from numpy.random import RandomState
# Seeded random state. NOTE(review): not referenced by any cell visible in this notebook.
rng = RandomState(93748573)
import os

# Root folder of the data_schoolofinf data set (path is relative to the working directory).
DATA_DIR = '../../data/data_schoolofinf/'

In [2]:
from gensim.corpora import Dictionary

2018-02-01 00:03:31,707 : INFO : 'pattern' package not found; tag filters are not available for English


### Combining all the tokens together:

In [3]:
# Read the pickled token table (indexed by pub_id) from DATA_DIR/toks.
combined_toks_path = os.path.join(DATA_DIR, 'toks', 'toks.combined.pkl')
df_combined_toks = pd.read_pickle(combined_toks_path)

In [4]:
# Peek at the first two publications to check the column layout.
df_combined_toks.head(2)

Unnamed: 0_level_0,year,toks_metada,toks_pdf2txt
pub_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
400818dc-63af-4a26-80c5-906f98e1f8ab,1989,"[ballooning, stability, analysis, jet, hmode, ...",
18b1a861-afef-4fff-bc80-d02e05be18c4,2013,"[query, processing, data, integration, chapter...",


In [5]:
# Concatenate the metadata tokens and the PDF tokens of every publication into
# a single token list per row.
# The two columns are walked in lockstep instead of using a row-wise
# DataFrame.apply(axis=1): apply materialises a Series for each row and then
# has to infer how to reassemble the returned lists. Wrapping the result in an
# object-dtype Series on the frame's own index guarantees one list per row.
df_combined_toks['toks'] = pd.Series(
    [list(meta_toks) + list(pdf_toks)
     for meta_toks, pdf_toks in zip(df_combined_toks['toks_metada'],
                                    df_combined_toks['toks_pdf2txt'])],
    index=df_combined_toks.index,
    dtype=object)

## Using publications from 1997-2017

In [6]:
# Keep only publications from 1997 to 2017 (both years included).
# A boolean mask selects the rows directly. Dropping by index label (the
# previous approach) removes every row that shares a label with an
# out-of-range row, which is wrong if pub_id is ever duplicated.
year_in_range = df_combined_toks['year'].between(1997, 2017)
df_combined_toks = df_combined_toks.loc[year_in_range].copy()

In [7]:
# Summarise the frame after the year filter (row count, dtypes, non-null counts).
df_combined_toks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8028 entries, 18b1a861-afef-4fff-bc80-d02e05be18c4 to b2920a27-5293-4f4a-8874-4a0ea804d91a
Data columns (total 4 columns):
year            8028 non-null int64
toks_metada     8028 non-null object
toks_pdf2txt    8028 non-null object
toks            8028 non-null object
dtypes: int64(1), object(3)
memory usage: 313.6+ KB


### Model 1: metadata + PDF

In [8]:
# One token list per publication (metadata + PDF text).
docs = df_combined_toks['toks'].tolist()

# Build the vocabulary, then prune it by *document frequency*:
#   no_below=10  -> discard tokens found in fewer than 10 documents
#   no_above=0.5 -> discard tokens found in more than 50% of the documents
combined_toks_dict = Dictionary(documents=docs)
combined_toks_dict.filter_extremes(no_above=0.5, no_below=10)
# Reassign token ids so that they are contiguous after pruning.
combined_toks_dict.compactify()

2018-01-31 23:53:23,656 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-01-31 23:53:41,514 : INFO : built Dictionary(636191 unique tokens: ['query', 'processing', 'data', 'integration', 'chapter']...) from 8028 documents (total 25623592 corpus positions)
2018-01-31 23:53:42,445 : INFO : discarding 581457 tokens: [('data', 4317), ('approach', 4177), ('based', 4816), ('access', 4192), ('model', 4714), ('system', 5123), ('paper', 4628), ('using', 4803), ('supervectors', 8), ('use', 4212)]...
2018-01-31 23:53:42,446 : INFO : keeping 54734 tokens which were in no less than 10 and no more than 4014 (=50.0%) documents
2018-01-31 23:53:42,633 : INFO : resulting dictionary: Dictionary(54734 unique tokens: ['query', 'processing', 'integration', 'chapter', 'illustrate']...)


In [20]:
# # Create a bow tagging for each publication:
# df_combined_toks['bow'] = df_combined_toks['toks'].apply(combined_toks_dict.doc2bow)

# # Generate a corpus based on the tokens, which we will be using later
# corpus = df_combined_toks.bow.tolist()

In [9]:
# Persist the dictionary (the token <-> id mapping, not the corpus) to disk.
dict_path = os.path.join(DATA_DIR, 'corpora', 'dictionary.all')
combined_toks_dict.save(dict_path)

2018-01-31 23:00:46,788 : INFO : saving Dictionary object under ../../data/data_schoolofinf/corpora/dictionary.all, separately None
2018-01-31 23:00:46,840 : INFO : saved ../../data/data_schoolofinf/corpora/dictionary.all


In [19]:
# Export every publication as one line of space-separated token ids, each id
# repeated once per occurrence. Publications whose bag-of-words is empty are
# skipped, so line numbers in the file do not map 1:1 onto rows of `docs`.
bow_path = os.path.join(DATA_DIR, 'toks', 'bow2idx.all')
with open(bow_path, 'w') as fout:
    for pub in docs:
        bow = combined_toks_dict.doc2bow(pub)  # list of (token_id, count)
        if not bow:
            continue
        repeated_ids = [str(tok_id) for tok_id, count in bow for _ in range(count)]
        fout.write(" ".join(repeated_ids) + "\n")

### Model 2: only Metadata

In [20]:
# Metadata-only frame: publication year plus the metadata tokens.
df_metadata = df_combined_toks.loc[:, ['year', 'toks_metada']]

In [21]:
# One token list per publication (metadata only). Note that this rebinds `docs`.
docs = df_metadata['toks_metada'].tolist()

# Build the vocabulary, then prune it by *document frequency*:
#   no_below=10  -> discard tokens found in fewer than 10 documents
#   no_above=0.5 -> discard tokens found in more than 50% of the documents
toks_dict = Dictionary(documents=docs)
toks_dict.filter_extremes(no_above=0.5, no_below=10)
# Reassign token ids so that they are contiguous after pruning.
toks_dict.compactify()

2018-02-01 00:00:38,208 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-02-01 00:00:39,055 : INFO : built Dictionary(26756 unique tokens: ['query', 'processing', 'data', 'integration', 'chapter']...) from 8028 documents (total 719861 corpus positions)
2018-02-01 00:00:39,089 : INFO : discarding 20534 tokens: [('chase', 9), ('sv', 2), ('ubm', 2), ('supervectors', 2), ('synthesizer', 9), ('eer', 4), ('wsj', 7), ('rps', 2), ('openairinterface', 3), ('academia', 6)]...
2018-02-01 00:00:39,090 : INFO : keeping 6222 tokens which were in no less than 10 and no more than 4014 (=50.0%) documents
2018-02-01 00:00:39,099 : INFO : resulting dictionary: Dictionary(6222 unique tokens: ['query', 'processing', 'data', 'integration', 'chapter']...)


In [22]:
# # Create a bow tagging for each publication:
# df_metadata['bow'] = df_metadata['toks_metada'].apply(toks_dict.doc2bow)

# # Generate a corpus based on the tokens, which we will be using later
# corpus = df_metadata.bow.tolist()

In [23]:
# Persist the metadata-only dictionary (the token <-> id mapping, not the corpus).
dict_path = os.path.join(DATA_DIR, 'corpora', 'dictionary.meta')
toks_dict.save(dict_path)

2018-02-01 00:00:39,135 : INFO : saving Dictionary object under ../../data/data_schoolofinf/corpora/dictionary.meta, separately None
2018-02-01 00:00:39,139 : INFO : saved ../../data/data_schoolofinf/corpora/dictionary.meta


In [24]:
# Export every publication as one line of space-separated token ids, each id
# repeated once per occurrence. Publications whose bag-of-words is empty are
# skipped, so line numbers in the file do not map 1:1 onto rows of `docs`.
bow_path = os.path.join(DATA_DIR, 'toks', 'bow2idx.meta')
with open(bow_path, 'w') as fout:
    for pub in docs:
        bow = toks_dict.doc2bow(pub)  # list of (token_id, count)
        if not bow:
            continue
        repeated_ids = [str(tok_id) for tok_id, count in bow for _ in range(count)]
        fout.write(" ".join(repeated_ids) + "\n")

## Restricting to publications from 2012-2017

In [25]:
# Narrow the frame further to publications from 2012 to 2017 (both included).
# A boolean mask selects the rows directly. Dropping by index label (the
# previous approach) removes every row that shares a label with an
# out-of-range row, which is wrong if pub_id is ever duplicated.
year_in_range = df_combined_toks['year'].between(2012, 2017)
df_combined_toks = df_combined_toks.loc[year_in_range].copy()

In [26]:
# Confirm which years survived the filter (in order of first appearance).
# `.tolist()` converts the NumPy integers to plain Python ints, so the printed
# list reads `[2013, 2012, ...]` on every NumPy version; NumPy 2 would print
# the scalars held by `list(ndarray)` as `np.int64(2013)`.
print(df_combined_toks['year'].unique().tolist())

[2013, 2012, 2014, 2016, 2015, 2017]


In [27]:
# Summarise the frame after the 2012-2017 filter (row count, dtypes, non-null counts).
df_combined_toks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3620 entries, 18b1a861-afef-4fff-bc80-d02e05be18c4 to b2920a27-5293-4f4a-8874-4a0ea804d91a
Data columns (total 4 columns):
year            3620 non-null int64
toks_metada     3620 non-null object
toks_pdf2txt    3620 non-null object
toks            3620 non-null object
dtypes: int64(1), object(3)
memory usage: 141.4+ KB


### Model 3: metadata + PDF 

In [28]:
# One token list per 2012-2017 publication (metadata + PDF text).
# Note: `docs` and `combined_toks_dict` are rebound here, replacing the Model 1 objects.
docs = df_combined_toks['toks'].tolist()

# Build the vocabulary, then prune it by *document frequency*:
#   no_below=10  -> discard tokens found in fewer than 10 documents
#   no_above=0.5 -> discard tokens found in more than 50% of the documents
combined_toks_dict = Dictionary(documents=docs)
combined_toks_dict.filter_extremes(no_above=0.5, no_below=10)
# Reassign token ids so that they are contiguous after pruning.
combined_toks_dict.compactify()

2018-02-01 00:00:56,661 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-02-01 00:01:05,770 : INFO : built Dictionary(376785 unique tokens: ['query', 'processing', 'data', 'integration', 'chapter']...) from 3620 documents (total 13084978 corpus positions)
2018-02-01 00:01:06,334 : INFO : discarding 338843 tokens: [('data', 2339), ('part', 1843), ('problem', 1981), ('approach', 2225), ('based', 2528), ('access', 2347), ('second', 1869), ('model', 2398), ('system', 2553), ('informatik_germany', 8)]...
2018-02-01 00:01:06,335 : INFO : keeping 37942 tokens which were in no less than 10 and no more than 1810 (=50.0%) documents
2018-02-01 00:01:06,452 : INFO : resulting dictionary: Dictionary(37942 unique tokens: ['query', 'processing', 'integration', 'chapter', 'illustrate']...)


In [29]:
# # Create a bow tagging for each publication:
# df_combined_toks['bow'] = df_combined_toks['toks'].apply(combined_toks_dict.doc2bow)

# # Generate a corpus based on the tokens, which we will be using later
# corpus = df_combined_toks.bow.tolist()

In [30]:
# Persist the 2012-2017 dictionary (the token <-> id mapping, not the corpus).
dict_path = os.path.join(DATA_DIR, 'corpora', 'dictionary.less.all')
combined_toks_dict.save(dict_path)

2018-02-01 00:01:06,525 : INFO : saving Dictionary object under ../../data/data_schoolofinf/corpora/dictionary.less.all, separately None
2018-02-01 00:01:06,544 : INFO : saved ../../data/data_schoolofinf/corpora/dictionary.less.all


In [31]:
# Export every publication as one line of space-separated token ids, each id
# repeated once per occurrence. Publications whose bag-of-words is empty are
# skipped, so line numbers in the file do not map 1:1 onto rows of `docs`.
bow_path = os.path.join(DATA_DIR, 'toks', 'bow2idx.less.all')
with open(bow_path, 'w') as fout:
    for pub in docs:
        bow = combined_toks_dict.doc2bow(pub)  # list of (token_id, count)
        if not bow:
            continue
        repeated_ids = [str(tok_id) for tok_id, count in bow for _ in range(count)]
        fout.write(" ".join(repeated_ids) + "\n")

# Topic models for dblp

In [3]:
# Point DATA_DIR at the DBLP data. This rebinds the name used by the cells above,
# so every later os.path.join(DATA_DIR, ...) resolves under data_dblp.
DATA_DIR = '../../data/data_dblp/'

In [5]:
# Read the pickled DBLP token table from DATA_DIR/toks.
dblp_toks_path = os.path.join(DATA_DIR, 'toks', 'toks.dblp.1997-2017.pkl')
df_dblp = pd.read_pickle(dblp_toks_path)

In [6]:
# Peek at the first three DBLP records.
df_dblp.head(3)

Unnamed: 0_level_0,year,toks
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00745041-3636-4d18-bbec-783c4278c40d,2003,"[self, stabilizing, algorithm, finding, cuttin..."
00dc2bba-3237-4d4e-b541-1205b97df981,2003,"[software, evolution, transformation, electron..."
04136c62-06a9-4c74-9da2-547448a9dc6f,2003,"[kernel, pls, variant, regression, european, s..."


In [11]:
# One token list per DBLP record.
docs_dblp = df_dblp['toks'].tolist()

# Build the vocabulary, then prune it by *document frequency*, using stricter
# thresholds than for the data_schoolofinf models:
#   no_below=50  -> discard tokens found in fewer than 50 documents
#   no_above=0.3 -> discard tokens found in more than 30% of the documents
# NOTE(review): filter_extremes also applies its default `keep_n` cap on the
# vocabulary size - confirm that is intended for a vocabulary this large.
dict_dblp = Dictionary(documents=docs_dblp)
dict_dblp.filter_extremes(no_above=0.3, no_below=50)
# Reassign token ids so that they are contiguous after pruning.
dict_dblp.compactify()

2018-01-31 21:30:31,153 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-01-31 21:30:31,688 : INFO : adding document #10000 to Dictionary(33842 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:30:32,698 : INFO : adding document #20000 to Dictionary(58906 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:30:33,718 : INFO : adding document #30000 to Dictionary(77373 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:30:34,985 : INFO : adding document #40000 to Dictionary(96811 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:30:36,217 : INFO : adding document #50000 to Dictionary(112050 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:30:37,546 : INFO : adding document #60000 to Dictionary(125243 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'c

2018-01-31 21:31:47,162 : INFO : adding document #540000 to Dictionary(387080 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:48,761 : INFO : adding document #550000 to Dictionary(390017 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:50,203 : INFO : adding document #560000 to Dictionary(392926 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:51,792 : INFO : adding document #570000 to Dictionary(395888 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:53,305 : INFO : adding document #580000 to Dictionary(398865 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:54,720 : INFO : adding document #590000 to Dictionary(401849 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:31:56,245 : INFO : adding document #600000 to Dict

2018-01-31 21:33:03,293 : INFO : adding document #1070000 to Dictionary(535314 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:04,820 : INFO : adding document #1080000 to Dictionary(537610 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:06,291 : INFO : adding document #1090000 to Dictionary(539797 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:08,079 : INFO : adding document #1100000 to Dictionary(542625 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:09,707 : INFO : adding document #1110000 to Dictionary(545907 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:11,429 : INFO : adding document #1120000 to Dictionary(548859 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:33:12,948 : INFO : adding document #1130000 

2018-01-31 21:34:23,397 : INFO : adding document #1600000 to Dictionary(656991 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:24,196 : INFO : adding document #1610000 to Dictionary(658353 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:24,842 : INFO : adding document #1620000 to Dictionary(660006 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:25,655 : INFO : adding document #1630000 to Dictionary(661852 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:26,831 : INFO : adding document #1640000 to Dictionary(663729 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:28,075 : INFO : adding document #1650000 to Dictionary(665636 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:34:29,633 : INFO : adding document #1660000 

2018-01-31 21:35:38,632 : INFO : adding document #2130000 to Dictionary(758067 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:40,011 : INFO : adding document #2140000 to Dictionary(759901 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:41,261 : INFO : adding document #2150000 to Dictionary(761609 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:42,574 : INFO : adding document #2160000 to Dictionary(763312 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:43,993 : INFO : adding document #2170000 to Dictionary(765049 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:45,284 : INFO : adding document #2180000 to Dictionary(766636 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:35:46,731 : INFO : adding document #2190000 

2018-01-31 21:36:51,852 : INFO : adding document #2660000 to Dictionary(846412 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:36:53,169 : INFO : adding document #2670000 to Dictionary(848269 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:36:54,602 : INFO : adding document #2680000 to Dictionary(850496 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:36:56,162 : INFO : adding document #2690000 to Dictionary(852282 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:36:57,684 : INFO : adding document #2700000 to Dictionary(854102 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:36:59,478 : INFO : adding document #2710000 to Dictionary(855910 unique tokens: ['self', 'stabilizing', 'algorithm', 'finding', 'cutting']...)
2018-01-31 21:37:01,116 : INFO : adding document #2720000 

In [12]:
# Persist the DBLP dictionary to disk.
dblp_dict_path = os.path.join(DATA_DIR, 'corpora', 'dictionary.dblp.1997-2017')
dict_dblp.save(dblp_dict_path)

2018-01-31 21:39:38,604 : INFO : saving Dictionary object under ../../data/data_dblp/corpora/dictionary.dblp.1997-2017, separately None
2018-01-31 21:39:38,726 : INFO : saved ../../data/data_dblp/corpora/dictionary.dblp.1997-2017


In [13]:
# Also write the DBLP dictionary mapping out as a text file.
dblp_dict_txt_path = os.path.join(DATA_DIR, 'corpora', 'dictionary.dblp.1997-2017.txt')
dict_dblp.save_as_text(dblp_dict_txt_path)

2018-01-31 21:41:56,899 : INFO : saving dictionary mapping to ../../data/data_dblp/corpora/dictionary.dblp.1997-2017.txt


In [23]:
# Spot check: the empty result below means 'we' has no id in the filtered dictionary.
dict_dblp.doc2bow(['we'])

[]

In [8]:
# Export every DBLP record as one line of space-separated token ids, each id
# repeated once per occurrence. Records whose bag-of-words is empty are
# skipped, so line numbers in the file do not map 1:1 onto rows of `docs_dblp`.
bow_path = os.path.join(DATA_DIR, 'toks', 'bow2idx.all')
with open(bow_path, 'w') as fout:
    for pub in docs_dblp:
        bow = dict_dblp.doc2bow(pub)  # list of (token_id, count)
        if not bow:
            continue
        repeated_ids = [str(tok_id) for tok_id, count in bow for _ in range(count)]
        fout.write(" ".join(repeated_ids) + "\n")

In [None]:
# NOTE(review): this cell is R, not Python - it cannot run in this notebook's
# Python kernel; presumably it belongs to the R session used for topic-number tuning.
# Sum each row of `dtm` (defined outside this notebook) and keep only the rows
# whose total is greater than zero.
rowTotals <- apply(dtm , 1, sum)
dtm.new   <- dtm[rowTotals> 0, ]