# Introduction

This notebook assumes:
1. You have already tokenized all the documents and stored them on disk in SpaCy's Doc format.
2. You have already trained an LDA model for each journal.

This notebook will:

1. Load the required SpaCy Docs.
2. Predict the topics of each document with the trained LDA models.
3. Save the results to a JSON file.

In [4]:
import pandas as pd
import sqlite3
import gensim
import nltk
import glob
import json
import pickle
from tqdm import tqdm_notebook as tn
import os
## Helpers

def save_pkl(target_object, filename):
    """Pickle ``target_object`` into the binary file at ``filename``.

    An existing file at that path is overwritten.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(target_object, sink)
        
def load_pkl(filename):
    """Load and return the object pickled in the file at ``filename``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails (the original left the handle open).

    NOTE: unpickling can execute arbitrary code; only use this on files that
    were produced by a trusted source.
    """
    with open(filename, "rb") as file:
        return pickle.load(file)

def save_json(target_object, filename):
    """Serialise ``target_object`` as JSON into the text file at ``filename``.

    An existing file at that path is overwritten.
    """
    with open(filename, mode='w') as sink:
        json.dump(target_object, sink)
        
def load_json(filename):
    """Parse the JSON text file at ``filename`` and return the decoded object."""
    with open(filename, mode='r') as source:
        return json.load(source)

## Preparing Data

In this step, we load the data from disk into memory and format it properly so that we can process it in the next "preprocessing" stage.

In [2]:
# Read the metadata tables from the training database into DataFrames.
con = sqlite3.connect("F:/FMR/data.sqlite")
db_authors = pd.read_sql_query("SELECT * from authors", con)
db_documents = pd.read_sql_query("SELECT * from documents", con)
data = db_documents  # short alias for the documents table
data.head()

Unnamed: 0,id,title,abstract,publication_date,submission_date,cover_url,full_url,first_page,last_page,pages,document_type,type,article_id,context_key,label,publication_title,submission_path,journal_id
0,1,Role-play and Use Case Cards for Requirements ...,<p>This paper presents a technique that uses r...,2006-01-01T00:00:00-08:00,2009-02-26T07:42:10-08:00,http://aisel.aisnet.org/acis2001/1,http://aisel.aisnet.org/cgi/viewcontent.cgi?ar...,,,,article,article,1001,742028,1,ACIS 2001 Proceedings,acis2001/1,1
1,2,Flexible Learning and Academic Performance in ...,<p>This research investigates the effectivenes...,2001-01-01T00:00:00-08:00,2009-02-26T22:04:53-08:00,http://aisel.aisnet.org/acis2001/10,http://aisel.aisnet.org/cgi/viewcontent.cgi?ar...,,,,article,article,1006,744077,10,ACIS 2001 Proceedings,acis2001/10,2
2,3,Proactive Metrics: A Framework for Managing IS...,<p>Managers of information systems development...,2001-01-01T00:00:00-08:00,2009-02-26T22:03:31-08:00,http://aisel.aisnet.org/acis2001/11,http://aisel.aisnet.org/cgi/viewcontent.cgi?ar...,,,,article,article,1005,744076,11,ACIS 2001 Proceedings,acis2001/11,3
3,4,Reuse in Information Systems Development: Clas...,<p>There has been a trend in recent years towa...,2001-01-01T00:00:00-08:00,2009-02-26T22:02:29-08:00,http://aisel.aisnet.org/acis2001/12,http://aisel.aisnet.org/cgi/viewcontent.cgi?ar...,,,,article,article,1004,744075,12,ACIS 2001 Proceedings,acis2001/12,4
4,5,Improving Software Development: The Prescripti...,<p>We describe the Prescriptive Simplified Met...,2001-01-01T00:00:00-08:00,2009-02-26T22:01:24-08:00,http://aisel.aisnet.org/acis2001/13,http://aisel.aisnet.org/cgi/viewcontent.cgi?ar...,,,,article,article,1003,744074,13,ACIS 2001 Proceedings,acis2001/13,5


## Loading SpaCy

In [3]:
import spacy
# Module-level spaCy pipeline, loaded through the 'en' shortcut. The
# tokenisation helpers and save_doc_dict below read this global `nlp`.
nlp = spacy.load('en')

## Determining Journals
We want to use a dedicated LDA model for each journal, so here we build the list of journal prefixes.

In [4]:
def get_name(s):
    """Return the journal prefix of a submission path.

    The prefix is everything before the first digit found in the first
    '/'-separated segment, e.g. ``'acis2001/1'`` -> ``'acis'``.

    Args:
        s: Submission path string such as ``'acis2001/1'``.

    Returns:
        The prefix string, or ``''`` when the first segment contains no digit
        (or starts with one).
    """
    head = s.split('/')[0]
    for position, char in enumerate(head):
        try:
            int(char)
        except ValueError:
            # Not a digit: keep scanning. Only ValueError is caught so that
            # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
            continue
        return s[:position]
    # No digit in the first segment: there is no prefix to report.
    return ''

# Collect the journal prefix of every document from its submission path.
# NOTE(review): the loop variable `i` stays bound at module level after this
# loop finishes, so later cells can see it as a global.
journals = []
for i in db_documents['submission_path']:
    journals.append(get_name(i))

In [5]:
# Collapse the per-document prefixes into the set of distinct journal prefixes.
journals = set(journals)

In [6]:
from gensim.models.phrases import Phraser, Phrases

In [7]:
from itertools import tee
import multiprocessing

# Use tn(iter, desc="Some text") to track progress
def gen_tokenized_dict_beta(untokenized_dict):
    """Tokenize every text of ``untokenized_dict`` in one ``nlp.pipe`` stream.

    Args:
        untokenized_dict: Mapping of document id -> raw text.

    Returns:
        Dict mapping each document id to the object ``nlp.pipe`` yielded for
        its text, in the iteration order of ``untokenized_dict``.
    """
    # Fixed: the original iterated the undefined name `untokenised`, so every
    # call raised NameError; the argument is what has to be split here.
    gen1, gen2 = tee(untokenized_dict.items())
    ids = (id_ for (id_, text) in gen1)
    texts = (text for (id_, text) in gen2)
    # `total` is passed explicitly because a generator has no length for the
    # progress bar to read.
    docs = nlp.pipe(tn(texts, desc="Tokenization", total=len(untokenized_dict)), n_threads=9)
    # ids and docs advance in lock-step, so zip pairs each id with its own doc.
    return dict(zip(ids, docs))

def gen_tokenized_dict(untokenized_dict):
    """Run ``nlp`` on each value of ``untokenized_dict``, one text at a time.

    Returns a new dict with the same keys whose values are the ``nlp`` results.
    """
    tokenized = {}
    for doc_id, raw_text in tn(untokenized_dict.items(), desc="Tokenization"):
        tokenized[doc_id] = nlp(raw_text)
    return tokenized

def gen_tokenized_dict_parallel(untokenized_dict): # Uses textblob
    """Build a ``TextBlob`` for every value of ``untokenized_dict`` in a process pool.

    One worker process is started per CPU reported by ``multiprocessing``.
    Returns a dict with the same keys whose values are the ``TextBlob`` results.

    NOTE(review): ``TextBlob`` is not imported in the visible part of this
    notebook - confirm it is in scope before calling this function.
    """
    worker_count = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=worker_count) as pool:
        blobs = pool.map(TextBlob, untokenized_dict.values())
        keyed_blobs = zip(untokenized_dict.keys(), blobs)
        return {doc_id: blob for doc_id, blob in tn(keyed_blobs, desc="Tokenization")}

def keep_journal(dict_, journal):
    """Keep only the entries of ``dict_`` whose key starts with ``journal``.

    Prints the size of the input and of the filtered result, then returns the
    filtered dict.
    """
    kept = {}
    for key, value in tn(dict_.items(), desc="Journal Filter"):
        if key.startswith(journal):
            kept[key] = value
    print("Original: ", len(dict_), ", Kept ", len(kept), " items.")
    return kept

In [8]:
import os
from spacy.tokens.doc import Doc
def save_doc_dict(d, folder_name):
    """Write every doc of ``d`` into a newly created folder ``folder_name``.

    The vectors of the global ``nlp`` vocabulary are dumped to ``vocab.bin``
    inside the folder, and each doc is written to its own ``.doc`` file named
    after its key with every '/' replaced by '-'.

    ``os.mkdir`` raises if ``folder_name`` already exists.
    """
    os.mkdir(folder_name)
    vocab_path = os.path.join(folder_name, 'vocab.bin')
    nlp.vocab.dump_vectors(vocab_path)
    for doc_key, doc in tn(d.items(), desc="Saving doc"):
        # '/' cannot appear in a file name, so it is swapped for '-'.
        file_name = doc_key.replace('/', '-') + '.doc'
        target_path = os.path.join(folder_name, file_name)
        with open(target_path, 'wb') as sink:
            sink.write(doc.to_bytes())
            
def load_doc_dict(folder_name):
    """Load every ``.doc`` file of ``folder_name`` into a dict of spaCy docs.

    The vectors stored in ``<folder_name>/vocab.bin`` are loaded into a
    freshly created pipeline's vocabulary before any doc is read.

    Args:
        folder_name: Folder containing ``vocab.bin`` and the ``*.doc`` files.

    Returns:
        Dict mapping each key to its ``Doc``. A key is the file name without
        its extension, with every '-' turned back into '/'. Keys that
        contained a '-' before saving are therefore not restored exactly.
    """
    # The original author marks this fresh load as essential.
    # NOTE(review): presumably so the docs are rebuilt against the vocabulary
    # loaded from disk below rather than the global one - confirm.
    nlp = spacy.load('en') # This is very important
    file_list = glob.glob(os.path.join(folder_name, "*.doc"))
    d = {}
    nlp.vocab.load_vectors_from_bin_loc(os.path.join(folder_name, 'vocab.bin'))
    for path in tn(file_list, desc="Loading doc"):
        # Fixed: the original split on '\\', which only strips the directory
        # on Windows; basename works with the separator of the running OS.
        # splitext drops only the trailing extension, whereas the original
        # replace('.doc', '') removed every occurrence inside the name.
        stem = os.path.splitext(os.path.basename(path))[0]
        key = stem.replace('-', '/')
        with open(path, 'rb') as f:
            for bs in Doc.read_bytes(f):
                d[key] = Doc(nlp.vocab).from_bytes(bs)
    return d

In [9]:
def pos_filter(l, pos="NOUN"):
    """Return the lower-cased lemmas of the alphabetic tokens tagged ``pos``.

    Args:
        l: Iterable of tokens exposing ``lemma_``, ``pos_`` and ``is_alpha``.
        pos: Part-of-speech tag to keep. Defaults to ``"NOUN"``.

    Returns:
        List of lower-cased lemma strings, in token order.
    """
    # Fixed: the original compared against the literal 'NOUN' and silently
    # ignored the `pos` argument.
    return [str(i.lemma_).lower() for i in l if i.pos_ == pos and i.is_alpha]

In [10]:
def bigram(corpus):
    """Fit a phrase model on ``corpus`` and apply it to every document.

    Returns a tuple ``(transformed_docs, make_bigram)``: the list obtained by
    indexing the ``Phraser`` with each document of ``corpus``, and the
    ``Phraser`` itself so it can be reused by the caller.
    """
    make_bigram = Phraser(Phrases(corpus))
    transformed_docs = []
    for doc_tokens in tn(corpus, desc='Bigram'):
        transformed_docs.append(make_bigram[doc_tokens])
    return transformed_docs, make_bigram

In [11]:
# LDA parameters. In the cells shown here only `num_topics` is read: it is
# part of the file name of the saved model that gets loaded.
# NOTE(review): the other values are presumably kept to mirror the training
# notebook - confirm they are still needed here.
num_topics = 150
chunksize = 2000
passes = 1
iterations = 150
eval_every = None  # Don't evaluate model perplexity, takes too much time.

In [None]:
import gensim.corpora
import pyLDAvis.gensim
import warnings
from imp import reload
from os.path import exists
warnings.filterwarnings("ignore")
def predict_journal(j):
    """Predict the LDA topic vector of every paper of one journal.

    The saved docs in folder ``j`` are loaded, reduced to lower-cased noun
    lemmas and merged into bigrams. Each paper is then converted to a
    bag-of-words with the dictionary saved in that folder and passed through
    the saved LDA model. The resulting vectors are written to
    ``<j>/paper_vec_lib.json``.

    Args:
        j: Journal prefix, which is also the folder holding the journal's
            docs, dictionary and LDA model.

    Returns:
        ``(None, None)`` when the dictionary has fewer than 10 entries and the
        journal is skipped; otherwise ``None``.
    """
    corpus = load_doc_dict(j)
    corpus = {k: pos_filter(v) for k, v in tn(corpus.items())}

    # Fit the bigram model on this journal's token lists. Only the fitted
    # model is needed here, the transformed list is discarded.
    _, make_bigram = bigram(list(corpus.values()))

    # Override the corpus with each paper's own bigram-merged tokens.
    # Fixed: the original indexed make_bigram with `i`, which is not bound in
    # this function, so every paper received the same tokens (or a NameError).
    corpus = {k: make_bigram[v] for k, v in tn(corpus.items())}
    dictionary = gensim.corpora.Dictionary.load(os.path.join(j, "_noun_bigram.ldamodel.dictionary"))
    # dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=None)
    if len(dictionary) < 10:
        print("Warning: dictionary only has " + str(len(dictionary)) + " items. Passing.")
        return None, None

    # Load the already trained LDA model for this journal (nothing is trained here).
    from gensim.models import LdaModel
    model = LdaModel.load(os.path.join(j, "_noun_bigram_" + str(num_topics) + ".ldamodel"))
    paper_vec_lib = {}
    for paper_path, tokens in tn(corpus.items(), desc="Predicting"):
        bow = dictionary.doc2bow(tokens)
        paper_vec_lib[paper_path] = model[bow]
    save_json(paper_vec_lib, os.path.join(j, "paper_vec_lib.json"))

# Drop empty prefixes, then predict topics for every journal that does not
# have a result file yet. A failure in one journal is printed and the loop
# moves on to the next one.
journals = {prefix for prefix in journals if prefix}
for j in tn(journals, desc="Journal"):
    if exists(os.path.join(j, "paper_vec_lib.json")):
        print(j, "already exists.")
        continue
    try:
        predict_journal(j)
    except Exception as e:
        print(e)

globdev already exists.
icmb already exists.
digit already exists.
bled already exists.
sais already exists.
sighci already exists.
Could not open binary file b'amcis\\vocab.bin'
isd already exists.
icdss already exists.
iris already exists.
sprouts_proceedings_siggreen_ already exists.
wisp already exists.
sbis already exists.
mg already exists.
ukais already exists.
Could not open binary file b'ecis\\vocab.bin'


# Loading the JSONs

In [5]:
def load_paper_topic_lib(folder_name):
    """Return the decoded content of ``<folder_name>/paper_vec_lib.json``."""
    json_path = os.path.join(folder_name, "paper_vec_lib.json")
    return load_json(json_path)

In [6]:
# Load the saved topic vectors from the 'digit' journal folder.
digit_paper_topic_lib = load_paper_topic_lib('digit')

In [7]:
# Show the paper keys available in the loaded library.
digit_paper_topic_lib.keys()

dict_keys(['digit2015/3', 'digit2007/12', 'digit2008/5', 'digit2010/13', 'digit2004/4', 'digit2013/12', 'digit2007/7', 'digit2014/6', 'digit2006/6', 'digit2010/2', 'digit2008/3', 'digit2006/2', 'digit2015/10', 'digit2001/2', 'digit2007/9', 'digit2015/4', 'digit2003/3', 'digit2015/13', 'digit2013/4', 'digit2015/14', 'digit2002/2', 'digit2004/5', 'digit2005/4', 'digit2003/2', 'digit2003/4', 'digit2004/2', 'digit2013/11', 'digit2014/7', 'digit2015/11', 'digit2007/16', 'digit2013/8', 'digit2015/16', 'digit2007/14', 'digit2015/8', 'digit2009/6', 'digit2013/3', 'digit2006/3', 'digit2009/2', 'digit2014/9', 'digit2015/1', 'digit2013/2', 'digit2004/3', 'digit2008/9', 'digit2009/1', 'digit2008/6', 'digit2007/10', 'digit2015/5', 'digit2005/1', 'digit2015/2', 'digit2009/5', 'digit2008/7', 'digit2006/8', 'digit2007/4', 'digit2013/1', 'digit2010/11', 'digit2007/6', 'digit2009/10', 'digit2010/10', 'digit2006/5', 'digit2010/1', 'digit2010/15', 'digit2010/14', 'digit2014/4', 'digit2013/5', 'digit2002/1

In [8]:
# Look up the topic vector stored for a single paper.
digit_paper_topic_lib['digit2008/4']

[[18, 0.17483466083199492],
 [39, 0.3763245523057287],
 [73, 0.25043883246204185]]

# Interpreting the Results

Each entry of a paper's list is a `[topic, confidence]` pair:
```
# Topic, Confidence
[[18, 0.17483466083199492],
 [39, 0.3763245523057287],
 [73, 0.25043883246204185]]
```