In [None]:
import sqlite3 as sql
import pandas as pd
import numpy as np
import logging
import time
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

In [None]:
from gensim.matutils import sparse2full 
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora
from gensim.utils import ClippedCorpus
from gensim.models.coherencemodel import CoherenceModel

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [None]:
import logging

# Route INFO-and-above log records to lda_model.log, each line carrying a
# timestamp and the record's level name ahead of the message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
    filename='lda_model.log',
)

In [None]:
# Path of the SQLite database file that holds the `articles` table queried below.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
local_db = '''/Data/samples/wiki/enwiki_articles_20200520.db'''

In [None]:
# English stop words from NLTK, held in a set because Text.tokenize tests
# membership once per word.
STOPWORDS = set(stopwords.words('english'))
# Matches a (possibly empty) run of the listed punctuation characters that is
# immediately followed by an underscore; Text.tokenize deletes every match.
# Written as a raw string so that `\[`, `\]` and `\|` reach the regex engine
# without relying on Python's deprecated handling of unknown string escapes.
# The compiled pattern is exactly the same as before.
# NOTE(review): despite the name, matches are replaced by '' (not a space), and
# the trailing `_` means punctuation not followed by an underscore is left
# untouched. Confirm intent before changing it — the saved dictionaries and
# corpora were presumably built with this exact tokenization.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]*_')
# Single shared WordNet lemmatizer; Text.tokenize calls it with pos 'v' for every word.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
def get_query(query, db):
    """Run a SQL query against a SQLite file and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement handed to ``pandas.read_sql_query``.
    db : str
        Path of the SQLite database file to open.

    Returns
    -------
    pandas.DataFrame
        The query result, with every column name lower-cased.
    """
    conn = sql.connect(db)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # sqlite3's `with connection:` only commits / rolls back the pending
        # transaction; it never closes the handle, so close it explicitly.
        conn.close()
    df.columns = [str(col).lower() for col in df.columns]
    return df

In [None]:
# Smoke test: pull 10 rows of `articles` to inspect the table's columns.
sample = '''SELECT * FROM articles LIMIT 10'''
df = get_query(sample, local_db)
df

In [None]:
# Show the raw `text` field of the first sampled row.
df.text[0]

In [None]:
class Text():
    """Re-iterable stream of tokenized article texts read lazily from SQLite.

    Every pass over the object visits each row id exactly once, in a freshly
    shuffled order, fetching one article at a time and yielding its token list.

    Parameters
    ----------
    row_ids : sequence of int
        ``rowid`` values of the ``articles`` rows to stream.
    db : str
        Path of the SQLite database file containing the ``articles`` table.
    """

    def __init__(self, row_ids, db):
        self.row_ids = row_ids
        self.db = db
        self.len = len(row_ids)

    def __iter__(self):
        """Yield one list of tokens per article, in random order."""
        row_ids_shuffled = np.random.choice(self.row_ids, self.len, replace=False)
        conn = sql.connect(self.db)
        try:
            for row_id in row_ids_shuffled:
                select = '''SELECT text FROM articles where rowid=%d''' % row_id
                doc = self.get_query(select, conn)
                yield self.tokenize(doc)
        finally:
            # sqlite3's `with connection:` never closes the handle. Closing it
            # here releases it after every pass, including passes abandoned
            # early (the generator's close() runs this block too).
            conn.close()

    def __len__(self):
        """Return the number of row ids (documents) in the stream."""
        return self.len

    def get_query(self, select, conn):
        """Run ``select`` on the open connection and return the first row's ``text``.

        Column names are lower-cased before the lookup. Raises ``IndexError``
        when the query returns no rows.
        """
        df = pd.read_sql_query(select, conn)
        df.columns = [str(col).lower() for col in df.columns]
        return df['text'].values[0]

    def tokenize(self, text):
        """Turn raw article text into a list of lower-case alphabetic tokens.

        Steps, in order: delete REPLACE_BY_SPACE_RE matches, lower-case, drop
        whitespace-separated words found in STOPWORDS, lemmatize the remaining
        words with pos 'v', then keep every run of three or more a-z letters.
        """
        text = REPLACE_BY_SPACE_RE.sub('', text).lower()
        # Filter and lemmatize in one pass; equivalent to joining and
        # re-splitting between the two steps.
        kept = [lemmatizer.lemmatize(word, 'v')
                for word in text.split()
                if word not in STOPWORDS]
        return re.findall('''[a-z]{3,}''', ' '.join(kept))

In [None]:
# Every rowid in `articles`, as a plain Python list; these ids are split into train/holdout below.
rowids = get_query('''SELECT rowid FROM articles''', local_db)['rowid'].tolist()

In [None]:
# 80/20 split of the row ids; the fixed random_state makes the split repeatable.
# NOTE(review): presumably this must match the split used when the saved
# train/holdout dictionaries and corpora were built — confirm.
train_ids, holdout_ids = train_test_split(rowids, train_size=0.8, random_state=123)

In [None]:
# Total number of article row ids (train + holdout).
len(rowids)

In [None]:
# Peek at the first five training row ids.
train_ids[:5]

In [None]:
# Peek at the first five holdout row ids.
holdout_ids[:5]

In [None]:
# Load the saved training-set dictionary. `load` is a classmethod, so it is
# called on the class itself instead of on a throwaway empty Dictionary instance.
train_dict = Dictionary.load('../output/train_set_wiki_dictionary_filtered_no_hyphens.dict')

In [None]:
# Print the dictionary's string form as a quick check that it loaded.
print(train_dict)

In [None]:
# Load the saved holdout-set dictionary. `load` is a classmethod, so it is
# called on the class itself instead of on a throwaway empty Dictionary instance.
holdout_dict = Dictionary.load('../output/holdout_set_wiki_dictionary_filtered_no_hyphens.dict')

In [None]:
# Print the dictionary's string form as a quick check that it loaded.
print(holdout_dict)

In [None]:
# Open the serialized Matrix Market corpus for the holdout set
# (MmCorpus is the same class that `corpora.MmCorpus` refers to).
holdout_corpus = MmCorpus('/Data/Corpora/holdout_set_wiki_corpus_no_hyphens.mm')

In [None]:
# Open the serialized Matrix Market corpus for the training set
# (MmCorpus is the same class that `corpora.MmCorpus` refers to).
train_corpus = MmCorpus('/Data/Corpora/train_set_wiki_corpus_no_hyphens.mm')

In [None]:
# Document counts of the two serialized corpora, shown as (train, holdout).
N_train, N_holdout = len(train_corpus), len(holdout_corpus)
(N_train, N_holdout)

In [None]:
# Lazy token stream over the holdout articles; passed as `texts` to CoherenceModel in the cells below.
holdout_text = Text(holdout_ids, local_db)

In [None]:
# Score each saved `lda_5percent_corpus_<k>topics` model with c_v coherence on
# the holdout texts, recording the score and the wall-clock minutes per k.
# (Nothing is trained here; the banner text is kept unchanged.)
num_topics = [50, 100, 250, 500]
coherence_time_5percent = []
coherence_val_5percent = []
for k in num_topics:
    print('--- Training on %s topics ---' % k)
    start_time = time.time()
    model = LdaModel.load('../models/lda_5percent_corpus_'+str(k)+'topics.model')
    # Top 100 words of every topic, with the weights dropped.
    topics = [
        [word for word, _ in topic]
        for topic_id, topic in model.show_topics(num_topics=k, num_words=100, formatted=False)
    ]
    coherence_model = CoherenceModel(
        topics=topics,
        texts=holdout_text,
        dictionary=holdout_dict,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    coherence_time = time.time()
    # The timer started before the model was loaded, so load time is included.
    coherence_minutes = round((coherence_time - start_time) / 60., 2)
    coherence_time_5percent.append(coherence_minutes)
    coherence_val_5percent.append(coherence)

    print('--- %s minutes to compute coherence ---' % coherence_minutes)
    print('--- LDA %s topics coherence = %s ---' % (k, coherence))

In [None]:
# Wall-clock minutes (model load + coherence) per topic count for the 5percent models.
coherence_time_5percent

In [None]:
# c_v coherence per topic count (50, 100, 250, 500) for the 5percent models.
coherence_val_5percent

In [None]:
# Score each saved `lda_10percent_corpus_<k>topics` model with c_v coherence on
# the holdout texts, recording the score and the wall-clock minutes per k.
# (Nothing is trained here; the banner text is kept unchanged.)
num_topics = [50, 100, 250, 500]
coherence_time_10percent = []
coherence_val_10percent = []
for k in num_topics:
    print('--- Training on %s topics ---' % k)
    start_time = time.time()
    model = LdaModel.load('../models/lda_10percent_corpus_'+str(k)+'topics.model')
    # Top 100 words of every topic, with the weights dropped.
    topics = [
        [word for word, _ in topic]
        for topic_id, topic in model.show_topics(num_topics=k, num_words=100, formatted=False)
    ]
    coherence_model = CoherenceModel(
        topics=topics,
        texts=holdout_text,
        dictionary=holdout_dict,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    coherence_time = time.time()
    # The timer started before the model was loaded, so load time is included.
    coherence_minutes = round((coherence_time - start_time) / 60., 2)
    coherence_time_10percent.append(coherence_minutes)
    coherence_val_10percent.append(coherence)

    print('--- %s minutes to compute coherence ---' % coherence_minutes)
    print('--- LDA %s topics coherence = %s ---' % (k, coherence))

In [None]:
# c_v coherence per topic count (50, 100, 250, 500) for the 10percent models.
# (This cell previously referenced `lda_time_10percent`, which is never defined
# in this notebook and fails on a fresh kernel.)
coherence_val_10percent

In [None]:
# Wall-clock minutes (model load + coherence) per topic count for the 10percent models.
coherence_time_10percent

In [None]:
# Score each saved `lda_25percent_corpus_<k>topics` model with c_v coherence on
# the holdout texts, recording the score and the wall-clock minutes per k.
# (Nothing is trained here; the banner text is kept unchanged.)
num_topics = [50, 100, 250, 500]
coherence_time_25percent = []
coherence_val_25percent = []
for k in num_topics:
    print('--- Training on %s topics ---' % k)
    start_time = time.time()
    model = LdaModel.load('../models/lda_25percent_corpus_'+str(k)+'topics.model')
    # Top 100 words of every topic, with the weights dropped. The CoherenceModel
    # is built only after this list exists, so it is never constructed from a
    # previous model's topics (or from an undefined name on a fresh kernel).
    topics = []
    for topic_id, topic in model.show_topics(num_topics=k, num_words=100, formatted=False):
        topic_words = [word for word, _ in topic]
        topics.append(topic_words)
    coherence_model = CoherenceModel(topics=topics, texts=holdout_text, dictionary = holdout_dict, coherence='c_v')
    coherence = coherence_model.get_coherence()
    coherence_time = time.time()
    # The timer started before the model was loaded, so load time is included.
    coherence_minutes = round((coherence_time-start_time)/60.,2)
    print('--- %s minutes to compute coherence ---' % coherence_minutes)
    print('--- LDA %s topics coherence = %s ---' % (k, coherence))

    coherence_time_25percent.append(coherence_minutes)
    coherence_val_25percent.append(coherence)

In [None]:
# Wall-clock minutes (model load + coherence) per topic count for the 25percent models.
coherence_time_25percent

In [None]:
# c_v coherence per topic count (50, 100, 250, 500) for the 25percent models.
coherence_val_25percent

In [None]:
# Score each saved `lda_50percent_corpus_<k>topics` model with c_v coherence on
# the holdout texts, recording the score and the wall-clock minutes per k.
# Progress messages mark each stage because these runs are long.
# (Nothing is trained here; the banner text is kept unchanged.)
num_topics = [50, 100, 250, 500]
coherence_time_50percent = []
coherence_val_50percent = []
for k in num_topics:
    print('--- Training on %s topics ---' % k)
    start_time = time.time()
    model = LdaModel.load('../models/lda_50percent_corpus_'+str(k)+'topics.model')
    print('model loaded...')
    # Top 100 words of every topic, with the weights dropped.
    topics = []
    for topic_id, topic in model.show_topics(num_topics=k, num_words=100, formatted=False):
        topic_words = [word for word, _ in topic]
        topics.append(topic_words)
    print('topics collected...')
    coherence_model = CoherenceModel(topics=topics, texts=holdout_text, dictionary = holdout_dict, coherence='c_v')
    print('coherence model built...')
    print('computing coherence...')
    coherence = coherence_model.get_coherence()
    coherence_time = time.time()
    # The timer started before the model was loaded, so load time is included.
    coherence_minutes = round((coherence_time-start_time)/60.,2)
    print('--- %s minutes to compute coherence ---' % coherence_minutes)
    print('--- LDA %s topics coherence = %s ---' % (k, coherence))

    coherence_time_50percent.append(coherence_minutes)
    coherence_val_50percent.append(coherence)

In [None]:
# Wall-clock minutes (model load + coherence) per topic count for the 50percent models.
coherence_time_50percent

In [None]:
# c_v coherence per topic count (50, 100, 250, 500) for the 50percent models.
coherence_val_50percent

In [None]:
# Score each saved `lda_100percent_corpus_<k>topics` model with c_v coherence on
# the holdout texts, recording the score and the wall-clock minutes per k.
# Progress messages mark each stage because these runs are long.
# (Nothing is trained here; the banner text is kept unchanged.)
num_topics = [50, 100, 250, 500]
coherence_time_100percent = []
coherence_val_100percent = []
for k in num_topics:
    print('--- Training on %s topics ---' % k)
    start_time = time.time()
    model = LdaModel.load('../models/lda_100percent_corpus_'+str(k)+'topics.model')
    print('model loaded...')
    # Top 100 words of every topic, with the weights dropped.
    topics = [
        [word for word, _ in topic]
        for topic_id, topic in model.show_topics(num_topics=k, num_words=100, formatted=False)
    ]
    print('topics collected...')
    coherence_model = CoherenceModel(
        topics=topics,
        texts=holdout_text,
        dictionary=holdout_dict,
        coherence='c_v',
    )
    print('coherence model built...')
    print('computing coherence...')
    coherence = coherence_model.get_coherence()
    coherence_time = time.time()
    # The timer started before the model was loaded, so load time is included.
    coherence_minutes = round((coherence_time - start_time) / 60., 2)
    coherence_time_100percent.append(coherence_minutes)
    coherence_val_100percent.append(coherence)

    print('--- %s minutes to compute coherence ---' % coherence_minutes)
    print('--- LDA %s topics coherence = %s ---' % (k, coherence))

In [None]:
# Wall-clock minutes (model load + coherence) per topic count for the 100percent models.
coherence_time_100percent

In [None]:
# c_v coherence per topic count (50, 100, 250, 500) for the 100percent models.
coherence_val_100percent