In [None]:
import sqlite3 as sql
import pandas as pd
import numpy as np
import logging
import time
import re
import random
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

In [None]:
from gensim.matutils import sparse2full 
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora
from gensim.utils import ClippedCorpus
from gensim.models.coherencemodel import CoherenceModel

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [None]:
# NLTK's English stop-word list, stored as a set.
STOPWORDS = {*stopwords.words('english')}
# Raw string so the regex escapes (\[ \] \|) reach `re` untouched instead of being parsed as
# invalid Python string escapes, which triggers a warning on recent Python versions.
# NOTE(review): as written, the pattern only matches when an underscore follows the optional
# run of punctuation -- confirm this is intended rather than a plain punctuation class.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]*_')
# WordNet lemmatizer instance, built from the class imported at the top of the notebook.
lemmatizer = WordNetLemmatizer()

In [None]:
# Location of the SQLite database file that the query cells read from.
local_db = '/Data/samples/wiki/enwiki_articles_20200520.db'

In [None]:
def get_query(query, db, params=None):
    """Run a SQL query against a SQLite database file and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db : str
        Path of the SQLite database, passed to ``sqlite3.connect``.
    params : sequence or mapping, optional
        Values bound to placeholders in ``query``; forwarded unchanged to
        ``pandas.read_sql_query``. Defaults to None (no bound values).

    Returns
    -------
    pandas.DataFrame
        Query result with every column name converted to a lower-case string.
    """
    # sqlite3's `with conn:` only commits or rolls back the transaction; it never closes the
    # connection, so close it explicitly to avoid leaking a handle on every call.
    conn = sql.connect(db)
    try:
        df = pd.read_sql_query(query, conn, params=params)
    finally:
        conn.close()
    df.columns = [str(col).lower() for col in df.columns]
    return df

In [None]:
# Fetch the title column for every row of `articles`, then preview the first five rows.
title_query = 'SELECT title FROM articles'
titles = get_query(query=title_query, db=local_db)
titles.head(n=5)

ONE HUNDRED PERCENT TRAINING CORPUS

In [None]:
# Load the saved 100%-corpus LDA model and its companion CSV; the scoring cell below looks up
# rows of `topics` by topic position.
model = LdaModel.load(fname='../models/new/lda_100percent_corpus_50topics.model')
topics = pd.read_csv(filepath_or_buffer='../models/new/lda_100percent_corpus_50topics.csv')

In [None]:
# Score one article (selected by rowid) with the currently loaded model and print its
# highest-weighted topics.
# NOTE(review): `Corpus` and `holdout_dict` are not defined or imported in the cells shown;
# they must already exist in the kernel for this cell to run -- confirm where they come from.
test_rowid = 100
test_query = '''SELECT * FROM articles where rowid=%d''' % test_rowid
test_article = get_query(test_query, local_db)
test_tokens = next(iter(Corpus([test_rowid], local_db, holdout_dict)))
test_topics = model[test_tokens]
print(test_article.title.values[0])
ranked_topics = sorted(test_topics, key=lambda pair: pair[1], reverse=True)
for topic_id, contribution in ranked_topics[:20]:
    # Presumably the second CSV column holds the topic's words as a bracketed,
    # comma-separated string; keep the first five and drop their quote characters.
    raw_words = topics.iloc[topic_id].values[1].strip('][').split(', ')[:5]
    top_words = [word.strip("''") for word in raw_words]
    print('Contribution:', np.round(contribution, 2), '    Topic:', top_words)

FIFTY PERCENT TRAINING CORPUS

In [None]:
# Load the saved 50%-corpus LDA model and its companion CSV; the scoring cell below looks up
# rows of `topics` by topic position.
model = LdaModel.load(fname='../models/new/lda_50percent_corpus_50topics.model')
topics = pd.read_csv(filepath_or_buffer='../models/new/lda_50percent_corpus_50topics.csv')

In [None]:
# Score one article (selected by rowid) with the currently loaded model and print its
# highest-weighted topics.
# NOTE(review): `Corpus` and `holdout_dict` are not defined or imported in the cells shown;
# they must already exist in the kernel for this cell to run -- confirm where they come from.
test_rowid = 100
test_query = '''SELECT * FROM articles where rowid=%d''' % test_rowid
test_article = get_query(test_query, local_db)
test_tokens = next(iter(Corpus([test_rowid], local_db, holdout_dict)))
test_topics = model[test_tokens]
print(test_article.title.values[0])
ranked_topics = sorted(test_topics, key=lambda pair: pair[1], reverse=True)
for topic_id, contribution in ranked_topics[:20]:
    # Presumably the second CSV column holds the topic's words as a bracketed,
    # comma-separated string; keep the first five and drop their quote characters.
    raw_words = topics.iloc[topic_id].values[1].strip('][').split(', ')[:5]
    top_words = [word.strip("''") for word in raw_words]
    print('Contribution:', np.round(contribution, 2), '    Topic:', top_words)

TWENTY-FIVE PERCENT TRAINING CORPUS

In [None]:
# Load the saved 25%-corpus LDA model and its companion CSV; the scoring cell below looks up
# rows of `topics` by topic position.
# NOTE(review): unlike the other model cells, these paths have no `new/` directory component --
# confirm this is the intended artifact and not a stale one.
model = LdaModel.load(fname='../models/lda_25percent_corpus_50topics.model')
topics = pd.read_csv(filepath_or_buffer='../models/lda_25percent_corpus_50topics.csv')

In [None]:
# Score one article (selected by rowid) with the currently loaded model and print its
# highest-weighted topics.
# NOTE(review): `Corpus` and `holdout_dict` are not defined or imported in the cells shown;
# they must already exist in the kernel for this cell to run -- confirm where they come from.
test_rowid = 100
test_query = '''SELECT * FROM articles where rowid=%d''' % test_rowid
test_article = get_query(test_query, local_db)
test_tokens = next(iter(Corpus([test_rowid], local_db, holdout_dict)))
test_topics = model[test_tokens]
print(test_article.title.values[0])
ranked_topics = sorted(test_topics, key=lambda pair: pair[1], reverse=True)
for topic_id, contribution in ranked_topics[:20]:
    # Presumably the second CSV column holds the topic's words as a bracketed,
    # comma-separated string; keep the first five and drop their quote characters.
    raw_words = topics.iloc[topic_id].values[1].strip('][').split(', ')[:5]
    top_words = [word.strip("''") for word in raw_words]
    print('Contribution:', np.round(contribution, 2), '    Topic:', top_words)

FIVE PERCENT TRAINING CORPUS

In [None]:
# Load the saved 5%-corpus LDA model and its companion CSV; the scoring cell below looks up
# rows of `topics` by topic position.
model = LdaModel.load(fname='../models/new/lda_5percent_corpus_50topics.model')
topics = pd.read_csv(filepath_or_buffer='../models/new/lda_5percent_corpus_50topics.csv')

In [None]:
# Score one article (selected by rowid) with the currently loaded model and print its
# highest-weighted topics.
# NOTE(review): `Corpus` and `holdout_dict` are not defined or imported in the cells shown;
# they must already exist in the kernel for this cell to run -- confirm where they come from.
test_rowid = 100
test_query = '''SELECT * FROM articles where rowid=%d''' % test_rowid
test_article = get_query(test_query, local_db)
test_tokens = next(iter(Corpus([test_rowid], local_db, holdout_dict)))
test_topics = model[test_tokens]
print(test_article.title.values[0])
ranked_topics = sorted(test_topics, key=lambda pair: pair[1], reverse=True)
for topic_id, contribution in ranked_topics[:20]:
    # Presumably the second CSV column holds the topic's words as a bracketed,
    # comma-separated string; keep the first five and drop their quote characters.
    raw_words = topics.iloc[topic_id].values[1].strip('][').split(', ')[:5]
    top_words = [word.strip("''") for word in raw_words]
    print('Contribution:', np.round(contribution, 2), '    Topic:', top_words)

ONE PERCENT TRAINING CORPUS

In [None]:
# Load the saved 1%-corpus LDA model and its companion CSV; the scoring cell below looks up
# rows of `topics` by topic position.
model = LdaModel.load(fname='../models/new/lda_1percent_corpus_50topics.model')
topics = pd.read_csv(filepath_or_buffer='../models/new/lda_1percent_corpus_50topics.csv')

In [None]:
# Score one article (selected by rowid) with the currently loaded model and print its
# highest-weighted topics.
# NOTE(review): `Corpus` and `holdout_dict` are not defined or imported in the cells shown;
# they must already exist in the kernel for this cell to run -- confirm where they come from.
test_rowid = 100
test_query = '''SELECT * FROM articles where rowid=%d''' % test_rowid
test_article = get_query(test_query, local_db)
test_tokens = next(iter(Corpus([test_rowid], local_db, holdout_dict)))
test_topics = model[test_tokens]
print(test_article.title.values[0])
ranked_topics = sorted(test_topics, key=lambda pair: pair[1], reverse=True)
for topic_id, contribution in ranked_topics[:20]:
    # Presumably the second CSV column holds the topic's words as a bracketed,
    # comma-separated string; keep the first five and drop their quote characters.
    raw_words = topics.iloc[topic_id].values[1].strip('][').split(', ')[:5]
    top_words = [word.strip("''") for word in raw_words]
    print('Contribution:', np.round(contribution, 2), '    Topic:', top_words)