In [None]:
import sqlite3 as sql
import pandas as pd
import numpy as np
import logging
import time
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [None]:
# Fetch the NLTK data this notebook relies on: the English stopword list
# (read below via stopwords.words) and WordNet (presumably the data behind
# WordNetLemmatizer -- confirm against the NLTK docs).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from gensim.matutils import sparse2full 
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [None]:
# Path of the SQLite database that every query in this notebook reads from.
local_db = '/Data/samples/wiki/enwiki_articles_20200520.db'

In [None]:
# English stopwords, held in a set for fast membership checks during tokenizing.
STOPWORDS = set(stopwords.words('english'))
# Raw string literal: the previous non-raw literal contained the invalid escape
# sequences \[ \] and \| (a DeprecationWarning, and a SyntaxWarning on newer
# Pythons). The compiled pattern is exactly the same as before.
# NOTE(review): as written, this only matches an underscore, optionally
# preceded by a run of the listed punctuation, and the tokenizer substitutes
# '' rather than a space despite the name -- confirm this is intended.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]*_')
# Shared lemmatizer instance used by the tokenizer.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
def get_query(query, db):
    """Run a SQL query against a SQLite database file and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement passed to ``pandas.read_sql_query``.
    db : str
        Database path handed to ``sqlite3.connect``.

    Returns
    -------
    pandas.DataFrame
        The query result, with every column name converted to a lower-case
        string.
    """
    conn = sql.connect(db)
    try:
        # ``with conn`` only commits / rolls back the transaction; it does NOT
        # close the connection, hence the explicit close() in ``finally``.
        with conn:
            df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    df.columns = [str(col).lower() for col in df.columns]
    return df

In [None]:
# Pull a handful of rows to eyeball the table's columns and contents.
sample = 'SELECT * FROM articles LIMIT 10'
df = get_query(query=sample, db=local_db)
df

In [None]:
# Show the text of the first sampled article.
df.loc[0, 'text']

In [None]:
class Corpus():
    """Re-iterable stream of tokenized article texts read from SQLite.

    Every pass over the object visits each id in ``row_ids`` once, in a newly
    shuffled order, reads that row's ``text`` from the ``articles`` table and
    yields it as a list of tokens.

    Parameters
    ----------
    row_ids : sequence of int
        ``rowid`` values of the articles to stream.
    db : str
        Database path handed to ``sqlite3.connect``.
    """

    def __init__(self, row_ids, db):
        self.row_ids = row_ids
        self.db = db
        self.len = len(row_ids)

    def __iter__(self):
        """Yield one token list per row id, opening a fresh connection per pass."""
        # Drawing all ids without replacement amounts to a shuffle. It uses
        # NumPy's global RNG, so the order changes between passes unless the
        # caller seeds it.
        row_ids_shuffled = np.random.choice(self.row_ids, self.len, replace=False)
        conn = sql.connect(self.db)
        try:
            # ``with conn`` only manages the transaction; the connection itself
            # is closed in ``finally`` so it is released when the pass finishes
            # or the generator is abandoned early.
            with conn:
                for row_id in row_ids_shuffled:
                    select = '''SELECT text FROM articles where rowid=%d''' % row_id
                    doc = self.get_query(select, conn)
                    yield self.tokenize(doc)
        finally:
            conn.close()

    def __len__(self):
        """Number of row ids (documents) in this corpus."""
        return self.len

    def get_query(self, select, conn):
        """Run ``select`` on the open connection and return the first ``text`` value."""
        df = pd.read_sql_query(select, conn)
        df.columns = [str(col).lower() for col in df.columns]
        return df['text'].values[0]

    def tokenize(self, text):
        """Turn raw article text into a list of lower-case tokens.

        Steps, in order: strip ``REPLACE_BY_SPACE_RE`` matches, lower-case,
        drop whitespace-separated words found in ``STOPWORDS``, lemmatize the
        remaining words as verbs, then keep runs of at least three characters
        drawn from ``a-z`` and ``-``.

        Returns an empty list for ``None`` or empty text.

        NOTE(review): stopword removal and lemmatization act on
        whitespace-separated chunks *before* punctuation is stripped by the
        final regex, so a chunk such as ``"the,"`` is not recognised as a
        stopword. Left unchanged to keep the produced vocabulary stable.
        """
        if not text:
            # A NULL (None) or empty article becomes an empty document rather
            # than raising a TypeError inside the regex substitution.
            return []
        text = REPLACE_BY_SPACE_RE.sub('', text)
        text = text.lower()
        # One pass over the words: filter stopwords and lemmatize together.
        words = [
            lemmatizer.lemmatize(word, 'v')
            for word in text.split()
            if word not in STOPWORDS
        ]
        return re.findall('''[a-z-]{3,}''', ' '.join(words))

In [None]:
# Collect the rowid of every article as a plain Python list.
rowids = (
    get_query('SELECT rowid FROM articles', local_db)
    .loc[:, 'rowid']
    .tolist()
)

In [None]:
# 80/20 split of the row ids; the fixed random_state pins the split.
train_ids, holdout_ids = train_test_split(
    rowids,
    train_size=0.8,
    random_state=123,
)

In [None]:
# Peek at the first five row ids assigned to the training split.
train_ids[:5]

In [None]:
# Peek at the first five row ids assigned to the holdout split.
holdout_ids[:5]

Holdout split dictionary

In [None]:
# Build the holdout vocabulary by streaming the holdout articles once.
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# is wall-clock and can jump if the system clock is adjusted mid-run.
# NOTE(review): prune_at is set very high, presumably so no tokens are pruned
# while scanning -- confirm against the gensim Dictionary docs.
start = time.perf_counter()
dictionary = Dictionary(Corpus(holdout_ids, local_db), prune_at=1000000000)
end = time.perf_counter()
print('Time to train dictionary from generator: %0.2fs' % (end - start))

In [None]:
# Vocabulary size of the unfiltered holdout dictionary.
len(dictionary)

In [None]:
# Pair each token with the share of holdout documents that contain it.
vocab = [
    (dictionary[token_id], doc_count / len(holdout_ids))
    for token_id, doc_count in dictionary.dfs.items()
]

In [None]:
# Ten tokens with the highest document share, largest first
# (ascending sort read backwards from the end).
sorted(vocab, key=lambda entry: entry[1])[:-11:-1]

In [None]:
# Persist the unfiltered holdout dictionary so it can be reloaded without
# re-scanning the articles.
dictionary.save('../output/holdout_set_wiki_dictionary.dict')

In [None]:
# Reload the saved holdout dictionary. load() is a classmethod, so it is
# called on the class directly instead of on a throwaway empty instance.
holdout_dict = Dictionary.load('../output/holdout_set_wiki_dictionary.dict')

In [None]:
# Pair each token with the share of holdout documents that contain it,
# this time from the reloaded dictionary.
vocab = [
    (holdout_dict[token_id], doc_count / len(holdout_ids))
    for token_id, doc_count in holdout_dict.dfs.items()
]

In [None]:
# Ten tokens with the highest document share, largest first
# (ascending sort read backwards from the end).
sorted(vocab, key=lambda entry: entry[1])[:-11:-1]

In [None]:
# Prune the holdout vocabulary in place. Per gensim's filter_extremes, tokens
# in fewer than no_below documents or in more than the no_above fraction of
# documents are dropped; keep_n is large so it should not cap the vocabulary.
holdout_dict.filter_extremes(
    no_below=2,
    no_above=0.5,
    keep_n=100000000,
)

In [None]:
# Vocabulary size of the holdout dictionary after filtering.
len(holdout_dict)

In [None]:
# Persist the filtered holdout dictionary under a separate file name.
holdout_dict.save('../output/holdout_set_wiki_dictionary_filtered.dict')

Training split dictionary

In [None]:
# Build the training vocabulary by streaming the training articles once.
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# is wall-clock and can jump if the system clock is adjusted mid-run.
# NOTE(review): prune_at is set very high, presumably so no tokens are pruned
# while scanning -- confirm against the gensim Dictionary docs.
start = time.perf_counter()
dictionary = Dictionary(Corpus(train_ids, local_db), prune_at=1000000000)
end = time.perf_counter()
print('Time to train dictionary from generator: %0.2fs' % (end - start))

In [None]:
# Vocabulary size of the unfiltered training dictionary.
len(dictionary)

In [None]:
# Persist the unfiltered training dictionary so it can be reloaded without
# re-scanning the articles.
dictionary.save('../output/train_set_wiki_dictionary.dict')

In [None]:
# Reload the saved training dictionary. load() is a classmethod, so it is
# called on the class directly instead of on a throwaway empty instance.
train_dict = Dictionary.load('../output/train_set_wiki_dictionary.dict')

In [None]:
# Pair each token with the share of training documents that contain it.
vocab = [
    (train_dict[token_id], doc_count / len(train_ids))
    for token_id, doc_count in train_dict.dfs.items()
]

In [None]:
# Ten tokens with the highest document share, largest first
# (ascending sort read backwards from the end).
sorted(vocab, key=lambda entry: entry[1])[:-11:-1]

In [None]:
# Prune the training vocabulary in place. Per gensim's filter_extremes, tokens
# in fewer than no_below documents or in more than the no_above fraction of
# documents are dropped; keep_n is large so it should not cap the vocabulary.
train_dict.filter_extremes(
    no_below=2,
    no_above=0.5,
    keep_n=100000000,
)

In [None]:
# Recompute the document shares after filtering and show the ten largest,
# highest first (ascending sort read backwards from the end).
vocab = [
    (train_dict[token_id], doc_count / len(train_ids))
    for token_id, doc_count in train_dict.dfs.items()
]
sorted(vocab, key=lambda entry: entry[1])[:-11:-1]

In [None]:
# Vocabulary size of the training dictionary after filtering.
len(train_dict)

In [None]:
# Persist the filtered training dictionary under a separate file name.
train_dict.save('../output/train_set_wiki_dictionary_filtered.dict')

In [None]:
# Build the TF-IDF model from the filtered training dictionary alone; no
# corpus is passed here, only `dictionary=`.
tfidf = TfidfModel(dictionary=train_dict)

In [None]:
# Persist the TF-IDF model to disk.
tfidf.save('../output/tfidf.model')