In [None]:
import sqlite3 as sql
import pandas as pd
import numpy as np
import logging
import time
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
from gensim.matutils import sparse2full 
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

In [None]:
# Path of the SQLite database file that the queries in this notebook are run against.
local_db = '/Data/samples/wiki/enwiki_articles_20200520.db'

In [None]:
# NLTK's English stop-word list, stored as a set; Corpus.tokenize tests words against it.
STOPWORDS = set(stopwords.words('english'))
# Matches a (possibly empty) run of the characters / ( ) { } [ ] | @ , ; that is
# immediately followed by an underscore. Written as a raw string so that the regex
# escapes \[ \] \| are not parsed as (invalid) Python string escapes.
# NOTE(review): despite the name, Corpus.tokenize substitutes matches with '' rather
# than a space — confirm that is the intended behaviour.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]*_')
# One shared WordNet lemmatizer instance; Corpus.tokenize calls lemmatizer.lemmatize(word, 'v').
lemmatizer = WordNetLemmatizer()

In [None]:
def get_query(query, db):
    """Run ``query`` against the SQLite file ``db`` and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement passed to ``pandas.read_sql_query``.
    db : str
        Path handed to ``sqlite3.connect``.

    Returns
    -------
    pandas.DataFrame
        The query result with every column name converted to a lower-case string.
    """
    # A sqlite3 connection used as a context manager only commits/rolls back the
    # transaction; it does not close the connection. Close it explicitly so no
    # handle on the database file is left open after the query.
    conn = sql.connect(db)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    df.columns = [str(col).lower() for col in df.columns]
    return df

In [None]:
# Load ten rows of the articles table and display the resulting frame.
sample = 'SELECT * FROM articles LIMIT 10'
df = get_query(query=sample, db=local_db)
df

In [None]:
# Show the text column of the row labelled 0 in the sample frame.
df.loc[0, 'text']

In [None]:
class Corpus():
    """Streamed bag-of-words corpus read row by row from the ``articles`` table.

    Every pass over the corpus opens its own SQLite connection to ``db``, looks up
    the ``text`` column for each id in ``row_ids``, tokenizes it and yields
    ``dictionary.doc2bow(tokens)``, so one document is processed at a time.

    Parameters
    ----------
    row_ids : sequence of int
        ``rowid`` values of the articles to include.
    db : str
        Path handed to ``sqlite3.connect``.
    dictionary : object providing ``doc2bow(tokens)``
        Converts each token list into the yielded document.
    shuffle : bool, default True
        Visit ``row_ids`` in random order on every pass (the original behaviour).
        ``False`` keeps the order of ``row_ids``.
    seed : int or None, default None
        ``None`` draws the shuffle from NumPy's global random state, as before.
        An integer makes every pass use the same, reproducible order.
    """

    def __init__(self, row_ids, db, dictionary, shuffle=True, seed=None):
        self.row_ids = row_ids
        self.db = db
        self.dictionary = dictionary
        self.len = len(row_ids)
        self.shuffle = shuffle
        self.seed = seed

    def __iter__(self):
        """Yield one ``doc2bow`` result per row id.

        A row whose text is NULL yields ``doc2bow([])`` so the number of yielded
        documents still equals ``len(self)``.
        """
        if self.shuffle:
            # np.random (global state) and RandomState expose the same choice() API.
            rng = np.random if self.seed is None else np.random.RandomState(self.seed)
            ordered_ids = rng.choice(self.row_ids, self.len, replace=False)
        else:
            ordered_ids = self.row_ids

        # Parameterized query: the row id is bound by sqlite3, not formatted into the SQL.
        select = '''SELECT text FROM articles where rowid=?'''

        # "with sqlite3.connect(...)" would not close the connection, so close it
        # explicitly; the finally clause also runs when the generator is abandoned
        # early and later closed or garbage-collected.
        conn = sql.connect(self.db)
        try:
            for row_id in ordered_ids:
                # int() because sqlite3 cannot bind NumPy integer scalars.
                doc = self.get_query(select, conn, params=(int(row_id),))
                tokens = [] if pd.isna(doc) else self.tokenize(doc)
                yield self.dictionary.doc2bow(tokens)
        finally:
            conn.close()

    def __len__(self):
        """Return the number of row ids given at construction time."""
        return self.len

    def get_query(self, select, conn, params=None):
        """Run ``select`` on ``conn`` and return the ``text`` value of the first row.

        Column names are lower-cased before the ``text`` column is read.
        ``params`` is forwarded to ``pandas.read_sql_query`` for placeholder binding.

        Raises
        ------
        IndexError
            If the query returns no rows.
        KeyError
            If the result has no column named ``text`` (case-insensitive).
        """
        df = pd.read_sql_query(select, conn, params=params)
        df.columns = [str(col).lower() for col in df.columns]
        return df['text'].values[0]

    def tokenize(self, text):
        """Turn raw article text into a list of lower-case alphabetic tokens.

        Steps: delete matches of ``REPLACE_BY_SPACE_RE``, lower-case, drop
        whitespace-separated words found in ``STOPWORDS``, lemmatize the remaining
        words as verbs, then keep every run of three or more letters a-z.

        NOTE(review): stop words are compared against whitespace-split words, so a
        word with punctuation attached (e.g. "the,") is presumably not filtered and
        its letters still reach the final regex. Left unchanged because the saved
        dictionaries were presumably built with the same tokenization — confirm
        before altering.
        """
        text = REPLACE_BY_SPACE_RE.sub('', text).lower()
        words = [
            lemmatizer.lemmatize(word, 'v')
            for word in text.split()
            if word not in STOPWORDS
        ]
        return re.findall('''[a-z]{3,}''', ' '.join(words))

In [None]:
# Fetch the rowid of every row in the articles table as a plain Python list.
rowids = (
    get_query('SELECT rowid FROM articles', local_db)
    ['rowid']
    .tolist()
)

In [None]:
# Split the row ids 80/20 into training and holdout lists; the fixed random_state
# makes the split repeatable for the same rowids input.
train_ids, holdout_ids = train_test_split(rowids, train_size=0.8, random_state=123)

In [None]:
# Number of row ids fetched from the articles table (before the train/holdout split).
len(rowids)

In [None]:
# First five training row ids.
train_ids[:5]

In [None]:
# First five holdout row ids.
holdout_ids[:5]

In [None]:
# Load the previously saved training-set dictionary from disk.
# Dictionary.load is a classmethod, so it is called on the class directly instead
# of on a throwaway empty Dictionary() instance.
train_dict = Dictionary.load('../output/train_set_wiki_dictionary_filtered_no_hyphens.dict')

In [None]:
# Load the previously saved holdout-set dictionary from disk.
# Dictionary.load is a classmethod, so it is called on the class directly instead
# of on a throwaway empty Dictionary() instance.
holdout_dict = Dictionary.load('../output/holdout_set_wiki_dictionary_filtered_no_hyphens.dict')

In [None]:
# for k, v in holdout_dict.token2id.items():
#     print(k, v)

In [None]:
# Streamed corpus over the holdout row ids, encoded with the holdout dictionary.
# NOTE(review): the holdout corpus uses holdout_dict while the training corpus uses
# train_dict, so token ids presumably do not line up between the two corpora —
# confirm this is intended before feeding the holdout corpus to a model trained on
# the training corpus.
holdout_corpus = Corpus(holdout_ids, local_db, holdout_dict)

In [None]:
# Smoke test: produce a single document from the corpus.
# Corpus.__iter__ shuffles the row ids, so which article is returned varies between runs.
next(iter(holdout_corpus))

In [None]:
# Write the holdout corpus to disk via MmCorpus.serialize. This iterates the whole
# corpus, i.e. one SQLite lookup plus tokenization per holdout row id.
# NOTE(review): the output path is absolute and machine-specific.
MmCorpus.serialize('/Data/Corpora/holdout_set_wiki_corpus_no_hyphens.mm', holdout_corpus)

In [None]:
# Streamed corpus over the training row ids, encoded with the training dictionary.
train_corpus = Corpus(train_ids, local_db, train_dict)

In [None]:
# Write the training corpus to disk via MmCorpus.serialize. This iterates the whole
# corpus, i.e. one SQLite lookup plus tokenization per training row id.
# NOTE(review): the output path is absolute and machine-specific.
MmCorpus.serialize('/Data/Corpora/train_set_wiki_corpus_no_hyphens.mm', train_corpus)