In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')
import dill
import os
import sys
import re
# import matplotlib.pyplot as plt

from sklearn import base
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split, GridSearchCV

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# %matplotlib inline

In [11]:
class TextPreProcess(base.BaseEstimator, base.TransformerMixin):
    """
    Input  : document list (iterable of strings)
    Purpose: preprocess text (lowercase, regex replacement, tokenize,
             remove English stopwords, and lemmatize)
    Output : list of preprocessed strings (tokens joined by single spaces)

    Parameters
    ----------
    ignore : dict or None
        Mapping of regex pattern -> replacement string. Each pair is applied
        with ``re.sub`` to the lower-cased text before tokenizing. ``None``
        (the default) means no replacements are made.
    """

    def __init__(self, ignore=None):
        # scikit-learn's get_params() / clone() / repr read every constructor
        # argument back from an attribute of the same name, so the value must
        # be stored as `ignore`.
        self.ignore = ignore
        # Alias kept for code that still reads the attribute under its old name.
        self.replace = ignore
        self.en_stop = set(stopwords.words('english'))  # English stop words list
        # A token is a run of letters with at most one '&' inside it; the
        # pattern needs a letter on each side, so one-letter words are dropped.
        self.tokenizer = RegexpTokenizer(r'[a-z]+&?[a-z]+')
        self.lemmatizer = WordNetLemmatizer()

    def _process(self, text):
        """Return the cleaned, space-joined token string for one document."""
        raw = text.lower()
        # Prefer `ignore`; fall back to `replace` so an instance that only
        # carries the older attribute name (e.g. one restored from an earlier
        # pickle) keeps working.
        replacements = getattr(self, 'ignore', None)
        if replacements is None:
            replacements = getattr(self, 'replace', None)
        for pattern, substitute in (replacements or {}).items():
            raw = re.sub(pattern, substitute, raw)
        tokens = self.tokenizer.tokenize(raw)
        kept = [tok for tok in tokens if tok not in self.en_stop]
        return ' '.join(self.lemmatizer.lemmatize(tok) for tok in kept)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Apply `_process` to every document in X and return the results as a list."""
        return [self._process(text) for text in X]

def _partial_fit(transformer, estimator, X, y, **kwargs):
    X = transformer.transform(X)
    estimator.partial_fit(X, y, **kwargs)
    del X, y
    return None

def fit_model(files, transformer, estimator, classes, save=False, filename='models/mnb_unweighted/estimator.pkd'):
    """
    Incrementally train `estimator` on a sequence of feather files.

    Each file is read with ``pd.read_feather``. Its 'title' column is run
    through ``transformer.transform`` and, with the 'region' column as
    labels, passed to ``estimator.partial_fit``.

    Parameters
    ----------
    files : iterable of str
        Paths of the feather files, processed in the given order.
    transformer : object
        Must provide ``transform(X)``.
    estimator : object
        Must provide ``partial_fit(X, y, classes=...)``.
    classes : array-like
        Forwarded unchanged as the ``classes`` keyword of every
        ``partial_fit`` call.
    save : bool
        When true, the estimator is written to `filename` with dill after
        the last file has been processed.
    filename : str
        Destination of the dill dump; missing parent directories are created.

    Returns
    -------
    The same `estimator` object that was passed in.
    """
    for path in tqdm(files, desc='feathers'):
        # Only these two columns are used, so skip reading the rest.
        df = pd.read_feather(path, columns=['title', 'region'])
        _partial_fit(transformer, estimator, X=df['title'], y=df['region'], classes=classes)
    if save:
        # Create the target directory up front: without it open() raises only
        # after the whole training loop has finished, losing the fitted model.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(filename, 'wb') as out:
            dill.dump(estimator, out)
    return estimator

In [12]:
# Load the pickled artefacts this notebook builds on.
# NOTE: dill, like pickle, can run arbitrary code while loading; only open
# files that come from a trusted source.
with open('grouped/weeks.pkd', 'rb') as file:
    weeks = dill.load(file)

with open('models/tfidf/top_tfidf.pkd', 'rb') as file:
    tfv = dill.load(file)

# NOTE(review): this is a one-column DataFrame, whereas scikit-learn documents
# the `classes` argument of partial_fit as a 1-D array-like; consider passing
# classes['Abrv'] instead -- confirm before changing.
classes = pd.read_feather('other_data/us_states.feather', columns=['Abrv'])

# 'weight' values as a plain list, ordered by sorted 'region' label.
weights = (
    pd.read_feather('other_data/census.feather', columns=['region', 'weight'])
    .set_index('region')
    .sort_index()['weight']
    .tolist()
)

folder = 'cleaned_cache'
# Keep only entries named data_<n>.feather, ordered by <n>. Counting every
# directory entry instead would produce paths to non-existent shards as soon
# as the folder holds a stray file or the numbering has a gap.
shard_pattern = re.compile(r'data_(\d+)\.feather')
shards = sorted(
    (int(match.group(1)), match.group(0))
    for match in map(shard_pattern.fullmatch, os.listdir(folder))
    if match
)
files = [os.path.join(folder, name) for _, name in shards]

In [13]:
%%time

est = MultinomialNB(alpha=1)
estimators = fit_model(files, tfv, est, classes, save=True, filename='models/mnb_unweighted/top_vocabulary/estimator.pkd')

HBox(children=(IntProgress(value=0, description='feathers', max=41, style=ProgressStyle(description_width='ini…


CPU times: user 15min 33s, sys: 42.7 s, total: 16min 16s
Wall time: 16min 33s
