In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')
import dill
import os
import sys
import re
# import matplotlib.pyplot as plt

from sklearn import base
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split, GridSearchCV

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# %matplotlib inline

In [4]:
class TextPreProcess(base.BaseEstimator, base.TransformerMixin):
    """
    Input  : document list
    Purpose: preprocess text (lowercase, regex substitution, tokenize,
             remove English stopwords, and lemmatize)
    Output : list of preprocessed strings (tokens joined by single spaces)

    Parameters
    ----------
    ignore : dict or None
        Mapping of regex pattern -> replacement string. Every pair is applied
        with ``re.sub`` to the lowercased text before tokenizing. ``None``
        (the default) means no substitutions are made.
    """

    def __init__(self, ignore=None):
        # BaseEstimator.get_params() (used by clone() and repr()) reads each
        # __init__ argument back from an attribute with the same name, so the
        # parameter has to be stored as ``self.ignore``.
        self.ignore = ignore
        # Original attribute name; kept because _process (and possibly other
        # code) reads the substitutions from here.
        self.replace = ignore
        self.en_stop = set(stopwords.words('english')) # English stop words list
        self.tokenizer = RegexpTokenizer(r'[a-z]+&?[a-z]+')
        self.lemmatizer = WordNetLemmatizer()

    def _process(self, text):
        """Clean a single document and return it as one space-joined string."""
        raw = text.lower()
        # ``or {}`` lets an instance built with ignore=None skip substitutions.
        for pattern, replacement in (self.replace or {}).items():
            raw = re.sub(pattern, replacement, raw)
        tokens = self.tokenizer.tokenize(raw)
        stopped_tokens = [tok for tok in tokens if tok not in self.en_stop]
        lemma_tokens = [self.lemmatizer.lemmatize(tok) for tok in stopped_tokens]
        return ' '.join(lemma_tokens)

    def fit(self, X, y=None):
        """No-op; nothing is learned from X. Returns self."""
        return self

    def transform(self, X):
        """Apply ``_process`` to every document in X and return the results as a list."""
        return [self._process(text) for text in X]

def _partial_fit(transformer, estimator, X, y, **kwargs):
    X = transformer.transform(X)
    estimator.partial_fit(X, y, **kwargs)
    del X, y
    return None

def get_weekly_models(files, transformer, estimators, classes, save=False, filename='models/cnb_unweighted/weekly_estimators.pkd'):
    """
    Incrementally train one estimator per week from a list of feather files.

    Each file is read with ``pd.read_feather``, indexed by ``posted_date`` and
    grouped with ``pd.Grouper(freq='W')``. For every non-empty weekly group the
    ``title`` column is transformed and passed, together with ``region``, to
    ``estimators[week].partial_fit``.

    Input  : files       - iterable of feather file paths
             transformer - object exposing ``transform(X)``
             estimators  - mapping of week label -> estimator exposing
                           ``partial_fit``; updated in place
             classes     - forwarded to every ``partial_fit`` call as ``classes``
             save        - when True, dill-dump ``estimators`` to ``filename``
             filename    - output path used when ``save`` is True
    Output : the same ``estimators`` mapping that was passed in
    """
    import warnings

    for path in tqdm(files, desc='feathers'):
        df = pd.read_feather(path)
        groups = df.set_index('posted_date').groupby(pd.Grouper(freq='W'))
        for week, data in tqdm(groups, desc='partial_fit'):
            # Weekly bins with no rows have nothing to learn from; skip them
            # explicitly instead of relying on partial_fit to raise.
            if data.empty:
                continue
            try:
                _partial_fit(transformer, estimators[week], X=data['title'], y=data['region'], classes=classes)
            except ValueError as err:
                # Stay best-effort (one bad week does not abort the run) but
                # make the skipped week visible instead of swallowing it.
                warnings.warn(
                    'partial_fit skipped for week {} in {}: {}'.format(week, path, err),
                    RuntimeWarning,
                )
    if save:
        out_dir = os.path.dirname(filename)
        if out_dir:
            # Avoid losing a long training run to a missing output folder.
            os.makedirs(out_dir, exist_ok=True)
        with open(filename, 'wb') as out:
            dill.dump(estimators, out)
    return estimators

In [5]:
# NOTE: dill.load can execute arbitrary code embedded in the file; only load
# artifacts that were produced by this project.
with open('grouped/weeks.pkd', 'rb') as file:
    weeks = dill.load(file)

with open('models/tfidf/top_tfidf.pkd', 'rb') as file:
    tfv = dill.load(file)

classes = pd.read_feather('other_data/us_states.feather', columns=['Abrv'])

# Weights as a plain list ordered by region label.
# NOTE(review): presumably sorted so the priors line up with the estimator's
# sorted class labels - confirm the region labels match `classes`.
weights = (
    pd.read_feather('other_data/census.feather', columns=['region', 'weight'])
    .set_index('region')
    .sort_index()['weight']
    .tolist()
)

folder = 'cleaned_cache'
# Collect only real data_<n>.feather chunks, in numeric order. Counting every
# directory entry would yield paths that do not exist as soon as the folder
# holds any other file (hidden files, checkpoints, ...).
files = [
    os.path.join(folder, name)
    for name in sorted(
        (entry for entry in os.listdir(folder) if re.fullmatch(r'data_\d+\.feather', entry)),
        key=lambda entry: int(entry[len('data_'):-len('.feather')]),
    )
]

In [6]:
%%time

est = {week: MultinomialNB(alpha=1, class_prior=weights) for week in weeks}
estimators = get_weekly_models(files, tfv, est, classes, save=True, filename='models/mnb_weighted/weekly_estimators.pkd')

HBox(children=(IntProgress(value=0, description='feathers', max=41, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='partial_fit', max=76, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=45, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=71, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=54, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=51, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=71, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=45, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=45, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=76, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=76, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=45, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=75, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=50, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='partial_fit', max=78, style=ProgressStyle(description_width='…


CPU times: user 21min 43s, sys: 56.2 s, total: 22min 39s
Wall time: 18min 50s


## Plotting

In [2]:
# Plotly's offline API: figures are rendered inside the notebook via py.iplot.
import plotly.offline as py
import plotly.graph_objs as go

# Must run before any py.iplot call in this notebook session.
# NOTE(review): connected=True is understood to load plotly.js from a CDN
# instead of embedding it in the notebook - confirm for the installed version.
py.init_notebook_mode(connected=True)

In [10]:
# Load the previously trained per-week estimators.
with open('models/mnb_unweighted/top_vocabulary/weekly_estimators.pkd', 'rb') as file:
    estimators = dill.load(file)

# Regex -> replacement table used by the text cleaning step.
ignore = pd.read_feather('other_data/ignore.feather')
ignore_dict = ignore.set_index('regex').to_dict()['sub']

# One end-to-end pipeline per estimator: clean text -> tf-idf -> naive Bayes.
pipes = {
    week: Pipeline([
        ('clean', TextPreProcess(ignore=ignore_dict)),
        ('tfidf', tfv),
        ('nb', model),
    ])
    for week, model in estimators.items()
}

In [11]:
text = [input('job keywords: ')]
tpp = TextPreProcess(ignore=ignore_dict)
cleaned = tpp.transform(text)
# Terms of the query that the tf-idf vocabulary recognises; used as the title.
keywords = tfv.inverse_transform(tfv.transform(cleaned))

# Slider position (and the single trace) shown when the figure first renders.
# Clamped so that fewer than 11 weekly models does not raise an IndexError.
n_traces = len(pipes)
active = max(0, min(10, n_traces - 1))

# Add traces, one for each slider step
data = []
steps = []
for i, (date, pipe) in enumerate(pipes.items()):
    label = date.strftime('%Y-%m-%d')
    plot = go.Choropleth(locations=pipe.named_steps['nb'].classes_,
                         z=pipe.predict_proba(text)[0],
                         zmin=0,
                         zmax=1,
                         locationmode='USA-states',
                         colorscale='Blues',
                         name=label,
                         # Only the active week starts visible; otherwise every
                         # weekly map is drawn on top of the others until the
                         # slider is moved.
                         visible=(i == active),
                         colorbar=dict(title='Proportion',
                                       titleside='top'
                                      )
                        )
    # Each step hides every trace except its own.
    visibility = [False] * n_traces
    visibility[i] = True
    step = dict(method="restyle",
                args=["visible", visibility],
                label=label
               )
    data.append(plot)
    steps.append(step)

sliders = [dict(active=active,
                currentvalue={"prefix": "Week of: "},
                steps=steps
               )
          ]
layout = go.Layout(geo_scope='usa', title=', '.join(keywords[0]), sliders=sliders,
                   paper_bgcolor='rgba(0,0,0,0)',
                   plot_bgcolor='rgba(0,0,0,0)')

fig = go.Figure(data=data, layout=layout)

# Export alternatives (standalone HTML file / embeddable <div>):
# py.plot(fig, filename='plots/usa_total_posts.html', auto_open=False)
# s = py.plot(fig, include_plotlyjs=False, output_type='div')
py.iplot(fig)

job keywords: data scientist
