In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from random import sample
import dill
import re
import os

# Parse and cache .zip files

In [2]:
def _no_location(df):
    truth = np.array(df['region'].isna().tolist() and df['locality'].isna().tolist())
    idx = df[truth].index
    df.drop(idx, inplace=True)
    return None

def _abrv_states(df):
    df['region'] = df['region'].str.upper().replace(states_dict)
    return None

def _in_usa(df):
    truth = df[['region']].isin(states_dict.values())['region']
    idx = df[~truth].index
    df.drop(idx, inplace=True)
    return None

def _has_title(df):
    df.dropna(subset=['title'], inplace=True)
    return None

def _combine_dates(df):
    df['posted_date'].fillna(df['date_added'], inplace=True)
    df.drop('date_added', axis=1, inplace=True)
    df.rename(columns={'posted_date': 'date'}, inplace=True)
    return None

def _has_dates(df, columns):
    df.dropna(subset=columns, how='all', inplace=True)
    return None

def _date_parser(s):
    output = pd.to_datetime(s, format='%Y-%m-%d', errors='coerce')
    return output

def _clean_and_save_chunk(file, num=0, **kwargs):
    for chunk in pd.read_csv(file, **kwargs):
        _has_title(chunk)
        _has_dates(chunk, columns=date_cols)
        _abrv_states(chunk)
        _in_usa(chunk)
        chunk.reset_index(drop=True).to_feather('raw_cache/data_{}.feather'.format(num))
        num += 1
    return num

def cache_files(files, num=0, **kwargs):
    for file in tqdm(files, desc='zip files'):
        num = _clean_and_save_chunk(file, num=num, **kwargs)
    return None

In [27]:
pop = pd.read_feather('other_data/census.feather')
states = pd.read_feather('other_data/us_states.feather')
states_dict = states.set_index('STATE').to_dict()['Abrv']

zip_columns = ['title', 'brand', 'category', 'locality', 'region', 'date_added', 'posted_date']
date_cols = ['date_added', 'posted_date']

start = pd.datetime(2017, 12, 1)
end = pd.datetime(2018, 7, 21)

In [106]:
folder = 'raw_zips'
files = [os.path.join(folder, file) for file in os.listdir(folder)]
cache_files(files, usecols=zip_columns, chunksize=1e7, compression='infer', dtype=str, parse_dates=date_cols, date_parser=_date_parser)

HBox(children=(IntProgress(value=0, description='zip files', max=7, style=ProgressStyle(description_width='ini…

# Group data

Group by day, week, state, and city over time and cache

In [2]:
def _get_df(file, **kwargs):
    df = pd.read_feather(file, **kwargs)
    _within_range(df)
    df.dropna(subset=['posted_date'], inplace=True)
#     _add_day_of_week(df)
    return df

def _within_range(df):
    start = pd.datetime(2017, 1, 1)
    end = pd.datetime(2018, 7, 1)
    truth = ~df['posted_date'].isin(pd.date_range(start, end))
    df.drop(df[truth].index, inplace=True)

def _add_day_of_week(df):
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
    df['day_of_week'] = pd.Categorical(df['posted_date'].dt.day_name(), categories=days, ordered=True)
    return None

In [18]:
job_counts = None
folder = 'raw_cache'
files = [os.path.join(folder, file) for file in os.listdir(folder) if file.endswith('.feather')]

for file in tqdm(files, desc='feather_files'):
    df = _get_df(file, columns=['locality', 'region', 'posted_date'])
    df['posts'] = 1
    grouped = df.groupby(['locality', 'region', 'posted_date']).sum()
    if job_counts is None:
        job_counts = pd.Series()
        job_counts.name = 'posts'
        job_counts = job_counts.add(grouped['posts'], level='locality', fill_value=0)
    else:
        job_counts = job_counts.add(grouped['posts'], fill_value=0)
        
job_counts = job_counts.reset_index()
job_counts['posts'] = job_counts['posts'].astype(int)
job_counts.to_feather('grouped/job_counts.feather')

HBox(children=(IntProgress(value=0, description='feather_files', max=41, style=ProgressStyle(description_width…




# Dimensionality reduction

I want to apply SVD onto a sparse matrix of counted terms from `CountVectorizer` to get the principle axes, but `TruncatedSVD` from `sklearn` is too memory intensive and takes a long time. A solution is to possibly find an on-line algorithm that is something like gradient descent for SVD. I found a couple of resources to go through for this:

- [stack exchange](https://stats.stackexchange.com/questions/177007/updating-svd-decomposition-after-adding-one-new-row-to-the-matrix)
- [gensim](https://pypi.org/project/gensim/)
- [surprise](http://surpriselib.com/)
- [sparsesvd](https://pypi.org/project/sparsesvd/)

I'm leaning towards `gensim` at the moment, there are good resources for it and it seems like it is widely used for this expressed purpose, in particular, the `Latent Semantic Indexing` transformation. [Data Camp](https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python) has a resource outlining this exact procedure.

In [3]:
from sklearn import base
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from gensim import corpora
from gensim.models import LsiModel
from nltk import download
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

## Process titles

In [27]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    lemmatizer = WordNetLemmatizer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [lemmatizer.lemmatize(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary, doc_term_matrix

def create_gensim_lsa_model(doc_clean, number_of_topics):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
#     print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [17]:
ignore = pd.read_feather('other_data/ignore.feather')
ignore = ignore.set_index('regex').to_dict()['sub']

def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        yield l[i:i+n]

class TextPreProcess(base.BaseEstimator, base.TransformerMixin):
    """
    Input  : document list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    
    def __init__(self, ignore):
        self.en_stop = set(stopwords.words('english')) # English stop words list
        self.tokenizer = RegexpTokenizer(r'[a-z]+&?[a-z]+')
        self.lemmatizer = WordNetLemmatizer()
        self.replace = ignore
    
    def _process(self, text):
        raw = text.lower()
        for key, val in self.replace.items():
            raw = re.sub(key, val, raw)
        tokens = self.tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in self.en_stop]
        lemma_tokens = [self.lemmatizer.lemmatize(i) for i in stopped_tokens]
        output = ' '.join(lemma_tokens)
        return output
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        output = X.apply(self._process)
        return output

## Clean and cache titles

In [25]:
folder = 'raw_cache'
files = [os.path.join(folder, file) for file in os.listdir(folder) if file.endswith('.feather')]

tpp = TextPreProcess(ignore)

i = 0
for file in tqdm(files, desc='clean titles'):
    df = _get_df(file, columns=['title', 'region', 'posted_date'])
    df['title'] = tpp.fit_transform(df['title'])
    df.reset_index(drop=True).to_feather('cleaned_titles/titles_{}.feather'.format(i))
    i += 1

HBox(children=(IntProgress(value=0, description='clean titles', max=41, style=ProgressStyle(description_width=…

In [74]:
def get_dictionary(files)
    dictionary = corpora.Dictionary(prune_at=5000)
    for file in tqdm(files, desc='produce dictionary'):
        df = pd.read_feather(file, columns=['title'])
        text = df['title'].str.split()
        dictionary.add_documents(text, prune_at=5000)

    dictionary.filter_tokens(bad_ids=['nd', 'rd', 'st'])
    dictionary.compactify()

    return dictionary

HBox(children=(IntProgress(value=0, description='produce dictionary', max=41, style=ProgressStyle(description_…

  labels, = index.labels


In [72]:
folder = 'title_cache'
files = [os.path.join(folder, file) for file in os.listdir(folder) if file.endswith('.feather')]

dictionary = get_dictionary(files)

In [77]:
dictionary.save_as_text('transforms/dictionary.bin', sort_by_word=False)

In [79]:
len(dictionary)

5422