In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import random
import dill
import re
import os
import matplotlib

from sklearn import base
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords

# Parse and cache .zip files

In [47]:
# Reference tables shipped with the project.
pop = pd.read_feather('other_data/census.feather')
states = pd.read_feather('other_data/us_states.feather')
# Lookup from each `STATE` value to its `Abrv` value; used to normalise `region`.
states_dict = states.set_index('STATE').to_dict()['Abrv']

# Columns read from every raw archive, and the subset of them that holds dates.
zip_columns = ['title', 'brand', 'category', 'locality', 'region', 'date_added', 'posted_date']
date_cols = ['date_added', 'posted_date']

# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0.  `pd.Timestamp`
# is a `datetime.datetime` subclass, so it is a drop-in replacement.
start = pd.Timestamp(2017, 12, 1)
end = pd.Timestamp(2018, 7, 21)

def _no_location(df):
    truth = np.array(df['region'].isna().tolist() and df['locality'].isna().tolist())
    idx = df[truth].index
    df.drop(idx, inplace=True)
    return None

def _abrv_states(df):
    """Upper-case `df['region']` in place, then map it through `states_dict`.

    Values that are not keys of `states_dict` are left as their upper-cased
    form.  Returns None.
    """
    upper_regions = df['region'].str.upper()
    df['region'] = upper_regions.replace(states_dict)

def _in_usa(df):
    """Drop, in place, rows whose `region` is not one of `states_dict`'s values.

    Returns None.
    """
    recognised = df['region'].isin(list(states_dict.values()))
    unrecognised_labels = df.index[~recognised.values]
    df.drop(unrecognised_labels, inplace=True)

def _has_title(df):
    df.dropna(subset=['title'], inplace=True)
    return None

def _combine_dates(df):
    df['posted_date'].fillna(df['date_added'], inplace=True)
    df.drop('date_added', axis=1, inplace=True)
    df.rename(columns={'posted_date': 'date'}, inplace=True)
    return None

def _has_dates(df, columns=date_cols):
    """Drop, in place, rows in which every column named in `columns` is missing.

    `columns` defaults to the module-level `date_cols`.  Returns None.
    """
    df.dropna(how='all', subset=columns, inplace=True)

def _date_parser(s):
    output = pd.to_datetime(s, format='%Y-%m-%d', errors='coerce')
    return output

def _clean_and_save_chunk(file, num=0, **kwargs):
    """Clean every chunk read from `file` and write each one to its own feather file.

    `kwargs` are forwarded to `pd.read_csv`.  Each chunk is filtered by
    `_has_title`, `_has_dates`, `_abrv_states` and `_in_usa` (in that order),
    then saved as `raw_cache/data_<n>.feather`, where `<n>` starts at `num`
    and increases by one per chunk.

    Returns the next unused file number.
    """
    next_id = num
    reader = pd.read_csv(file, **kwargs)
    for piece in reader:
        _has_title(piece)
        _has_dates(piece, columns=date_cols)
        _abrv_states(piece)
        _in_usa(piece)
        # Feather is written from a fresh 0..n-1 index after the row drops above.
        target = 'raw_cache/data_{}.feather'.format(next_id)
        piece.reset_index(drop=True).to_feather(target)
        next_id += 1
    return next_id

def cache_files(files, num=0, **kwargs):
    """Run `_clean_and_save_chunk` over every path in `files`, with a progress bar.

    The file counter starts at `num` and is carried from one input file to the
    next, so output numbering is continuous.  `kwargs` are forwarded unchanged.
    Returns None.
    """
    counter = num
    for path in tqdm(files, desc='zip files'):
        counter = _clean_and_save_chunk(path, num=counter, **kwargs)

In [106]:
folder = 'raw_zips'
# os.listdir() returns entries in arbitrary order.  Sort them so that the
# data_<n>.feather numbering produced below is the same on every run.
files = [os.path.join(folder, file) for file in sorted(os.listdir(folder))]
cache_files(
    files,
    usecols=zip_columns,
    chunksize=1e7,
    compression='infer',
    dtype=str,
    parse_dates=date_cols,
    date_parser=_date_parser,
)

HBox(children=(IntProgress(value=0, description='zip files', max=7, style=ProgressStyle(description_width='ini…

# Group data

Group by day, week, and states over time

## Load Feathers

For now, we will only focus on the first 10 files.

In [23]:
# Alternative: load every cached file instead of only the first ten.
# files = [file for file in os.listdir('raw_cache') if file.endswith('feather')]
files = ['raw_cache/data_{}.feather'.format(i) for i in range(10)]
dfs = []
for file in tqdm(files, desc='load files'):
    temp = pd.read_feather(file, columns=['title', 'locality', 'region', 'posted_date'])
    dfs.append(temp.dropna(subset=['posted_date']))
# Concatenate once, after the loop.  Doing it inside the loop rebuilds the
# whole frame on every iteration and only the last result is ever used.
df = pd.concat(dfs)
df.reset_index(drop=True, inplace=True)
df['day_of_week'] = pd.Categorical(df['posted_date'].dt.day_name(), categories= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'], ordered=True)
df.describe()

HBox(children=(IntProgress(value=0, description='load files', max=10, style=ProgressStyle(description_width='i…




Unnamed: 0,title,locality,region,posted_date,day_of_week
count,28999478,28972617,28999478,28999478,28999478
unique,349502,10094,52,1073,7
top,Sales Associate,Seattle,CA,2016-11-14 00:00:00,Friday
freq,349819,757154,4155012,745974,5460305
first,,,,2012-08-17 00:00:00,
last,,,,2019-01-23 00:00:00,


## Total number of jobs posted per day

In [48]:
# Count titles per calendar day, then flatten to a two-column frame for feather.
per_day = (
    df.set_index('posted_date')
      .groupby(pd.Grouper(freq='D'))['title']
      .count()
      .rename('posts')
      .to_frame()
      .reset_index()
)
per_day.to_feather('grouped/posts_per_day.feather')

## Weekly distribution

In [49]:
# Posts per weekday, plus each weekday's share of all posts.
week = df.groupby('day_of_week')['title'].count().rename('posts')
total = week.sum()
week = (
    week.to_frame()
        .assign(ratio=lambda frame: frame['posts'] / total)
        .sort_values('day_of_week')
        .reset_index()
)
week.to_feather('grouped/weekly_distribution.feather')

## Jobs per city

Job posts per state are also included in the data frame.

In [26]:
# Number of posts for every (locality, region, posted_date) combination.
jobs_city_date = (
    df.groupby(['locality', 'region', 'posted_date'])['title']
      .count()
      .rename('posts')
      .to_frame()
      .reset_index()
)
jobs_city_date.to_feather('grouped/jobs_city_date.feather')

# Dimensionality reduction

I want to apply SVD to a sparse matrix of term counts from `CountVectorizer` to get the principal axes, but `TruncatedSVD` from `sklearn` is too memory-intensive and takes a long time. A possible solution is to find an online (incremental) algorithm, something like gradient descent for SVD. I found a few resources to go through for this:

- [stack exchange](https://stats.stackexchange.com/questions/177007/updating-svd-decomposition-after-adding-one-new-row-to-the-matrix)
- [gensim](https://pypi.org/project/gensim/)
- [surprise](http://surpriselib.com/)
- [sparsesvd](https://pypi.org/project/sparsesvd/)

I'm leaning towards `gensim` at the moment: there are good resources for it, and it seems to be widely used for this express purpose, in particular its `Latent Semantic Indexing` transformation. [Data Camp](https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python) has a resource outlining this exact procedure.

In [153]:
class CleanTransformer(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that normalises each string of a pandas Series.

    Each string is lower-cased, every character matched by the cleaning
    pattern is replaced by a space, and runs of whitespace are collapsed.
    """

    @staticmethod
    def _process(string):
        """Return the cleaned form of a single string."""
        lowered = string.lower()
        spaced = re.sub(r"""[\d+|-|-|/]""", ' ', lowered)
        # split() with no argument also trims leading/trailing whitespace.
        return ' '.join(spaced.split())

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Apply `_process` to every element of `X` (via `X.apply`)."""
        return X.apply(self._process)

class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    """Count whole-word occurrences of a fixed vocabulary in each input string.

    Parameters
    ----------
    vocabulary : iterable of str
        Terms to search for.  Each term is matched literally, between word
        boundaries (`\\b`).
    """

    def __init__(self, vocabulary):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params()/set_params(), repr() and
        # sklearn.base.clone() can find it.
        self.vocabulary = vocabulary

    @property
    def vocab(self):
        """Backward-compatible alias for `vocabulary`."""
        return self.vocabulary

    @vocab.setter
    def vocab(self, value):
        self.vocabulary = value

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return one `{term: count}` dict per string in `X`.

        Terms that do not occur in a string are omitted from its dict.
        """
        # Compile each pattern once instead of once per (string, term) pair.
        # re.escape keeps terms containing regex metacharacters literal.
        patterns = [
            (term, re.compile(r'\b{}\b'.format(re.escape(term))))
            for term in self.vocabulary
        ]
        output = []
        for string in X:
            found = {}
            for term, pattern in patterns:
                num = len(pattern.findall(string))
                if num:
                    found[term] = num
            output.append(found)

        return output

## Clean and count transform

In [154]:
# Clean the titles, then count the 500 most frequent non-stop-word terms.
autobot_pipeline = Pipeline(steps=[
    ('clean', CleanTransformer()),
    ('count', CountVectorizer(stop_words='english', max_features=500)),
])
counts = autobot_pipeline.fit_transform(df['title'])

In [155]:
# Persist the count matrix and the fitted pipeline so later sessions can skip the fit.
for path, obj in (
    ('transforms/counts_sample.pkd', counts),
    ('transforms/autobot.pkd', autobot_pipeline),
):
    with open(path, 'wb') as file:
        dill.dump(obj, file)

In [51]:
# Restore the cached count matrix and fitted pipeline.
# NOTE: dill.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('transforms/counts_sample.pkd', mode='rb') as file:
    counts = dill.load(file)
with open('transforms/autobot.pkd', mode='rb') as file:
    autobot_pipeline = dill.load(file)

In [52]:
count_step = autobot_pipeline.named_steps['count']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(count_step, 'get_feature_names_out'):
    feature_names = count_step.get_feature_names_out().tolist()
else:
    feature_names = count_step.get_feature_names()
feature_names

['account',
 'accountant',
 'accounting',
 'accounts',
 'acquisition',
 'admin',
 'administrative',
 'administrator',
 'advanced',
 'advisor',
 'affairs',
 'agent',
 'aide',
 'alexa',
 'amazon',
 'america',
 'analysis',
 'analyst',
 'analytics',
 'angeles',
 'apparel',
 'appliance',
 'appliances',
 'application',
 'applications',
 'apply',
 'architect',
 'area',
 'assembler',
 'asset',
 'assistant',
 'associate',
 'associates',
 'assurance',
 'atlanta',
 'attendant',
 'audit',
 'auditor',
 'austin',
 'auto',
 'automation',
 'automotive',
 'avp',
 'aws',
 'az',
 'backroom',
 'backup',
 'bakery',
 'bank',
 'banker',
 'banking',
 'barista',
 'bartender',
 'based',
 'bath',
 'bbw',
 'beach',
 'benefits',
 'big',
 'bilingual',
 'bonus',
 'boston',
 'branch',
 'brand',
 'bulk',
 'business',
 'buyer',
 'ca',
 'car',
 'care',
 'case',
 'cashier',
 'cashiers',
 'catering',
 'cdl',
 'center',
 'central',
 'certified',
 'chain',
 'charlotte',
 'chi',
 'chicago',
 'chili',
 'city',
 'claims',
 'cl

## Build job vocabulary

Use the US Bureau of Labor Statistics job descriptions to build a vocabulary for searching the job post data.

In [215]:
# Unique occupation titles, re-indexed from 0 after de-duplication.
test = (
    pd.read_csv('other_data/BLS_job_descriptions.csv', usecols=['occ_title'])
      .drop_duplicates()
      .reset_index(drop=True)
)
# Discard the first and the last remaining row, then keep the titles as a Series.
idxs = [test.index[0], test.index[-1]]
test = test.drop(idxs)['occ_title']

In [262]:
# `\W` is a regular expression, so request regex matching explicitly: since
# pandas 2.0 Series.str.replace defaults to regex=False and would look for the
# literal two-character text `\W` instead.
temp = test.str.replace(r'\W', ' ', regex=True)
# One row per word: split each title on whitespace and stack the pieces.
temp = temp.str.split(expand=True).stack()

## Reduce Dimensionality

I want to apply SVD to the sparse count matrix to get the principal axes, but `TruncatedSVD` from `sklearn` is too memory-intensive and takes a long time. A possible solution is to find an online (incremental) algorithm, something like gradient descent for SVD. I found a few resources to go through for this:

- [stack exchange](https://stats.stackexchange.com/questions/177007/updating-svd-decomposition-after-adding-one-new-row-to-the-matrix)
- [gensim](https://pypi.org/project/gensim/)
- [surprise](http://surpriselib.com/)
- [sparsesvd](https://pypi.org/project/sparsesvd/)

I'm leaning towards `gensim` at the moment: there are good resources for it, and it seems to be widely used for this express purpose, in particular its `Latent Semantic Indexing` transformation. [Data Camp](https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python) has a resource outlining this exact procedure.

In [None]:
# Fit a 50-component truncated SVD on the term-count matrix and show the
# shape of the learned components.
svd = TruncatedSVD(n_components=50).fit(counts, df['region'])
svd.components_.shape

In [None]:
# Persist the fitted SVD next to the other cached transforms.
with open('transforms/svd_sample.pkd', mode='wb') as file:
    dill.dump(svd, file)

In [None]:
# Read from the same location the SVD was written to (`transforms/`), not the
# working directory.
# NOTE: dill.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('transforms/svd_sample.pkd', 'rb') as file:
    svd = dill.load(file)