In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import random
import dill
import re
import os

from sklearn import base
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Parse and cache .zip files

In [4]:
# Reference tables, read from feather files relative to the working directory.
pop = pd.read_feather('census.feather')
states = pd.read_feather('us_states.feather')
# Lookup from the values of the STATE column to the values of the Abrv column.
states_dict = states.set_index('STATE')['Abrv'].to_dict()

# Columns requested from every raw CSV archive.
zip_columns = [
    'title', 'brand', 'category',
    'locality', 'region',
    'date_added', 'posted_date',
]

def _no_location(df):
    truth = np.array(df['region'].isna().tolist() and df['locality'].isna().tolist())
    idx = df[truth].index
    df.drop(idx, inplace=True)
    return None

def _abrv_states(df):
    """Upper-case the ``region`` column in place and map it through ``states_dict``.

    Values that are not keys of ``states_dict`` keep their upper-cased text.
    """
    # NOTE(review): the lookup runs on upper-cased text, so it presumably relies
    # on the keys of states_dict being upper-case as well -- confirm.
    upper_region = df['region'].str.upper()
    df['region'] = upper_region.replace(states_dict)
    return None

def _in_usa(df):
    """Drop, in place, rows whose ``region`` is not one of the values of ``states_dict``."""
    known_regions = list(states_dict.values())
    outside = ~df['region'].isin(known_regions)
    df.drop(df[outside].index, inplace=True)
    return None

def _has_title(df):
    df.dropna(subset=['title'], inplace=True)
    return None

def _combine_dates(df):
    df['posted_date'].fillna(df['date_added'], inplace=True)
    df.drop('date_added', axis=1, inplace=True)
    df.rename(columns={'posted_date': 'date'}, inplace=True)
    return None

def _parse_date(df, columns=['date']):
    for column in columns:
        df[column] = pd.to_datetime(df[column], yearfirst=True)
    return None

def _clean_and_save_chunk(file, columns, num=0, chunksize=1e7, compression='infer'):
    """Stream one CSV archive in chunks, filter each chunk, and cache it as a feather file.

    Each chunk is passed through ``_has_title``, ``_abrv_states`` and ``_in_usa``
    and then written to ``raw_cache/data_<num>.feather``.

    Parameters
    ----------
    file : path or file-like accepted by ``pandas.read_csv``
    columns : list of str
        Columns to read (forwarded as ``usecols``).
    num : int
        Number used for the first output file; incremented once per chunk.
    chunksize : number
        Rows per chunk (forwarded to ``pandas.read_csv``).
    compression : str
        Forwarded to ``pandas.read_csv``.

    Returns
    -------
    int
        The next unused file number, so numbering can continue with another archive.
    """
    # The feather writer does not create missing directories, so make sure the
    # cache directory exists before the first chunk is written.
    os.makedirs('raw_cache', exist_ok=True)
    reader = pd.read_csv(file, usecols=columns, chunksize=chunksize, compression=compression)
    for chunk in reader:
        _has_title(chunk)
        _abrv_states(chunk)
        _in_usa(chunk)
        # Rows were dropped above, so renumber before writing.
        chunk.reset_index(drop=True).to_feather('raw_cache/data_%s.feather' % num)
        num += 1
    return num

def cache_files(files, columns, num=0, chunksize=1e7, compression='infer'):
    """Run ``_clean_and_save_chunk`` over every archive in ``files`` with a progress bar.

    Output numbering starts at ``num`` and continues across archives. Returns None.
    """
    next_index = num
    progress = tqdm(files, desc='zip files')
    for archive in progress:
        next_index = _clean_and_save_chunk(archive, columns, next_index, chunksize, compression)
    return None

In [None]:
# Archives are named jobs_1.zip through jobs_7.zip.
files = list(map('raw_zips/jobs_{}.zip'.format, range(1, 8)))
cache_files(files, zip_columns)

# Group data

Group by day, week, and states over time

## Load Feathers

In [None]:
# Cached chunks data_0 .. data_9 are read; adjust the range if the caching
# step produced a different number of files.
files = ['raw_cache/data_{}.feather'.format(i) for i in range(10)]
dfs = []
for file in tqdm(files, desc='load files'):
    temp = pd.read_feather(file, columns=['title', 'region', 'posted_date'])
    # Rows without a posting date cannot be placed on the timeline.
    dfs.append(temp.dropna(subset=['posted_date']))
# Concatenate once, after the loop: concatenating on every iteration re-copies
# all rows loaded so far each time.
df = pd.concat(dfs, ignore_index=True)
# Unparseable dates become NaT instead of raising.
df['posted_date'] = pd.to_datetime(df['posted_date'], yearfirst=True, errors='coerce')
df['day_of_week'] = df['posted_date'].dt.day_name()
df.describe()

## Total number of jobs posted per day

In [None]:
# Index by posting date, bucket by calendar day, and count non-null titles per bucket.
per_day = (
    df.set_index('posted_date')
      .groupby(pd.Grouper(freq='D'))['title']
      .count()
      .rename('posts')
      .to_frame()
      .reset_index()
)
per_day.to_feather('grouped/posts_per_day.feather')

## Weekly distribution

In [None]:
# Number of non-null titles per weekday name.
week = df.groupby('day_of_week')['title'].count().rename('posts')
total = week.sum()
week = week.to_frame().reset_index()

# Monday-first ordering key for the weekday names.
week_map = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}
week['order'] = week['day_of_week'].replace(week_map)
# NOTE(review): this reset keeps the pre-sort row labels as an extra 'index'
# column in the saved file -- confirm that column is wanted.
week = week.sort_values('order').reset_index()
week.to_feather('grouped/weekly_distribution.feather')

# Dimensionality reduction

Taking the titles from the job posts, I'll want to eliminate some of the words or symbols with `CountVectorizer`, which outputs a sparse matrix of token counts (one row per title, one column per retained token). The number of unique, interesting keywords will be very large, so to make more sense of the data I will perform dimensionality reduction via `TruncatedSVD` on the `CountVectorizer` sparse matrix output. The principal components will generally correspond to the sectors of industry.

At this point, a good intermediate checkpoint is to visualize the sectors of industry, possibly as a function of time.

In [None]:
class CleanTransformer(base.BaseEstimator, base.TransformerMixin):
    """Stateless text normaliser: lower-case, blank out selected characters, squeeze whitespace."""

    @staticmethod
    def _process(string):
        """Return ``string`` lower-cased, with matched characters replaced by spaces and whitespace collapsed."""
        lowered = string.lower()
        # NOTE(review): the pattern is a character class, so it matches single
        # characters -- digits, '+', '|', '-' and '/' -- rather than the
        # alternation it resembles; confirm which was intended.
        blanked = re.sub(r"""[\d+|-|-|/]""", ' ', lowered)
        # split() with no arguments also trims leading and trailing whitespace.
        return ' '.join(blanked.split())

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Apply ``_process`` to every element of ``X`` via ``X.apply`` (e.g. a pandas Series)."""
        return X.apply(self._process)

class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    """Count whole-word occurrences of each vocabulary term in every input string.

    Parameters
    ----------
    vocabulary : iterable of str
        Terms to look for. Each term is matched literally, between word boundaries.
    """

    def __init__(self, vocabulary):
        # scikit-learn's get_params()/clone()/repr look up an attribute named
        # exactly like each __init__ argument, so store it under that name.
        self.vocabulary = vocabulary

    @property
    def vocab(self):
        """Backward-compatible alias for ``vocabulary``."""
        return self.vocabulary

    @vocab.setter
    def vocab(self, value):
        self.vocabulary = value

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return one ``{term: count}`` dict per string in ``X``.

        ``X`` is any iterable of strings (e.g. a pandas Series). Terms that do
        not occur in a string are omitted from that string's dict.
        """
        # Build the patterns once per call instead of once per (string, term)
        # pair, and escape each term so regex metacharacters in the vocabulary
        # are matched literally rather than interpreted.
        patterns = [
            (term, re.compile(r'\b{}\b'.format(re.escape(term))))
            for term in self.vocabulary
        ]
        output = []
        for string in X:
            found = {}
            for term, pattern in patterns:
                num = len(pattern.findall(string))
                if num:
                    found[term] = num
            output.append(found)

        return output

## Clean and count transform

In [None]:
# Two stages: normalise the titles, then count tokens (English stop words
# removed, vocabulary capped at 500 features).
autobot_pipeline = Pipeline(steps=[
    ('clean', CleanTransformer()),
    ('count', CountVectorizer(max_features=500, stop_words='english')),
])
counts = autobot_pipeline.fit_transform(df['title'])

In [None]:
# Write the count matrix and the fitted pipeline to disk.
with open('transforms/counts_sample.pkd', mode='wb') as counts_out:
    dill.dump(counts, counts_out)
with open('transforms/autobot.pkd', mode='wb') as pipeline_out:
    dill.dump(autobot_pipeline, pipeline_out)

In [None]:
# Read the count matrix and the fitted pipeline back from disk.
# dill.load can execute arbitrary code embedded in the file, so only load
# files that were produced by a trusted run.
with open('transforms/counts_sample.pkd', mode='rb') as counts_in:
    counts = dill.load(counts_in)
with open('transforms/autobot.pkd', mode='rb') as pipeline_in:
    autobot_pipeline = dill.load(pipeline_in)

## SVD

In [None]:
# Fixed seed: TruncatedSVD uses a randomized solver by default, so without it
# the fitted components differ from run to run.
svd = TruncatedSVD(n_components=50, random_state=0)
# TruncatedSVD is unsupervised and ignores `y`; fitting on `counts` alone also
# lets this cell run from the reloaded count matrix without needing `df`.
svd.fit(counts)
np.shape(svd.components_)

In [None]:
# Write the fitted SVD to disk.
with open('transforms/svd_sample.pkd', mode='wb') as svd_out:
    dill.dump(svd, svd_out)

In [None]:
# Read the fitted SVD from the 'transforms/' directory it is written to.
# dill.load can execute arbitrary code embedded in the file, so only load
# files that were produced by a trusted run.
with open('transforms/svd_sample.pkd', 'rb') as file:
    svd = dill.load(file)