In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='apply')

from random import sample
import dill
import re
import os
import sys

# Parse and cache .zip files

In [2]:
def _no_location(df):
    truth = np.array(df['region'].isna().tolist() and df['locality'].isna().tolist())
    idx = df[truth].index
    df.drop(idx, inplace=True)
    return None

def _abrv_states(df):
    """Upper-case ``df['region']`` and map it through ``states_dict``, in place.

    Values that are keys of the notebook-level ``states_dict`` are replaced by
    the corresponding dictionary value; all other values are left as their
    upper-cased form. Returns None.
    """
    upper_region = df['region'].str.upper()
    df['region'] = upper_region.replace(states_dict)

def _in_usa(df):
    """Drop, in place, rows whose ``region`` is not a value of ``states_dict``.

    Relies on the notebook-level ``states_dict``; only rows whose ``region``
    appears among its values survive. Returns None.
    """
    region_flags = df[['region']].isin(states_dict.values())
    is_known_state = region_flags['region']
    rows_to_remove = df[~is_known_state].index
    df.drop(rows_to_remove, inplace=True)

def _has_title(df):
    df.dropna(subset=['title'], inplace=True)
    return None

def _combine_dates(df):
    df['posted_date'].fillna(df['date_added'], inplace=True)
    df.drop('date_added', axis=1, inplace=True)
    df.rename(columns={'posted_date': 'date'}, inplace=True)
    return None

def _has_dates(df, columns):
    df.dropna(subset=columns, how='all', inplace=True)
    return None

def _date_parser(s):
    output = pd.to_datetime(s, format='%Y-%m-%d', errors='coerce')
    return output

def _clean_and_save_chunk(file, num=0, **kwargs):
    """Read ``file`` in chunks, clean each chunk and cache it as a feather file.

    Each chunk is filtered with ``_has_title``, ``_has_dates`` (using the
    notebook-level ``date_cols``), ``_abrv_states`` and ``_in_usa`` and then
    written to ``raw_cache/data_<num>.feather``, with ``num`` incremented once
    per chunk.

    Parameters
    ----------
    file : str
        Path handed to ``pandas.read_csv``.
    num : int, default 0
        Number used for the first cache file written by this call.
    **kwargs
        Forwarded to ``pandas.read_csv``. The loop treats every item yielded
        by ``read_csv`` as a DataFrame, so the caller is expected to pass
        ``chunksize`` (or otherwise request an iterator).

    Returns
    -------
    int
        The next unused cache-file number.
    """
    # Make sure the output directory exists so the first write cannot fail
    # on a fresh checkout.
    os.makedirs('raw_cache', exist_ok=True)
    for chunk in pd.read_csv(file, **kwargs):
        _has_title(chunk)
        _has_dates(chunk, columns=date_cols)
        _abrv_states(chunk)
        _in_usa(chunk)
        # The index is reset before writing because rows were dropped above.
        chunk.reset_index(drop=True).to_feather('raw_cache/data_{}.feather'.format(num))
        num += 1
    return num

def cache_files(files, num=0, **kwargs):
    """Clean and cache every file in ``files``, numbering cache files consecutively.

    ``num`` is the number given to the first cache file; the counter returned
    by ``_clean_and_save_chunk`` for one file is passed on to the next.
    ``**kwargs`` are forwarded unchanged. Returns None.
    """
    next_num = num
    for zip_path in tqdm(files, desc='zip files'):
        next_num = _clean_and_save_chunk(zip_path, num=next_num, **kwargs)

def _get_df(file, **kwargs):
    """Load one cached feather file and keep only usable, in-range rows.

    ``**kwargs`` are forwarded to ``pandas.read_feather``. The loaded frame is
    filtered by ``_within_range`` and then rows with a missing
    ``posted_date`` are removed before the frame is returned.
    """
    frame = pd.read_feather(file, **kwargs)
    _within_range(frame)
    frame.dropna(subset=['posted_date'], inplace=True)
    return frame

def _within_range(df):
    start = pd.datetime(2017, 1, 1)
    end = pd.datetime(2018, 7, 1)
    truth = ~df['posted_date'].isin(pd.date_range(start, end))
    df.drop(df[truth].index, inplace=True)

def _add_day_of_week(df):
    days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']
    df['day_of_week'] = pd.Categorical(df['posted_date'].dt.day_name(), categories=days, ordered=True)
    return None

In [3]:
# Reference tables loaded from the project's `other_data` folder.
pop = pd.read_feather('other_data/census.feather')
states = pd.read_feather('other_data/us_states.feather')
# Lookup from each value in the STATE column to the matching Abrv value.
# _abrv_states upper-cases `region` before applying this mapping, so the
# STATE keys are presumably upper-case -- TODO confirm against the file.
states_dict = states.set_index('STATE').to_dict()['Abrv']

# Columns read from the raw files, and the subset of them parsed as dates.
zip_columns = ['title', 'brand', 'category', 'locality', 'region', 'date_added', 'posted_date']
date_cols = ['date_added', 'posted_date']

# pd.Timestamp replaces the `pd.datetime` alias, which was removed from
# pandas 2.0; Timestamp is a datetime subclass, so uses of these values
# as datetimes keep working.
start = pd.Timestamp(2017, 12, 1)
end = pd.Timestamp(2018, 7, 21)

  labels, = index.labels


In [106]:
# Clean every raw file in `raw_zips` and cache the result chunk by chunk.
folder = 'raw_zips'
# os.listdir returns entries in arbitrary order; sorting keeps the numbering
# of the cache files reproducible from one run to the next.
files = [os.path.join(folder, file) for file in sorted(os.listdir(folder))]
# chunksize is a row count, so it is passed as an int rather than the float 1e7.
# NOTE: `date_parser` is deprecated from pandas 2.0 in favour of `date_format`;
# it is kept here so that unparseable dates are still coerced to NaT.
cache_files(files, usecols=zip_columns, chunksize=10000000, compression='infer', dtype=str,
            parse_dates=date_cols, date_parser=_date_parser)

HBox(children=(IntProgress(value=0, description='zip files', max=7, style=ProgressStyle(description_width='ini…

# Group data

Group postings by day, week, state, and city over time, and cache the results.

In [18]:
# Count postings per (locality, region, posted_date) across all cached chunks.
job_counts = None
folder = 'raw_cache'
files = [os.path.join(folder, file) for file in os.listdir(folder) if file.endswith('.feather')]

for file in tqdm(files, desc='feather_files'):
    df = _get_df(file, columns=['locality', 'region', 'posted_date'])
    df['posts'] = 1
    grouped = df.groupby(['locality', 'region', 'posted_date']).sum()
    if job_counts is None:
        # Seed the running total with the first file's counts directly,
        # instead of adding them to an empty, dtype-less Series.
        job_counts = grouped['posts']
    else:
        # Groups present in only one operand count as 0 on the other side.
        job_counts = job_counts.add(grouped['posts'], fill_value=0)

if job_counts is None:
    # Nothing was cached: fail with a clear message instead of an
    # AttributeError on None further down.
    raise FileNotFoundError("no .feather files found in '{}'".format(folder))

job_counts = job_counts.reset_index()
# Adding with fill_value can yield floats; counts are stored as integers.
job_counts['posts'] = job_counts['posts'].astype(int)
# Make sure the output directory exists before writing.
os.makedirs('grouped', exist_ok=True)
job_counts.to_feather('grouped/job_counts.feather')

HBox(children=(IntProgress(value=0, description='feather_files', max=41, style=ProgressStyle(description_width…


