In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [6]:
# Columns read from every raw CSV.
columns = ['title', 'brand', 'category', 'locality', 'region', 'date_added', 'posted_date']

# Lookup from the 'STATE' column to the 'Abrv' column of us_states.csv.
# NOTE(review): presumably upper-case state name -> postal abbreviation; confirm against the file.
states = pd.read_csv(
    'us_states.csv',
    names=['STATE', 'State', 'Abrv'],
    usecols=['STATE', 'Abrv'],
)
states_dict = states.set_index('STATE')['Abrv'].to_dict()
def _no_location(df):
    truth = np.array(df['region'].isna().tolist() and df['locality'].isna().tolist())
    idx = df[truth].index
    df.drop(idx, inplace=True)
    return None

def _abrv_states(df):
    """Upper-case df['region'] and swap values found in states_dict, in place.

    Values that equal a key of the module-level ``states_dict`` are replaced
    by the mapped value; every other value is left upper-cased.
    """
    upper_region = df['region'].str.upper()
    df['region'] = upper_region.replace(states_dict)

def _in_usa(df):
    """Drop, in place, rows whose 'region' is not a value of states_dict."""
    is_known = df['region'].isin(states_dict.values())
    unknown_labels = df.loc[~is_known].index
    df.drop(unknown_labels, inplace=True)

def _combine_dates(df):
    df['posted_date'].fillna(df['date_added'], inplace=True)
    df.drop('date_added', axis=1, inplace=True)
    df.rename(columns={'posted_date': 'date'}, inplace=True)
    return None

def _parse_date(df, columns=['date']):
    for column in columns:
        df[column] = pd.to_datetime(df[column], yearfirst=True)
    return None

def _clean_and_save_chunk(file, columns, num=0, chunksize=1e7, compression='infer'):
    """Read `file` chunk by chunk, filter each chunk, and write it to feather.

    Each chunk is passed through ``_abrv_states`` and ``_in_usa`` and then
    written to 'raw_cache/data_<n>.feather', where <n> starts at `num` and
    increases by one per chunk.

    Returns the next unused file number, so the caller can continue numbering
    across several input files.
    """
    next_num = num
    reader = pd.read_csv(
        file,
        usecols=columns,
        chunksize=chunksize,
        compression=compression,
    )
    for part in reader:
        _abrv_states(part)
        _in_usa(part)
        # Rows were dropped above, so the index is rebuilt before writing.
        out_path = 'raw_cache/data_%s.feather' % next_num
        part.reset_index(drop=True).to_feather(out_path)
        next_num += 1
    return next_num

def cache_files(files, columns, num=0, chunksize=1e7, compression='infer'):
    """Run ``_clean_and_save_chunk`` over every path in `files`.

    The output file number is carried from one input file to the next,
    starting at `num`. Progress is shown with a tqdm bar labelled 'zip files'.
    """
    next_num = num
    for zip_path in tqdm(files, desc='zip files'):
        next_num = _clean_and_save_chunk(zip_path, columns, next_num, chunksize, compression)

In [7]:
# Input archives raw_zips/jobs_1.zip through raw_zips/jobs_7.zip.
files = ['raw_zips/jobs_%i.zip' % idx for idx in range(1, 8)]
cache_files(files, columns)

HBox(children=(IntProgress(value=0, description='zip files', max=7, style=ProgressStyle(description_width='ini…

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
