In [1]:
from src.dictionaries import *
import pandas as pd
import numpy as np
import os
os.chdir('/home/cj/Documents/dsi/capstones/report-card-recession')

ModuleNotFoundError: No module named 'src'

In [None]:
#This file takes the quarterly raw data from the QCEW and turns them into quarterly timelines. From here, we can compute target variables.

def add_qtrid(df):
    '''
    adds a column for the year and quarter.

    params: df(dataframe)
    returns: dataframe with column added'''
    df['qtrid'] = df['year'] + (df['qtr']/4)
    return df

def import_one(year):
    '''brings a single year's woth of data into a dataframe. Used for initial EDA. 
    Referenced in import_all

    params: year(str)
    returns: df(dataframe)'''
    filepath = 'data/' + str(year) + '.csv'
    #all relevant csvs are renamed with only the year
    df = pd.read_csv(filepath, dtype = schema_dict)
    #schema_dict is found in dictionaries.py
    for column in drop_columns:
        if column in df.columns:
            df = df.drop([column], axis = 1)
    return df

def import_all(years):
    '''combines as many years ofdata into a single dataframe, as well as adding quater id
    References import_one and add_qtrid

    params: years (list of str)
    returns: df (dataframe)'''
    df = import_one(years[0])
    for year in years[1:]:
        df = df.append(import_one(year))
    df = add_qtrid(df)
    return df


def third_quarter(index):
    '''imports only every 3rd quarter row- required to import entire dataset (too large)
    Referenced in feature_space

    params: index, int
    returns: boolean'''
    if index == 0:
        return False
    elif (index - 3) % 4 == 0:
        return False
    else:
        return True

In [None]:
### Help functions for target calculation ###

def calc_nadir(s):
    assert isinstance(s, pd.Series)
    return s.min()

def calc_nadir_qtr(s):
    return s.argmin()

def calc_pre_peak(s):
    return s[ : s.argmin()].max()

def calc_pre_peak_quarter(s):
    try:
        qtr = s[ : s.argmin()].argmax()
    except:
        qtr = None
    return qtr

def calc_post_peak(s):
    return s[s.argmin() : ].max()

def calc_post_peak_qtr(s):
    return s[s.argmin() : ].argmax() + s.argmin()

In [None]:
def create_timeline_2001(variable):
    '''produces a timeline dataframe(and exports to json) for 2001

    params: variable, str, one of ['month3_emplvel' (employment), 'avg_wkly_wage' (wages)]
    returns: df, Dataframe'''
    
    df = import_all(recession1_years)
    df = df.pivot_table(columns = 'qtrid', values = variable, index = ['area_fips', 'area_title'], aggfunc = np.sum)
    df = df.reset_index()
    #fill nans
    df = df.fillna(0)
    df2 = df.drop(columns = ['area_fips', 'area_title'])
    df2 = df2.reset_index()
    df2 = df2.drop(columns = 'index')
    df2 = df2.fillna(0)

    
    #this specifies when the jobs numbers "bottom-out" during the recession
    nadir = df2.iloc[:,6:].apply(lambda x: calc_nadir(x), axis=1).rename('nadir')
    
    #counts the number of quarters to the nadir since the beginning of the timeframe
    nadir_qtr = df2.iloc[:,6:].apply(lambda x: calc_nadir_qtr(x), axis=1).rename('nadir_qtr')
    
    #computes the highest points before and after the nadir, and captures the quarter count
    pre_peak = df2.apply(lambda x: calc_pre_peak(x), axis=1).rename('pre_peak')
    pre_peak_qtr = df2.apply(lambda x: calc_pre_peak_quarter(x), axis=1).rename('pre_peak_qtr')
    post_peak = df2.apply(lambda x: calc_post_peak(x), axis=1).rename('post_peak')
    post_peak_qtr = df2.apply(lambda x: calc_post_peak_qtr(x), axis=1).rename('post_peak_qtr')
    
    #puts the computed points in a dataframe, joins with timeline
    df_results = pd.concat([df['area_fips'], nadir, nadir_qtr, pre_peak, pre_peak_qtr, post_peak, post_peak_qtr], axis=1)
    df = df.join(df_results, how = 'outer', rsuffix = '_derive')
    
    #PRIMARY TARGET: did the area decline the entire time(-1), did it start growing again but not avhieve it's former numbers(0), or did it grow and recover(1)?
    df['recovery'] = (df['post_peak'] >= df['pre_peak']) *1

    #SECONDARY TARGET: How long did the jobs numbers decline?
    df['decline'] = (df['nadir_qtr'] - df['pre_peak_qtr'])
    
    #TERTIARY TARGET different in before/after jobs numbers
    df['delta'] = df['post_peak'] - df['pre_peak']
    
    #export the data
    df.to_json('data/Recession1_timeline.json')
    df = df.fillna(0)
    return df