# Money Diaries Data Reshape

In [1]:
import os
import re
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
# Use fully-qualified option names. Short names are treated as patterns, and
# 'max_rows' is ambiguous on pandas versions that also define
# 'styler.render.max_rows', which makes set_option raise OptionError.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 250)

## Build Wide `DataFrame`

In [3]:
def parse_total(s):
    """Return the last dollar amount mentioned in ``s`` as a float.

    Amounts may carry thousands separators and a decimal part
    (e.g. ``$1,234.56``).  When ``s`` contains no dollar amount the function
    returns ``0.0`` instead of raising, mirroring ``parse_salary``.

    Parameters
    ----------
    s : str
        Free text to search.

    Returns
    -------
    float
        The last dollar amount found, or ``0.0`` if there is none.
    """
    # `(?:,\d{3})*` keeps "$1,234.56" together; without it only "$1" matched.
    amounts = re.findall(r"\$\d+(?:,\d{3})*(?:\.\d+)?", s)
    if not amounts:
        return 0.0
    return float(amounts[-1].strip('$').replace(',', ''))

In [4]:
def parse_salary(s):
    """Return the first dollar amount in ``s`` as an int, or 0 if none exists.

    Any number of comma-separated digit groups is accepted, so
    ``$1,250,000`` is read as 1250000 rather than being cut off after the
    first group.  A decimal part, if present, is ignored.

    Parameters
    ----------
    s : str
        Text to search (the notebook passes the diary title).

    Returns
    -------
    int
        The first dollar amount with separators removed, or ``0`` when the
        text contains no dollar amount.
    """
    matches = re.findall(r"\$\d+(?:,\d+)*", s)
    if not matches:
        return 0
    return int(matches[0].strip('$').replace(',', ''))

In [5]:
def add_totals_cols(df):
    """Add per-day totals, a weekly total and the salary to ``df`` in place.

    For every day-text column (a column named ``day_*`` that is not itself a
    ``*_total`` column) a ``<day>_total`` column is added by applying
    ``parse_total``.  ``weekly_total`` is the row-wise sum of those day
    totals, and ``salary`` is ``parse_salary`` applied to ``title``.

    Day columns are selected by name rather than by position, so the result
    does not depend on column order and the function can be re-run on an
    already-processed frame without feeding total columns back into
    ``parse_total`` or into ``weekly_total``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``day_*`` text columns and a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    day_cols = [
        c for c in df.columns
        if str(c).startswith('day_') and not str(c).endswith('_total')
    ]

    total_cols = []
    for day in day_cols:
        total_col = day + '_total'
        df[total_col] = df[day].apply(parse_total)
        total_cols.append(total_col)

    # Sum only the day totals created above, never an unrelated `*_total`.
    df['weekly_total'] = df[total_cols].sum(axis=1)

    df['salary'] = df['title'].apply(parse_salary)
    return df

In [6]:
def parse_location(s):
    """Extract the location from a title shaped like ``"... In <place> On ..."``.

    The text after the last ``"In "`` is taken and cut at the first
    stand-alone word ``On``.  Matching ``On`` as a whole word keeps place
    names that merely start with those letters (e.g. "Ontario") intact.
    Surrounding whitespace and commas are stripped from the result.

    Parameters
    ----------
    s : str
        Diary title.

    Returns
    -------
    str
        The location text; if ``s`` has no ``"In "`` the whole string up to
        the first stand-alone ``On`` is returned.
    """
    after_in = s.split("In ")[-1]
    before_on = re.split(r"\bOn\b", after_in, maxsplit=1)[0]
    return before_on.strip().strip(',')

In [7]:
def find_between(string, start_marker, end_marker, default=np.nan):
    """Return the stripped text between ``start_marker`` and ``end_marker``.

    ``end_marker`` is searched for only after the end of ``start_marker``.

    Parameters
    ----------
    string : str
        Text to search.  Non-string values (e.g. NaN or None) yield
        ``default``.
    start_marker, end_marker : str
        Delimiters surrounding the wanted text.
    default : object, optional
        Value returned when either marker is missing or ``string`` cannot be
        searched.  Defaults to ``np.nan``.

    Returns
    -------
    str or object
        The text between the markers with surrounding whitespace removed, or
        ``default``.
    """
    try:
        start = string.index(start_marker) + len(start_marker)
        end = string.index(end_marker, start)
    except (ValueError, AttributeError, TypeError):
        # ValueError: marker not found; AttributeError/TypeError: the input
        # is not a searchable string.  Anything else (KeyboardInterrupt, ...)
        # is allowed to propagate.
        return default
    return string[start:end].strip()

In [8]:
def add_occupation(df):
    """Add an ``occupation`` column derived from each row's ``intro`` text.

    The occupation is the text between ``'Occupation:  '`` and
    ``'Industry:'``.  That result is then passed through ``find_between``
    once more with ``'.'`` as the end marker, falling back to the first
    result when the markers are not found.

    NOTE(review): the second lookup runs on the already-extracted value, not
    on the intro, so it only changes a value that itself still contains
    ``'Occupation:  '`` -- confirm this is the intended fallback.

    The frame is modified in place and also returned.
    """
    marker = 'Occupation:  '

    def _extract(intro_text):
        first_pass = find_between(intro_text, marker, 'Industry:')
        return find_between(first_pass, marker, '.', default=first_pass)

    df['occupation'] = df.intro.apply(_extract)
    return df

In [9]:
def add_industry(df):
    """Add an ``industry`` column taken from each row's ``intro`` text.

    The value is whatever ``find_between`` returns for the span between
    ``'Industry:  '`` and ``'Age:'``.  The frame is modified in place and
    also returned.
    """
    def _extract(intro_text):
        return find_between(intro_text, 'Industry:  ', 'Age:')

    df['industry'] = df.intro.apply(_extract)
    return df

In [10]:
def add_location(df):
    """Add a ``location`` column by applying ``parse_location`` to ``title``.

    The frame is modified in place and also returned.
    """
    locations = df.title.apply(parse_location)
    df['location'] = locations
    return df

In [11]:
filename = 'diaries.json'
# Load the diaries and run each enrichment step in turn.
wide_df = (
    pd.read_json(filename)
    .pipe(add_totals_cols)
    .pipe(add_occupation)
    .pipe(add_industry)
    .pipe(add_location)
)

Wide `DataFrame` with columns representing day-text and some easy-to-parse attributes.

## Build skinny `DataFrame` of Monthly Expenses

In [12]:
def row_to_expenses(intro_text):
    '''
    Return a small DataFrame of the monthly expenses listed for one diary.

    Only the text after the last 'Monthly Expenses' marker is scanned (the
    whole text when the marker is absent).  Each dollar amount found there
    becomes one row: column 0 holds the amount as a float and column 1 holds
    the label, i.e. the text between the previous amount and this one, with
    anything up to a closing parenthesis and any surrounding colons or
    whitespace removed.

    Parameters
    ----------
    intro_text : str
        The diary's intro text.

    Returns
    -------
    pandas.DataFrame
        One row per amount with integer column labels 0 (amount) and
        1 (label); an empty DataFrame when no amount is found.
    '''
    monthly_expense_text = intro_text.split('Monthly Expenses')[-1]

    monthly_expenses = []
    label_start = 0  # end of the previous amount == start of the next label
    for match in re.finditer(r"\$\d+,?\d*(?:\.\d+)?", monthly_expense_text):
        amount = float(re.sub(r'[$,]', '', match.group()))
        label = monthly_expense_text[label_start:match.start()].strip()
        # Drop a trailing parenthetical left over from the previous expense.
        label = label.split(')')[-1].strip(':').strip()
        monthly_expenses.append((amount, label))
        label_start = match.end()

    return pd.DataFrame(monthly_expenses)

In [13]:
# One small expenses frame per diary, tagged with the diary's row label,
# then stacked into a single long frame with a fresh 0..n-1 index.
dfs = []
for row_ix in range(len(wide_df)):
    small_expenses_df = (
        row_to_expenses(wide_df.loc[row_ix, 'intro'])
        .assign(diary_index=row_ix)
        .rename(columns={0: 'mn_exp_amt', 1: 'mn_exp_rawtext'})
    )
    dfs.append(small_expenses_df)
monthly_expense_df = pd.concat(dfs, axis=0).reset_index(drop=True)

Skinny `DataFrame` with each row representing a diarist's single monthly expense.

This mostly works, but some cleanup is still needed. The remaining cases may have to be fixed by hand, e.g. in Excel.

## Build skinny `DataFrame` of Daily Expenses

In [14]:
# Names of the per-day total columns, in day order.
day_cols = [
    'day_%s_total' % name
    for name in ('one', 'two', 'three', 'four', 'five', 'six', 'seven')
]

day_totals = wide_df[day_cols]

In [15]:
def row_to_day_totals(row_ix, day_totals):
    """Turn one row of ``day_totals`` into a long frame, one row per day.

    Parameters
    ----------
    row_ix : label
        Index label of the diary row to reshape (looked up with ``.loc``).
    day_totals : pandas.DataFrame
        Frame whose columns are the per-day totals, in day order.

    Returns
    -------
    pandas.DataFrame
        Columns ``day_total`` (the row's values), ``day_nm`` (1-based
        position of the day) and ``diary_index`` (``row_ix`` repeated).
    """
    amounts = day_totals.loc[row_ix, :].values
    amounts = pd.DataFrame(amounts, columns=['day_total'])
    # Number the days from the data instead of assuming exactly seven.
    amounts['day_nm'] = np.arange(1, len(amounts) + 1)
    amounts['diary_index'] = row_ix
    return amounts

In [16]:
# Reshape every diary's day totals and stack them into one long frame.
dfs = [row_to_day_totals(row_ix, day_totals) for row_ix in range(len(wide_df))]
daily_expense_df = pd.concat(dfs, axis=0).reset_index(drop=True)

Skinny `DataFrame` with rows representing total daily expense in a diary-day.

## Build skinny `DataFrame` of diarists

In [17]:
# Per-diarist attributes; reset_index moves the row index into a column.
diarist_cols = ['salary', 'occupation', 'industry', 'location', 'url']
diarist_df = wide_df[diarist_cols].reset_index(level=0)

Skinny `DataFrame` with each row representing a diarist.

## Build skinny `DataFrame` of day text

In [18]:
# Names of the raw day-text columns, in day order.
day_text_cols = [
    'day_' + name
    for name in ('one', 'two', 'three', 'four', 'five', 'six', 'seven')
]

day_text = wide_df[day_text_cols]

In [19]:
def row_to_day_text(row_ix, day_text):
    """Turn one row of ``day_text`` into a long frame, one row per day.

    Parameters
    ----------
    row_ix : label
        Index label of the diary row to reshape (looked up with ``.loc``).
    day_text : pandas.DataFrame
        Frame whose columns are the per-day raw texts, in day order.

    Returns
    -------
    pandas.DataFrame
        Columns ``day_rawtext`` (the row's values), ``day_nm`` (1-based
        position of the day) and ``diary_index`` (``row_ix`` repeated).
    """
    # Read the requested row; a hard-coded 0 here would give every diary
    # the first diary's text.
    txt = day_text.loc[row_ix, :].values
    txt = pd.DataFrame(txt, columns=['day_rawtext'])
    # Number the days from the data instead of assuming exactly seven.
    txt['day_nm'] = np.arange(1, len(txt) + 1)
    txt['diary_index'] = row_ix
    return txt

In [20]:
# Reshape every diary's day texts and stack them into one long frame.
dfs = [row_to_day_text(row_ix, day_text) for row_ix in range(len(wide_df))]
daily_text_df = pd.concat(dfs, axis=0).reset_index(drop=True)

Skinny `DataFrame` holding raw text data.  Each row is a diary-day.

# Save to disk for manual cleaning

In [21]:
# Output file name -> frame to be written under that name.
dataframes = dict([
    ('monthly_expenses.csv', monthly_expense_df),
    ('daily_expenses.csv', daily_expense_df),
    ('diarists.csv', diarist_df),
    ('daily_text.csv', daily_text_df),
])

In [22]:
# Write into ./data_reshape under the current working directory.  The old
# `dirname(abspath(__name__))` produced this same directory only because
# abspath resolves any relative name against the working directory.
directory = os.getcwd()
output_dir = os.path.join(directory, 'data_reshape')
os.makedirs(output_dir, exist_ok=True)
for basename, df in dataframes.items():
    savepath = os.path.join(output_dir, basename)
#     df.to_csv(savepath, index=False)

Uncomment the `df.to_csv(...)` line in the cell above to write the CSV files to the `data_reshape` directory.