In [1]:
import os
import sys
# Make the notebook's own directory and its parent importable, so that
# project packages located there can be imported by the cells below.
module_paths = [os.path.abspath(relative_dir) for relative_dir in ('.', '..')]
for module_path in module_paths:
    if module_path in sys.path:
        # already importable; do not add a duplicate entry
        continue
    sys.path.append(module_path)

In [2]:
import re
import pandas as pd
from scraper.settings import PROJECT_PATH
import wrangler
import numpy as np
import glob

ImportError: No module named 'scraper.settings'

In [None]:
# global vars
# directory holding the csv datasets: `datasets`, resolved one level above PROJECT_PATH.
datasets_path = os.path.normpath(
    os.path.join(PROJECT_PATH, '../datasets')
)

In [None]:
def get_filepath(filename):
    '''
    build the path of a file that lives in the datasets dir.

    :param filename: name of the file, relative to the datasets dir.
    :returns: `filename` joined onto the module-level `datasets_path`.
    '''
    filepath = os.path.join(datasets_path, filename)
    return filepath

In [None]:
def get_normalized_expenditure_dataframe(filename):
    '''
    given the path of an expenditure csv, the function does following:
        - add a YEAR column, taken from the first 4 digit run in the file's base name.
        - add an `index` column holding each row's original position.
        - treat whitespace-only cells as missing and forward fill them
          (all columns except the amount/description columns and YEAR).
        - remove Grand Total row because it has redundant data.

    :param filename: path to the csv file; its base name must contain a 4 digit year.
    :returns: the normalized pandas DataFrame.
    :raises ValueError: if the base name does not contain 4 consecutive digits.
    '''
    df = pd.read_csv(filename)

    # Search the base name only: `filename` is usually a full path, and a
    # directory such as `data2015/` would otherwise be mistaken for the year.
    basename = os.path.basename(filename)
    years = re.findall(r'\d{4}', basename)
    if not years:
        raise ValueError('no 4 digit year found in file name: {!r}'.format(basename))

    # set year column (kept as a string, exactly as found in the name)
    df['YEAR'] = years[0]

    # reset the index; this materializes the row position as an `index` column
    df = df.reset_index()

    # select the columns which have empty values and forward fill them.
    value_columns = ['SOEDESC', 'BILLS', 'GROSS', 'AGDED', 'BTDED', 'NETPAYMENT', 'YEAR']
    cols = [col for col in df.columns if col not in value_columns]
    df[cols] = df[cols].replace(r'^\s+$', np.nan, regex=True)
    df[cols] = df[cols].ffill()

    # remove row with Grand Total as it is redundant: keep only the rows in
    # which no cell equals 'Grand Total'.
    df = df[(df != 'Grand Total').all(axis=1)]

    return df

In [None]:
def arrange_expenditure_data():
    '''
    the function selects the expenditure csvs in the datasets dir, normalizes
    each one and saves the result next to the original as `<name>_copy.csv`.

    a file is selected when its name matches `.*Expend.*\\d{4}\\.csv$` and does
    not start with `07` or `08`. if normalizing a file fails, its path is
    printed and the original exception is re-raised.
    '''
    # raw string: `\d` and `\.` are regex escapes, not python string escapes.
    expenditure_pattern = re.compile(r'.*Expend.*\d{4}\.csv$')

    def to_include(filename):
        # NOTE(review): files starting with 07 and 08 are skipped, presumably
        # because they need different normalization logic - confirm.
        if filename.startswith(('07', '08')):
            return False
        return expenditure_pattern.match(filename) is not None

    # list all files in datasets dir; sorted so the processing order is stable
    to_arrange_with_same_logic = sorted(filter(to_include, os.listdir(datasets_path)))
    for filename in to_arrange_with_same_logic:
        filepath = get_filepath(filename)
        try:
            df = get_normalized_expenditure_dataframe(filepath)
        except Exception:
            # name the offending file, then re-raise with the original traceback
            print(filepath)
            raise

        # save in a file with _copy appended to the original file's name.
        # splitext strips only the trailing extension, even if `.csv` also
        # appears earlier in the path.
        path_without_ext, _ = os.path.splitext(filepath)
        to_file = '{}_copy.csv'.format(path_without_ext)
        df.to_csv(to_file, index=False)

In [None]:
# normalize every selected expenditure csv in the datasets dir and write its `_copy.csv` file.
arrange_expenditure_data()

In [None]:
def concatenate_files(query_str):
    '''
    concatenate files for same query for all the years and all the treasuries.

    every file in the datasets dir matching the glob pattern `query_str` is
    read (its first column is used as the index and then dropped by
    `ignore_index=True`), the frames are stacked, and the result is written to
    the datasets dir as `<prefix>.csv`, where `<prefix>` is the first matched
    file's base name up to and including `wise`/`Wise`.

    :param query_str: glob pattern, relative to the datasets dir.
    :returns: None. nothing is written when no file matches.
    :raises ValueError: if the first matched file's name does not contain
        `wise`/`Wise` followed by `csv`.
    '''
    # get all the files to concatenate. the matches are materialized into a
    # list: testing a lazy iterator with any() would consume its first file
    # and silently leave it out of the concatenation.
    files_to_concatenate = sorted(glob.glob(os.path.join(datasets_path, query_str)))
    if not files_to_concatenate:
        return

    # prepare dataframes from all files
    dataframes = [pd.read_csv(file, index_col=0) for file in files_to_concatenate]

    # concatenate the dataframes
    concatenated_frames = pd.concat(dataframes, ignore_index=True)

    # derive the output name from the first file's base name, so the result is
    # always placed directly in the datasets dir.
    first_name = os.path.basename(files_to_concatenate[0])
    prefix_match = re.search(r'(.*[wW]ise).*csv', first_name)
    if prefix_match is None:
        raise ValueError('cannot derive output name from file: {!r}'.format(first_name))
    to_filepath = get_filepath('{}.csv'.format(prefix_match.group(1)))

    # save the concatenated dataframes to file
    concatenated_frames.to_csv(to_filepath)

In [None]:
# glob patterns selecting the `*copy.csv` expenditure files of queries 01 to 10.
# kept as a list rather than a generator: a generator is exhausted after one
# pass, so re-running the cell that loops over it would silently do nothing.
queries_for_concatenation = ['{0:0=2d}._Expend*copy.csv'.format(x) for x in range(1, 11)]

In [None]:
# write one concatenated csv per query pattern.
for query in queries_for_concatenation:
    concatenate_files(query_str=query)