In [1]:
import os
import re
import pandas as pd
import numpy as np
import glob

In [2]:
from utils import get_filepath
from scraper.settings import PROJECT_PATH

In [3]:
# global vars
# datasets_path: normalized path of the `datasets` directory that sits next to
# PROJECT_PATH (i.e. PROJECT_PATH/../datasets). The functions below list, read
# and glob csv files in this directory.
datasets_path = os.path.normpath(os.path.join(PROJECT_PATH, '../datasets'))

In [4]:
def get_normalized_expenditure_dataframe(filename):
    '''
    given a filename, the function loads the csv and does following:
        - add START_DATE and END_DATE columns, taken from the two 8-digit
          groups found in the file's base name.
        - turn the row index into a leading `index` column.
        - fill blank cells with forward fill method, for every column that is
          not listed in `exclude_cols`.
        - remove every row holding a 'Grand Total' cell because it has
          redundant data.

    args:
        filename: path of the csv file to load. Its base name must contain
            exactly two 8-digit groups: the start date, then the end date.

    returns:
        the normalized pandas DataFrame.

    raises:
        ValueError: if the base name does not contain exactly two 8-digit groups.
    '''
    df = pd.read_csv(filename)

    # Search only the base name: an 8-digit run in a parent directory (e.g. a
    # dated folder) must not be mistaken for part of the reporting period.
    dates = re.findall(r'(\d{8})', os.path.basename(filename))
    if len(dates) != 2:
        raise ValueError(
            'expected exactly two 8-digit dates in file name {!r}, found {}'.format(
                os.path.basename(filename), len(dates)))
    start_date, end_date = dates

    # set year columns (the same value is broadcast to every row)
    df['START_DATE'] = start_date
    df['END_DATE'] = end_date

    # reset the index; this adds the old index as a leading `index` column,
    # which ends up as the first column of the csv written from this frame.
    df = df.reset_index()

    # select the columns which have empty values and forward fill them.
    exclude_cols = ['SOEDESC', 'BILLS', 'GROSS', 'AGDED', 'BTDED', 'NETPAYMENT', 'START_DATE', 'END_DATE']
    cols = [col for col in df.columns if col not in exclude_cols]
    # whitespace-only cells count as empty: turn them into NaN so ffill sees them.
    df[cols] = df[cols].replace(r'^\s+$', np.nan, regex=True)
    df[cols] = df[cols].ffill()

    # remove row with Grand Total as it is redundant: keep only the rows in
    # which no cell equals 'Grand Total'.
    df = df[(df != 'Grand Total').all(axis=1)]

    return df

In [11]:
def arrange_expenditure_data():
    '''
    the function selects files in the datasets dir, normalize the data and creates new csvs
    from them.

    a file is selected when its name contains 'Expend', ends with 8 digits followed
    by '.csv', and does not start with '07' or '08'. Each selected file is normalized
    with get_normalized_expenditure_dataframe and written next to the original as
    '<original name>_copy.csv' (without the index).

    raises:
        any exception raised while normalizing a file is re-raised after printing
        the path of the offending file.
    '''
    def to_include(filename):
        # files starting with 07/08 are skipped here
        # NOTE(review): presumably they need different handling - confirm.
        if filename.startswith(('07', '08')):
            return False
        # raw string: '\d' and '\.' are not valid escapes in a normal string literal.
        return re.match(r'.*Expend.*\d{8}\.csv$', filename) is not None

    # list all files in datasets dir and keep the ones sharing the same logic
    all_files = os.listdir(datasets_path)
    to_arrange_with_same_logic = filter(to_include, all_files)
    for filename in to_arrange_with_same_logic:
        filepath = get_filepath(filename)
        try:
            df = get_normalized_expenditure_dataframe(filepath)
        except Exception:
            # show which file failed, then re-raise with the original traceback.
            print(filepath)
            raise

        # save in a file with _copy appended to the original file's name.
        # splitext strips only the trailing extension, so a '.csv' occurring
        # earlier in the path does not truncate the output path.
        root, _ext = os.path.splitext(filepath)
        to_file = '{}_copy.csv'.format(root)
        df.to_csv(to_file, index=False)

In [17]:
# arrange_expenditure_data()

In [12]:
def concatenate_files(query_str):
    '''
    concatenate files for same query for all the years and all the treasuries.

    every file in the datasets dir matching `query_str` is read, the frames are
    stacked into one, and the result is saved to a csv whose name is the first
    matching file's path cut right after its last 'wise'/'Wise', plus '.csv'.
    nothing is done when no file matches.

    args:
        query_str: glob pattern, relative to the datasets dir.

    raises:
        ValueError: if the first matching file's path contains no 'wise'/'Wise'
            to derive the output name from.
    '''
    # get all the files to concatenate. A list (not the iglob iterator) is used
    # so that testing for emptiness does not consume the first file, and the
    # sort makes both the row order and the chosen output name deterministic.
    files_to_concatenate = sorted(glob.glob(os.path.join(datasets_path, query_str)))
    if not files_to_concatenate:
        return

    # derive the output name from the first file before doing any heavy work.
    name_match = re.search(r'(.*[wW]ise).*csv', files_to_concatenate[0])
    if name_match is None:
        raise ValueError(
            "cannot derive output name: no 'wise'/'Wise' in {!r}".format(files_to_concatenate[0]))
    to_filepath = get_filepath('{}.csv'.format(name_match.group(1)))

    # prepare dataframes from all files; the first csv column is read as the
    # index and then dropped by ignore_index=True below.
    dataframes = (pd.read_csv(file, index_col=0) for file in files_to_concatenate)

    # concatenate the dataframes
    concatenated_frames = pd.concat(dataframes, ignore_index=True)

    # save the concatenated dataframes to file
    concatenated_frames.to_csv(to_filepath)

In [15]:
# glob patterns '01._Expend*copy.csv' ... '10._Expend*copy.csv', one per query,
# meant to be passed to concatenate_files. Kept as a tuple instead of a
# generator so the cell looping over it can be re-run: an exhausted generator
# would make the loop silently do nothing.
queries_for_concatenation = tuple('{0:0=2d}._Expend*copy.csv'.format(x) for x in range(1, 11))

In [16]:
# for query in queries_for_concatenation:
#     concatenate_files(query)