In [2]:
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from matplotlib import ticker

In [3]:
import utils

In [4]:
# Resolve the location of the DDO/SOE-wise expenditure CSV via the project helper.
file = utils.get_filepath('01._Expenditure_-_DDO_and_SOE_Wise.csv')

# Read the CSV with START_DATE as a parsed datetime index and pass the raw frame
# straight to the project's wrangling helper ('SOEDESC' is its second argument).
df = utils.wrangle_data(
    pd.read_csv(file, index_col='START_DATE', parse_dates=True),
    'SOEDESC',
)

In [3]:
def plot_district_ranking_for_category(category):
    '''
    Plot a horizontal bar chart of the 'Total' amount of the given category for
    every district, sorted so each district's rank is visible.
    For example, districts can be ranked by their Total GROSS amount.

    input- category:'GROSS'

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Sum every column per (district, SOE) pair and drop rows with missing values.
    district_soe_data = df.groupby(['DISTRICT', 'SOEDESC']).sum().dropna()
    # Keep only the rows whose SOEDESC is 'Total'; the result is indexed by district.
    district_soe_total = district_soe_data.xs('Total', level='SOEDESC')

    # Ascending sort: with barh the largest value ends up at the top of the chart.
    sorted_category = district_soe_total[category].sort_values()

    # Plot horizontally because district names are long and unreadable when
    # plotted vertically.
    ax = sorted_category.plot(kind='barh', linewidth=6, width=0.8, figsize=(10, 8))

    ax.set_title('{}: District Wise'.format(category), fontsize=16, fontweight='bold')
    ax.set_ylabel('Districts', fontsize=16, fontweight='bold')

    ax.set_xscale('log')

    # Format ticks through a formatter rather than set_xticklabels(): fixed label
    # lists are matched to ticks by position, so they go out of sync as soon as
    # the log locator produces a different set of ticks.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))

    ax.set_xlabel('Amount (in Rupees)', fontsize=14, fontweight='bold')

    plt.show()

In [2]:
def compare_categories_for_districts(categories):
    '''
    Compare the 'Total' amounts of several categories for each district as
    grouped horizontal bars.
    For example we can compare the Total GROSS amount and NETPAYMENT for all districts.

    input - categories:['GROSS', 'NETPAYMENT']
            A single category name (plain string) or any iterable of names is
            also accepted.

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Normalise to a list so that one, two or many categories all work, both for
    # column selection and for building the title.
    if isinstance(categories, str):
        categories = [categories]
    else:
        categories = list(categories)

    # Sum every column per (district, SOE) pair and drop rows with missing values.
    district_soe_data = df.groupby(['DISTRICT', 'SOEDESC']).sum().dropna()
    # Keep only the rows whose SOEDESC is 'Total'; the result is indexed by district.
    district_soe_total = district_soe_data.xs('Total', level='SOEDESC')

    # One column per requested category.
    category_frame = district_soe_total[categories]

    # Plot horizontally because district names are long and unreadable when
    # plotted vertically.
    ax = category_frame.plot(kind='barh', linewidth=6, width=0.8, figsize=(10, 8))

    # NOTE(review): the year in the title is hard-coded - confirm it matches the data.
    ax.set_title('{}: District Wise (2017)'.format(' and '.join(categories)),
                 fontsize=16, fontweight='bold')
    ax.set_ylabel('Districts', fontsize=16, fontweight='bold')

    ax.set_xscale('log')

    # Format ticks through a formatter rather than set_xticklabels(): fixed label
    # lists are matched to ticks by position, so they go out of sync as soon as
    # the log locator produces a different set of ticks.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))

    ax.set_xlabel('Amount (in Rupees)', fontsize=14, fontweight='bold')

    plt.show()

In [749]:
def plot_time_series_for_district_category(district, category):
    '''
    Plot a category's amount over time for a single district.
    For example we could see how the GROSS amount was distributed across months
    for Dharamsala.

    input - district:'DHARAMSALA', category:'GROSS'

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Sum every column per (START_DATE, DISTRICT) pair.
    date_district_totals = df.groupby(['START_DATE', 'DISTRICT']).sum()

    # Select the requested district (leaving a START_DATE index) and category.
    district_category = date_district_totals.xs(district, level='DISTRICT')[category]

    # using x_compat so that we can avoid pandas formatting the datetime
    # and can have it formatted with matplotlib.
    # REF: https://stackoverflow.com/questions/44213781/pandas-dataframe-line-plot-display-date-on-xaxis
    ax = district_category.plot(figsize=(15, 6), x_compat=True)

    ax.set_title('{} Distribution across Months for {}'.format(category, district.title()),
                 fontweight='bold', fontsize=16)
    ax.set_xlabel('Timeline of Expense(Monthly)', fontsize=14, fontweight='bold')

    # One major tick per month so that no month is skipped on the axis.
    ax.xaxis.set_major_locator(dates.MonthLocator())
    ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))

    # Format y ticks through a formatter rather than set_yticklabels(): fixed
    # label lists are matched to ticks by position and go stale whenever the
    # tick locations are recomputed.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))

    ax.set_ylabel('{} Amount (in Rupees)'.format(category), fontsize=14, fontweight='bold')

    plt.show()

In [748]:
def plot_soe_wise_category_for_district(district, category):
    '''
    Plot the amount of one category for every SOE of a single district.
    For example we can plot all SOEs' NETPAYMENT for DHARAMSALA

    input- district:'DHARAMSALA', category:'NETPAYMENT'

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Sum every column per (district, SOE) pair and drop rows with missing values.
    soe_grouped_by_districts = df.groupby(['DISTRICT', 'SOEDESC']).sum().dropna()
    # Rows of the requested district (indexed by SOEDESC), restricted to the category.
    required_category = soe_grouped_by_districts.xs(district)[category]

    # Exclude the aggregate 'Total' row so that only individual SOEs are compared.
    without_total = required_category[required_category.index != 'Total']
    # NOTE(review): the first remaining row is dropped by position, as in the
    # original code; presumably it is a non-SOE entry - confirm against the data.
    soe_wise_category_data = without_total.iloc[1:].sort_values()

    ax = soe_wise_category_data.plot(kind='barh', figsize=(15, 6))

    ax.set_title('SOE wise {} Expenditure for {}'.format(category, district.title()),
                 fontsize=16, fontweight='bold')
    ax.set_ylabel('Section Of Expenditure', fontsize=15, fontweight='bold')

    ax.set_xscale('log')

    # Format ticks through a formatter rather than set_xticklabels(): fixed label
    # lists are matched to ticks by position, so they go out of sync as soon as
    # the log locator produces a different set of ticks.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))
    ax.set_xlabel('Amount (in Rupees)', fontsize=15, fontweight='bold')

    plt.show()

In [747]:
def plot_soe_distribution_for_category(category):
    '''
    Plot the amount of one category per SOE, summed over all districts, to see
    which SOE has the most money spent on it.

    input- category:'GROSS'

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Sum every column per SOE and keep only the requested category.
    required_category = df.groupby(['SOEDESC']).sum()[category]

    # Exclude the aggregate 'Total' row so that only individual SOEs are compared.
    without_total = required_category[required_category.index != 'Total']
    # NOTE(review): the first remaining row is dropped by position, as in the
    # original code; presumably it is a non-SOE entry - confirm against the data.
    soe_grouped_category = without_total.iloc[1:].sort_values()

    ax = soe_grouped_category.plot(kind='barh', figsize=(15, 6))

    ax.set_title('SOE wise {} Expenditure for all districts'.format(category),
                 fontsize=16, fontweight='bold')
    ax.set_ylabel('Section Of Expenditure', fontsize=15, fontweight='bold')

    ax.set_xscale('log')

    # Format ticks through a formatter rather than set_xticklabels(): fixed label
    # lists are matched to ticks by position, so they go out of sync as soon as
    # the log locator produces a different set of ticks.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))
    ax.set_xlabel('Amount (in Rupees)', fontsize=15, fontweight='bold')

    plt.show()

In [746]:
def plot_gross_soe_comparison_for_district(soe):
    '''
    Plot all districts' GROSS expenditure for the given SOE.
    For example, we can plot Gross Salaries for all districts.

    input- soe:'01-SALARIES'

    Reads the module-level DataFrame `df` and displays the figure; returns None.
    '''
    # Sum every column per (district, SOE) pair and drop rows with missing values.
    soe_grouped_by_districts = df.groupby(['DISTRICT', 'SOEDESC']).sum().dropna()
    # GROSS amounts of the requested SOE, indexed by district and sorted ascending.
    required_soe = (soe_grouped_by_districts['GROSS']
                    .xs(soe, level='SOEDESC')
                    .sort_values())

    ax = required_soe.plot(kind='barh', figsize=(15, 6))

    # Strip the leading '<code>-' prefix for the title ('01-SALARIES' -> 'Salaries').
    # Fall back to the full string when there is nothing after a '-', so the
    # title never ends up with an empty name.
    _code, _sep, description = soe.partition('-')
    soe_name = (description.replace('-', ' ') if description else soe).title()
    ax.set_title('District Wise {} distribution'.format(soe_name), fontsize=15, fontweight='bold')
    ax.set_ylabel('Districts', fontsize=15, fontweight='bold')

    # Format ticks through a formatter rather than set_xticklabels(): fixed label
    # lists are matched to ticks by position and go stale whenever the tick
    # locations are recomputed.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: utils.make_readable_amount(value)))
    ax.set_xlabel('Amount (in Rupees)', fontsize=15, fontweight='bold')

    plt.show()