In [None]:
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from matplotlib import ticker

In [None]:
import wrangler
import utils

In [None]:
# File name of the expenditure export this notebook analyses.
file = '10._Expenditure_-_DDO_Head_of_AccountSOE_and_VoucherBillNO_wise.csv'
# Resolve the bare file name to the path that is read below.
filepath = utils.get_filepath(file)

# Read the CSV and hand the raw frame straight to the project's wrangling step;
# the result is the module-level ``df`` that every plotting function below reads.
# NOTE(review): ``parse_dates=True`` without ``index_col`` only asks pandas to parse
# the index, so date columns are presumably converted inside the wrangler - confirm.
df = pd.read_csv(filepath, parse_dates=True).pipe(
    wrangler.wrangle_data_for_consolidated_query,
    ['DDODESC', 'DISTRICT', 'TREASURY', 'DDO'],
)

In [None]:
def plot_district_ranking_for_category(category):
    '''
    Plot a horizontal bar chart ranking all districts by their total amount in one category.

    The module-level ``df`` is grouped by DISTRICT, the ``category`` column is summed
    per district and the bars are drawn in ascending order of that total,
    for example to rank the districts on their Total GROSS distribution.

    input - category: name of a numeric column of ``df``, e.g. 'GROSS'
    '''
    # Aggregate only the requested column. Summing the whole frame also aggregates
    # unrelated text/date columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    district_wise_category = df.groupby('DISTRICT')[category].sum().dropna()

    # sort it so we can see the ranks.
    sorted_category = district_wise_category.sort_values()

    # plot horizontally because district names are long and aren't readable when plotted vertically.
    ax = sorted_category.plot(kind='barh', linewidth=5, width=0.7, figsize=(10, 7))

    ax.set_title('{} Expenditure: District Wise'.format(category), fontsize=16, fontweight='bold')
    ax.set_ylabel('Districts', fontsize=16, fontweight='bold')

    # Render amounts through a formatter instead of freezing a list of labels:
    # labels are then computed for whatever ticks exist at draw time, so they
    # cannot drift away from the tick positions if the view limits change.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda amount, _pos: utils.make_readable_amount(amount)))

    ax.set_xlabel('Amount (in Rupees)', fontsize=14, fontweight='bold')

    plt.show()

In [None]:
def compare_categories_for_districts(categories):
    '''
    Compare the total amount of several categories side by side for every district,
    for example the distribution of Total GROSS amount and NETPAYMENT for all districts.

    The module-level ``df`` is grouped by DISTRICT and each requested column is summed;
    districts are ordered by the total of the first category.

    input - categories: list of numeric column names of ``df``, e.g. ['GROSS', 'NETPAYMENT'].
            A single column name given as a string is treated as a one-element list.
    '''
    # Accept any sequence of names (or one bare name) and work on a real list,
    # because pandas treats a list and a tuple differently when selecting columns.
    columns = [categories] if isinstance(categories, str) else list(categories)

    # Aggregate only the requested columns. Summing the whole frame also aggregates
    # unrelated text/date columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    district_wise_category = df.groupby('DISTRICT')[columns].sum().dropna()

    # sort by the first category so we can see the ranks.
    sorted_categories = district_wise_category.sort_values(by=columns[0])

    # plot horizontally because district names are long and aren't readable when plotted vertically.
    ax = sorted_categories.plot(kind='barh', linewidth=6, width=0.8, figsize=(10, 8))

    # Name every plotted category in the title: 'A', 'A and B', 'A, B and C', ...
    # (a fixed two-slot format string fails for one name and hides the rest beyond two).
    if len(columns) > 1:
        title_names = '{} and {}'.format(', '.join(columns[:-1]), columns[-1])
    else:
        title_names = columns[0]
    ax.set_title('{}: District Wise'.format(title_names), fontsize=16, fontweight='bold')

    ax.set_ylabel('Districts', fontsize=16, fontweight='bold')

    # Render amounts through a formatter instead of freezing a list of labels:
    # labels are then computed for whatever ticks exist at draw time, so they
    # cannot drift away from the tick positions if the view limits change.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda amount, _pos: utils.make_readable_amount(amount)))

    ax.set_xlabel('Amount (in Rupees)', fontsize=14, fontweight='bold')

    plt.show()

In [None]:
def plot_time_series_for_district_category(district, category):
    '''
    Plot the monthly total of one category for one district as a line chart.
    For example we could see how GROSS amount was distributed from April 2017 to August 2018 for Shimla.

    The module-level ``df`` is grouped by DISTRICT and DATE, the ``category`` column is
    summed, the rows of ``district`` are selected and then resampled to monthly totals.
    A district that is not present in the data raises KeyError from the lookup.

    input - district: value of the DISTRICT column, e.g. 'SHIMLA'
            category: name of a numeric column of ``df``, e.g. 'GROSS'
    '''
    # Aggregate only the requested column. Summing the whole frame also aggregates
    # unrelated text columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    date_wise_category = df.groupby(['DISTRICT', 'DATE'])[category].sum()

    # Keep the input district (leaving a DATE-indexed series) and roll it up per month.
    # NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; 'M' is kept
    # here so the notebook still runs on older pandas versions.
    district_category = date_wise_category.loc[district].resample('M').sum()

    # using x_compat so that we can avoid pandas formatting the datetime
    # and can have it formatted with matplotlib.
    # REF: https://stackoverflow.com/questions/44213781/pandas-dataframe-line-plot-display-date-on-xaxis
    ax = district_category.plot(figsize=(15, 6), x_compat=True)

    ax.set_title('{} Distribution across Months for {}'.format(category, district.title()),
                 fontweight='bold', fontsize=16)
    ax.set_xlabel('Timeline of Expense(Monthly)', fontsize=14, fontweight='bold')

    # set locator for month so that we don't skip months on ticks.
    ax.xaxis.set_major_locator(dates.MonthLocator())
    # set format for ticks, e.g. 'Apr 2017'
    ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))

    # Render y-axis amounts through a formatter instead of freezing a list of labels:
    # labels are then computed for whatever ticks exist at draw time, so they
    # cannot drift away from the tick positions if the view limits change.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda amount, _pos: utils.make_readable_amount(amount)))

    ax.set_ylabel('{} Amount (in Rupees)'.format(category), fontsize=14, fontweight='bold')

    plt.show()

In [None]:
def plot_major_head_wise_expenditure_for_district(district, category):
    '''
    Plot the ten largest major-head totals of one category for one district.

    The module-level ``df`` is grouped by DISTRICT and MAJ, the ``category`` column is
    summed, and the ten largest totals of ``district`` are drawn as horizontal bars on
    a logarithmic amount axis. A district that is not present raises KeyError.

    input - district: value of the DISTRICT column, e.g. 'SHIMLA'
            category: name of a numeric column of ``df``, e.g. 'GROSS'
    '''
    # Aggregate only the requested column. Summing the whole frame also aggregates
    # unrelated text/date columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    major_head_wise_category = df.groupby(['DISTRICT', 'MAJ'])[category].sum().dropna()

    # drop the empty MAJ row.
    # NOTE(review): this removes the FIRST row by position, i.e. it assumes every
    # district has an empty MAJ label and that it sorts first - confirm, otherwise
    # a real major head is dropped.
    required_category = major_head_wise_category.loc[district].iloc[1:]

    # sort it descending and get only top 10.
    sorted_category = required_category.sort_values(ascending=False).iloc[:10]
    # sort ascending because a horizontal bar plot draws the first row at the
    # bottom, so the largest total ends up on top.
    sorted_category = sorted_category.sort_values()

    # plot horizontally because major head names are long and aren't readable when plotted vertically.
    ax = sorted_category.plot(kind='barh', linewidth=6, width=0.7, figsize=(8, 5))

    ax.set_title('{} Expenditure for {}: Major Head Wise'.format(category, district),
                 fontsize=16, fontweight='bold')

    ax.set_ylabel('Major Heads', fontsize=17, fontweight='bold')

    # Totals are shown on a log scale. Changing the scale resets the axis
    # formatter, so the amount formatter must be installed after this call.
    ax.set_xscale('log')

    # Render amounts through a formatter instead of freezing a list of labels:
    # on a log axis the tick set is only final at draw time, and a fixed label
    # list can end up attached to the wrong ticks.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda amount, _pos: utils.make_readable_amount(amount)))

    # format yticks to show major head names instead of codes.
    # (bar positions are fixed, so replacing these labels in place is safe.)
    ax.set_yticklabels(list(map(utils.format_major_head_ticks, ax.get_yticklabels())), wrap=True)

    ax.set_xlabel('Amount (in Rupees)', fontsize=14, fontweight='bold')

    plt.show()

In [None]:
def plot_pn_expenditure(category, district):
    '''
    Plot the percentage split of one category between Plan and Non Plan expenditure
    for one district as a pie chart.

    The module-level ``df`` is grouped by DISTRICT and PN, the ``category`` column is
    summed, and the shares for ``district`` are drawn. A district that is not present
    raises KeyError.

    input - category: name of a numeric column of ``df``, e.g. 'GROSS'
            district: value of the DISTRICT column, e.g. 'SHIMLA'
    '''
    # Aggregate only the requested column. Summing the whole frame also aggregates
    # unrelated text/date columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    plan_wise_category = df.groupby(['DISTRICT', 'PN'])[category].sum().dropna()
    district_plan = plan_wise_category.loc[district]

    # remove the row whose PN label is empty.
    district_plan = district_plan[district_plan.index != '']

    # create the pie plot with percentage on, for the REQUESTED category
    # (the column must match the one named in the title below).
    ax = district_plan.plot.pie(autopct='%.2f', figsize=(6, 6))

    # NOTE(review): the labels are positional - this assumes exactly two PN groups
    # remain and that the Non-Plan code sorts before the Plan code; confirm
    # against the actual PN values.
    ax.legend(labels=['Non-Plan', 'Plan'])

    ax.set_title('{} Expenditure: Plan & Non-Plan for {}'.format(category, district),
                 fontsize=16, fontweight='bold')

    # the series name would otherwise be printed as a y label next to the pie.
    ax.set_ylabel('')

    plt.show()

In [None]:
def plot_amount_distribution():
    '''
    Plot district wise distribution of sanctioned amount.
    The amount is distributed in three categories: AGDED, BTDED and NETPAYMENT,
    so we can analyse how this distribution varies across districts.

    The module-level ``df`` is grouped by DISTRICT, the three columns are summed and
    drawn as one horizontal stacked bar per district.
    '''
    amount_columns = ['AGDED', 'BTDED', 'NETPAYMENT']

    # Aggregate only the plotted columns. Summing the whole frame also aggregates
    # unrelated text/date columns, which is wasted work and, on pandas >= 2.0
    # (non-numeric columns are no longer silently dropped), can raise TypeError.
    district_grouped = df.groupby('DISTRICT')[amount_columns].sum()

    # create the bar plot with the amounts on the x axis and districts on the y axis.
    # it's a stacked bar plotted horizontally, one segment per amount column.
    ax = district_grouped.plot(kind='barh', stacked=True, figsize=(20, 8), linewidth=5)

    ax.set_title('Amount distribution for Districts', fontsize=19, fontweight='bold')

    ax.set_ylabel('Districts', fontsize=18, fontweight='bold')

    # Render x-axis amounts through a formatter instead of freezing a list of labels:
    # labels are then computed for whatever ticks exist at draw time, so they
    # cannot drift away from the tick positions if the view limits change.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda amount, _pos: utils.make_readable_amount(amount)))

    ax.set_xlabel('Amount (in Rupees)', fontsize=18, fontweight='bold')

    plt.show()