In [None]:
import tabula  # provided by the `tabula-py` package (see the installation note below)
import pandas as pd
import itertools  # NOTE(review): not referenced in the cells visible here — confirm before removing
import warnings
# Blanket filter: every warning category is ignored for the rest of the session.
# NOTE(review): this also hides deprecation warnings from libraries; consider narrowing it.
warnings.filterwarnings('ignore')

# Fraction of people living in high-density areas in Mumbai

We compute the fraction of population living in high-density areas in the various Mumbai wards.  Our data source is a [summary](https://portal.mcgm.gov.in/irj/go/km/docs/documents/MCGM%20Department%20List/Public%20Health%20Department/Docs/Census%20FAQ%20%26%20Answer.pdf) of the 2011 Census data.  This is stored in the file `bmc-census-data.pdf`.


To run this file, you need to install the `tabula-py` package using

```
pip install tabula-py
```

Please take care *not* to install the similarly named `tabula` package instead!

To install this package, you need to have Java (version 8 or higher) installed and the `java` command accessible from the shell in which you run `pip`.

In [None]:
# Extract the tables from every page of the census PDF; later cells pick
# individual tables out of `dfs` by position.
# This call generates some font warnings, which are suppressed in the PDF output.
dfs = tabula.read_pdf(
    "bmc-census-data.pdf",
    pages="all",
)

In [None]:
def _numeric_where_possible(column):
    """Convert a column to a numeric dtype, or return it unchanged if any value fails to parse.

    Drop-in replacement for ``pd.to_numeric(column, errors='ignore')``, which
    pandas deprecated in version 2.2. The try/except reproduces the same
    all-or-nothing fall-back: a column is either fully converted or left as is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Ward-level census counts come from extracted table number 28.
# The first four rows and the last row of that table are discarded
# (presumably header/footer rows of the PDF table — TODO confirm against the PDF).
ward_df = (
    dfs[28]
    .iloc[4:-1]
    .reset_index(drop=True)
    .apply(_numeric_where_possible)
)

# Replace the extracted column labels with descriptive names.
ward_df.columns = [
    "ward", "num_households",
    "total", "male", "female",
    "total_0-6", "male_0-6", "female_0-6",
    "total_literate", "male_literate", "female_literate",
    "total_illiterate", "male_illiterate", "female_illiterate",
]

# Overwrite the ward label of row 24 (first column) with "Total".
# NOTE(review): presumably the extracted label for the totals row is garbled — confirm.
ward_df.iloc[24, 0] = "Total"


In [None]:
def _numeric_where_possible(column):
    """Convert a column to a numeric dtype, or return it unchanged if any value fails to parse.

    Drop-in replacement for ``pd.to_numeric(column, errors='ignore')``, which
    pandas deprecated in version 2.2. The try/except reproduces the same
    all-or-nothing fall-back: a column is either fully converted or left as is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# High-density-area counts come from extracted table number 29; its first two
# rows are discarded (presumably header rows — TODO confirm against the PDF).
hd_area_df = (
    dfs[29]
    .iloc[2:]
    .reset_index(drop=True)
    .apply(_numeric_where_possible)
)
hd_area_df.columns = ["ward", "rough_total", "non_hd_area_count"]

# Each "rough_total" cell is a string holding two whitespace-separated integers;
# split it into the ward total and the high-density-area count.
hd_split_counts_df = pd.DataFrame(
    [[int(token) for token in cell.split()] for cell in hd_area_df["rough_total"]],
    columns=["total", "hd_area_count"],
)

hd_area_df = (
    pd.concat([hd_area_df, hd_split_counts_df], axis=1)
    .loc[:, ["ward", "total", "hd_area_count", "non_hd_area_count"]]
    .assign(
        # Fix the ward "H/W" appearing as "HW"
        ward=lambda df: df["ward"].replace("HW", "H/W"),
        # Compute the fraction of each ward's population living in high-density areas
        hd_area_fraction=lambda df: df["hd_area_count"] / df["total"],
    )
)
hd_area_df

In [None]:
#ward_df.to_excel("bmc_census_2011_ward_counts.xlsx")

In [None]:
#hd_area_df.to_excel("bmc_census_2011_hd_area_counts.xlsx")

# Checks

In [None]:
# Consistency check: in every row, the total must equal the high-density count
# plus the non-high-density count. Displays True when no row violates this.
hd_area_df.loc[
    lambda df: df["total"] != df["hd_area_count"] + df["non_hd_area_count"]
].empty

In [None]:
# Consistency check: total population must equal male + female in every row.
# Displays True when no row violates this.
ward_df.loc[
    lambda df: df["total"] != df["male"] + df["female"]
].empty

In [None]:
# Consistency check: the 0-6 total must equal male 0-6 + female 0-6 in every row.
# Displays True when no row violates this.
ward_df.loc[
    lambda df: df["total_0-6"] != df["male_0-6"] + df["female_0-6"]
].empty

In [None]:
# Consistency check: the literate total must equal male + female literates in every row.
# Displays True when no row violates this.
ward_df.loc[
    lambda df: df["total_literate"] != df["male_literate"] + df["female_literate"]
].empty

In [None]:
# Show the rows whose illiterate total differs from male + female illiterates.
# Unlike the checks above, this displays the offending rows themselves.
ward_df.loc[
    lambda df: df["total_illiterate"] != df["male_illiterate"] + df["female_illiterate"]
]

### Note:

The above two rows seem to have an inconsistency even in the original PDF file.

# Some file generation

In [None]:
# Build the per-ward table: remove the rows labelled "City", "Total", "W.S."
# and "E.S.", order the remaining wards by their ID, and number them.
ward_frac_df = (
    hd_area_df[~hd_area_df["ward"].isin(["City", "Total", "W.S.", "E.S."])]
    .sort_values("ward")
    .reset_index(drop=True)
    .assign(
        # wardNo is 1-based, wardIndex is 0-based; both follow the sorted order.
        wardNo=lambda df: df.index + 1,
        wardIndex=lambda df: df.index,
    )
    .rename(
        columns={
            "ward": "bmcWardID",
            "hd_area_fraction": "slumFractionalPopulation",
        }
    )
)

In [None]:
# Display the per-ward table for visual inspection.
ward_frac_df

In [None]:
# Keep only the four columns destined for the output file, in export order.
output_ward_frac_df = ward_frac_df[
    ["wardIndex", "wardNo", "slumFractionalPopulation", "bmcWardID"]
]

In [None]:
# Display the export table for a final visual check.
output_ward_frac_df

In [None]:
# Write the export table to CSV; index=False keeps the DataFrame index out of the file.
output_ward_frac_df.to_csv(
    "../../slumFraction.csv",
    index=False,
)