In [1]:
import os
import json
import csv
import random
import numpy as np
import pandas as pd
import geopandas as gpd
import tabula
import shapely.geometry 
import matplotlib.pyplot as  plt

if not os.path.exists('presampled_points'):
    os.mkdir('presampled_points')

This notebook prepares the data that constitutes the base files to instantiate NCT of Delhi. The data sources processed in this notebook can be found on the [`InstantiationAssumption.xlsx`](https://www.dropbox.com/preview/Epidemiology2/data/InstantiationAssumptions.xlsx)

## 1. `cityProfile.json`

The `cityProfile.json` is put together by combining the age, household-size and school-size distributions in the expected form for the `parse_and_instantiate` to consume. 

In [2]:
## Age Distribution
ageBins = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']
ageDistribution = [0.08317, 0.09010, 0.09604, 0.10000, 0.10099, 0.09604, 0.09604, 0.07624, 0.06535, 0.05347, 0.04158, 0.03168, 0.02574, 0.01782, 0.01287, 0.00792, 0.00495]

## Household-size Distribution
hhBins = ['1', '2', '3', '4', '5', '6-8', '9+']
hhDistribution = [0.03685, 0.07555, 0.12824, 0.24040, 0.20390, 0.25558, 0.05948]

## School-size Distribution, from Ferguson's paper
schoolBins = ['0-100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900+']
schoolWeights = [0.0185, 0.1204, 0.2315, 0.2315, 0.1574, 0.0889, 0.0630, 0.0481, 0.0278, 0.0130]
    
cityProfileDict = {}
cityProfileDict["city"] = "Delhi"
cityProfileDict["age"] = {
        "bins": ageBins, 
        "weights": ageDistribution
    }
cityProfileDict["householdSize"] = {
        "bins": hhBins, 
        "weights": hhDistribution
    }
cityProfileDict["schoolsSize"] = {
        "bins": schoolBins,
        "weights": schoolWeights
    }
cityProfileDict["maxWorkplaceDistance"] = 60 #to update this later



## 2. individual `csv` files

The indiviudal `csv` files required for instantiation are created from a master ward-wise excel sheet which aggregates the raw_data collected from the respective sources.

We have made the ward numbers continiguous for wards under DMC, NDMC and Delhi Cantt. 

In [3]:
delhi_all_wards_master = pd.ExcelFile("raw/ward_wise_data.xlsx")
delhi_demographics = pd.read_excel(delhi_all_wards_master, "demographics")[:289]
delhi_households = pd.read_excel(delhi_all_wards_master, "households")[:289]
delhi_employment = pd.read_excel(delhi_all_wards_master, "employment")[:289]
delhi_households = delhi_households.rename(columns={'households': 'Households'})
delhi_employment = delhi_employment.rename(columns={'Working_Pop': 'Employed', 'Non_working_pop': 'Unemployed'})

delhi_demographics = delhi_demographics.astype({'wardIndex': int, 'wardNo': int})
delhi_households = delhi_households.astype({'wardIndex': int, 'wardNo': int})
delhi_employment = delhi_employment.astype({'wardIndex': int, 'wardNo': int})


delhi_demographics = delhi_demographics.sort_values('wardNo')
delhi_demographics = delhi_demographics.drop(columns=['area(sq km)', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'])
delhi_demographics

Unnamed: 0,wardIndex,wardNo,wardName,totalPopulation
0,0,1,DMC (U) (Part) WARD NO.-0001,55512
1,1,2,DMC (U) (Part) WARD NO.-0002,61344
2,2,3,Ali Pur (CT) WARD NO.-0003,38361
3,3,4,Tikri Khurd (CT) WARD NO.-0004,31827
4,4,5,Qadi Pur (CT) WARD NO.-0005,55771
...,...,...,...,...
284,284,285,NDMC (Part) WARD NO.-0005,23721
285,285,286,NDMC (Part) WARD NO.-0006,18793
286,286,287,NDMC (Part) WARD NO.-0007,36518
287,287,288,NDMC (Part) WARD NO.-0008,26715


## 3. `city.geojson`
We shall start with processing the geojson file, we found that the geojson properties has an extra row, as well as the `wardNames` in the demographics and geojson do not match.

In [4]:
# read the geojson
geoDF = gpd.read_file("raw/Delhi_Wards.geojson")
geoDF.columns = ['wardName', 'wardNo', 'geometry']
geoDF

Unnamed: 0,wardName,wardNo,geometry
0,DELHI CANTT CHARGE 1,CANT_1,"POLYGON ((77.13228 28.63154, 77.13644 28.62062..."
1,DELHI CANTT CHARGE 2,CANT_2,"POLYGON ((77.15429 28.62335, 77.15501 28.62228..."
2,DELHI CANTT CHARGE 4,CANT_4,"POLYGON ((77.15755 28.57578, 77.15672 28.57564..."
3,DELHI CANTT CHARGE 5,CANT_5,"POLYGON ((77.13480 28.57051, 77.13429 28.57048..."
4,DELHI CANTT CHARGE 6,CANT_6,"POLYGON ((77.12157 28.59308, 77.12878 28.59029..."
...,...,...,...
285,PREM NAGAR,33,"POLYGON ((77.06644 28.70141, 77.06574 28.70063..."
286,SHASTRI NAGAR,73,"POLYGON ((77.19404 28.67288, 77.19196 28.66855..."
287,VISHNU GARDEN,107,"POLYGON ((77.09982 28.64496, 77.09947 28.64504..."
288,TUKHMIR PUR,270,"POLYGON ((77.25834 28.71855, 77.25858 28.71785..."


 However, the following additional processing needs to be done:

> make wardNames in all the `csv` files consistent with `city.geojson`

For this we create a separate dataframe and dump it as a csv, mapping process is done off-line from the script

In [5]:
delhi_ward_name_check = pd.DataFrame()
delhi_ward_name_check['map_ward_name'] = geoDF['wardName']
delhi_ward_name_check['map_ward_no'] = geoDF['wardNo']
delhi_ward_name_check['census_ward_name'] = delhi_demographics['wardName']
delhi_ward_name_check['census_ward_no'] = delhi_demographics['wardNo']
# delhi_ward_name_check.to_csv("raw/delhi_ward_name_check.csv", index=False) #Done only at first
delhi_ward_name_check

Unnamed: 0,map_ward_name,map_ward_no,census_ward_name,census_ward_no
0,DELHI CANTT CHARGE 1,CANT_1,DMC (U) (Part) WARD NO.-0001,1.0
1,DELHI CANTT CHARGE 2,CANT_2,DMC (U) (Part) WARD NO.-0002,2.0
2,DELHI CANTT CHARGE 4,CANT_4,Ali Pur (CT) WARD NO.-0003,3.0
3,DELHI CANTT CHARGE 5,CANT_5,Tikri Khurd (CT) WARD NO.-0004,4.0
4,DELHI CANTT CHARGE 6,CANT_6,Qadi Pur (CT) WARD NO.-0005,5.0
...,...,...,...,...
285,PREM NAGAR,33,NDMC (Part) WARD NO.-0006,286.0
286,SHASTRI NAGAR,73,NDMC (Part) WARD NO.-0007,287.0
287,VISHNU GARDEN,107,NDMC (Part) WARD NO.-0008,288.0
288,TUKHMIR PUR,270,NDMC (Part) WARD NO.-0009,289.0


> Update the wardName and numbering schemes for both the map as well as census data frames

In the offline processing, we initially planned to sort the locations by the ward ID, but we decided to keep the ward number and ward names in the map data as is, and update those value based on the census ward number and the closer ward name. 

In the next cell, we shall update all the dataframes:
- it is a one-to-one map to the census data files, for reference, you may want to first check if the ward numbers match with `updated_names` dataframe.
- it requies us to `map_ward_no` to the appropriate `census_ward_name` and `census_ward_no`
- create a column called `hd_flag` which takes as $0$ for normal wards and $1$ for slums

In [6]:
updated_names = pd.read_csv('raw/delhi_ward_name_check.csv')

delhi_demographics['wardName'] = updated_names['corrected_names']
delhi_households['wardName'] = updated_names['corrected_names']
delhi_employment['wardName'] = updated_names['corrected_names']

delhi_demographics['hd_flag'] = 0
delhi_households['hd_flag']  = 0 
delhi_employment['hd_flag'] = 0

delhi_demographics

Unnamed: 0,wardIndex,wardNo,wardName,totalPopulation,hd_flag
0,0,1,NARELA,55512,0
1,1,2,BANKNER,61344,0
2,2,3,ALIPUR,38361,0
3,3,4,TIKRI KHURD,31827,0
4,4,5,QADI PUR,55771,0
...,...,...,...,...,...
284,284,285,NDMC CHARGE 5,23721,0
285,285,286,NDMC CHARGE 6,18793,0
286,286,287,NDMC CHARGE 7,36518,0
287,287,288,NDMC CHARGE 8,26715,0


In [7]:
updated_geoDF = pd.merge(geoDF, updated_names[['map_ward_no', 'census_ward_no', 'corrected_names']], left_on="wardNo", right_on="map_ward_no")
updated_geoDF = updated_geoDF.drop(columns=['wardName', 'wardNo', 'map_ward_no'])
updated_geoDF = updated_geoDF.dropna()
updated_geoDF.columns = ['geometry', 'wardNo', 'wardName']
updated_geoDF['wardIndex'] = updated_geoDF['wardNo'] - 1
updated_geoDF = updated_geoDF.sort_values(['wardNo'])

updated_geoDF = updated_geoDF.astype({'wardNo': int, 'wardIndex': int})

updated_geoDF

Unnamed: 0,geometry,wardNo,wardName,wardIndex
0,"POLYGON ((77.13228 28.63154, 77.13644 28.62062...",1,NARELA,0
1,"POLYGON ((77.15429 28.62335, 77.15501 28.62228...",2,BANKNER,1
2,"POLYGON ((77.15755 28.57578, 77.15672 28.57564...",3,ALIPUR,2
3,"POLYGON ((77.13480 28.57051, 77.13429 28.57048...",4,TIKRI KHURD,3
4,"POLYGON ((77.12157 28.59308, 77.12878 28.59029...",5,QADI PUR,4
...,...,...,...,...
284,"POLYGON ((77.05920 28.68636, 77.03956 28.68805...",285,NDMC CHARGE 5,284
285,"POLYGON ((77.06644 28.70141, 77.06574 28.70063...",286,NDMC CHARGE 6,285
286,"POLYGON ((77.19404 28.67288, 77.19196 28.66855...",287,NDMC CHARGE 7,286
287,"POLYGON ((77.09982 28.64496, 77.09947 28.64504...",288,NDMC CHARGE 8,287


> Compute the area for each ward in `delhi_demographics` dataframe using shapely

The geojson has the crs, `epsg:4326` where the units for latitude and longitude is in degrees, and needs to be converted to a different unit to compute area in the following cell. You can read more about this [here](https://gis.stackexchange.com/questions/218450/getting-polygon-areas-using-geopandas).

In [8]:
# compute the area, but dump into a sepearate copy of the dataframe
delhi_demographics['area(sq m)'] = updated_geoDF['geometry'].to_crs({'init': 'epsg:3857'}).map(lambda p: p.area / 10**3)
delhi_demographics

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,wardIndex,wardNo,wardName,totalPopulation,hd_flag,area(sq m)
0,0,1,NARELA,55512,0,1659.836421
1,1,2,BANKNER,61344,0,11405.082638
2,2,3,ALIPUR,38361,0,10902.894929
3,3,4,TIKRI KHURD,31827,0,4545.305116
4,4,5,QADI PUR,55771,0,19836.436547
...,...,...,...,...,...,...
284,284,285,NDMC CHARGE 5,23721,0,1442.966057
285,285,286,NDMC CHARGE 6,18793,0,1028.992997
286,286,287,NDMC CHARGE 7,36518,0,1875.451977
287,287,288,NDMC CHARGE 8,26715,0,629.163957


## 4. Slums

The below code snippet, uses Tabula to read the PDF document on the different slum locations and the respective number of households from a local copy of [this file](http://delhishelterboard.in/main/wp-content/uploads/2015/12/675_JJ_Cluster_List.pdf). 

The output is a list of pandas dataframes which are concatenated and dumped into a csv file called `processed_delhi_slum_master.csv`. This csv file was cleaned to make the addresses more consistent and also manually sanitize the stray entries in the file. 

The cell below, needs to be done only once, but if you are running it, ensure you manually check the csv output atleast once, if you are running the data preparation process from scratch.

In [9]:
# delhi_slums = tabula.read_pdf("raw/675_JJ_Cluster_List.pdf", pages="all")
# delhi_slums[0].columns = delhi_slums[0].iloc[0]
# delhi_slums[0] = delhi_slums[0].drop(delhi_slums[0].index[0])
# delhi_slums = pd.concat(delhi_slums,ignore_index=True)

# delhi_slums.columns=['S.No.', 'Code', 'AC No.', 'Div', 'address', 'households', 'Owner', 'area(sq m)', 'constituency', 'wardNo', 'district']

# delhi_slums[['AC No.', 'address', 'households', 'area(sq km)', 'wardNo', 'district']].to_csv('raw/processed_delhi_slum_master.csv', index=False)

### Computing Slum population and location

For the remainder processing we will be using the `processed_delhi_slum_master.csv` file. The ward numbers for wards under NDMC and in Delhi Cantt. areas are not available. We *assume* the ward numbers for slums in these areas are assigned based on the ward population and then by the number of slum households present in the ward. 

> Computing slum population

Average household size for the slums is computed based on the total population and total number of households for NCT of Delhi as given in the Census 2011 data on slums state-wise where the total population in slums for the NCT of Delhi was $1785390$ and the total number of households in slums was $367893$, which gives the average household size in slums to be  $4.85$.

Note: The total population in slums computed from the data tallies to $1485943$ and the total number of households is $306380$.

In [10]:
slums = pd.read_csv('raw/processed_delhi_slum_master.csv')

slums_by_wards = slums[['wardNo', 'households', 'district', 'area(sq km)']].groupby(['wardNo', 'district']).agg({'households': sum, 'area(sq km)': sum}).reset_index()
slums_by_wards = slums_by_wards.sort_values(['wardNo'])
slum_population = pd.merge(slums_by_wards, delhi_demographics[['wardNo', 'totalPopulation', 'area(sq m)']], on='wardNo')
slum_population = pd.merge(slum_population, delhi_households[['wardNo', 'Households']], on='wardNo')

slum_population.columns = ['wardNo', 'district', 'slum_households', 'slum_area(sq m)', 'totalPopulation', 'ward_area(sq m)', 'totalHouseholds']
slum_population['not_slum_houses'] = slum_population['totalHouseholds'] - slum_population['slum_households']
slum_population['slum_population'] = (slum_population['slum_households']* 4.85).astype(int)
slum_population

Unnamed: 0,wardNo,district,slum_households,slum_area(sq m),totalPopulation,ward_area(sq m),totalHouseholds,not_slum_houses,slum_population
0,5,North,296,7438.0,55771,19836.436547,10640,10344,1435
1,9,North,158,13665.0,60531,5133.456481,12556,12398,766
2,10,North,1796,125777.5,70476,2396.248466,14426,12630,8710
3,11,North,2156,53438.0,52815,11695.566085,11728,9572,10456
4,12,North,597,14801.0,41957,5697.704390,8968,8371,2895
...,...,...,...,...,...,...,...,...,...
165,277,South West,371,5000.0,10028,986.443561,1834,1463,1799
166,280,South West,77,4100.0,7828,2266.260000,1889,1812,373
167,287,New Delhi,747,12515.0,36518,1875.451977,7979,7232,3622
168,288,New Delhi,204,576.0,26715,629.163957,6584,6380,989


#### make queries to get location of addresses, a greedy way by grouping similar places/ area

To make it easy to query geo-locations for the different slum locations in the dataset, I thought we could use the `geopy` package which has the `Nominatim` class to query from Open-Street Maps. This approach was unsuccessful since the search required the addresses to be searchable, due to the inconsistencies in recording addresses. 

Anyways, the code snippet to query the map service to get location is in the below cell:

In [11]:
# from geopy.geocoders import Nominatim
# import time 

# location_dict = {}

# def get_slum_locations(row):
#     print(row['address'])
#     geolocator = Nominatim(user_agent="Delhi Slum location")
#     location = geolocator.geocode(str(row['address']))
#     if location != None:
#         location_dict[row['address']] = {'address': location, 'lat': location.latitude, 'lon' : location.longitude}
#         print("\tfound at: ", location.address)
#     time.sleep(3)

# slums.apply(get_slum_locations, axis=1)

For the second approach, we first started using Google Earth to draw polygons for each of the areas on the dataset. This was cumbersome and was thus replaced by the following code snippet, which yields a polygon within the ward. 

The code snippet implements a navie algorithm which first creates a random polygon within the ward, and samples points from the polygon to become locations for the slums. 

To create a slum cluster, we first create a square. The computed side for the squuare is based on the area of the slum with an additional discounting factor of 10 to account for irregular polygon geometries.

We then sample locations based on the number of slum households, the implementation of the sampling algorithm is based on [this link](https://gis.stackexchange.com/a/356502).

In [12]:
slum_location = slum_population.copy()
slum_location = pd.merge(slum_location, updated_geoDF, on='wardNo')

def sampleRandomLatLong(row):
    #IMPORTANT: geoDF uses (lon, lat) order
    (lon1,lat1,lon2,lat2) = row['geometry'].bounds
    while True:
        lat = random.uniform(lat1,lat2)
        lon = random.uniform(lon1,lon2)
        point = shapely.geometry.Point(lon,lat)
        if row['geometry'].contains(point) and row['geometry'].centroid != point: #point is not ward centre
            return (lat,lon)

def make_square_cluster(row):
    side = np.sqrt(row['slum_area(sq m)']).astype(int) #this is in metres
    side *= 0.15 #an additional 15/100 times reduction to the side to account for the irregular polygon  
    side = side * 0.0001 #converting to km 
    side_lat = side * 1.11 #(1 degree of latitude is 1.11 km for Delhi, measured through an online tool)
    side_lon = side * 0.98 #(1 degree of longitude is 0.98 km for Delhi, measured through an online tool)
    lt, ln = row['slum_centroid']
    
    return shapely.geometry.Polygon([(ln, lt), ((ln + side_lon), lt), ((ln + side_lon), (lt - side_lat)), (ln, (lt - side_lat))])

def sample_slum_house_locations(row, overestimate=1):
    polygon = row['slum_geometry']
    ward_index = row['wardNo'] - 1
    size = row['slum_households']
    print(f"Generating points for ward {row['wardNo']} ......",end='')
    min_x, min_y, max_x, max_y = polygon.bounds
    ratio = polygon.area / polygon.envelope.area
    samples = np.random.uniform((min_x, min_y), (max_x, max_y), (int(size / ratio * overestimate), 2))
    multipoint = shapely.geometry.MultiPoint(samples)
    multipoint = multipoint.intersection(polygon)
    samples = np.array(multipoint)
    while samples.shape[0] < size:
        # emergency catch in case by bad luck we didn't get enough within the polygon
        samples = np.concatenate([samples, random_points_in_polygon(polygon, size, overestimate=overestimate)])
    
    samples = samples[np.random.choice(len(samples), size)] #IMPORTANT: location in (lon, lat) format
    
    #Write the samples into a csv
    fname = f"{ward_index}.csv"
    with open("presampled_points/"+fname, mode='w+') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        
        for point in samples:
            writer.writerow([str(point[1]),str(point[0])])

    print("done")

slum_location['slum_centroid'] = slum_location.apply(sampleRandomLatLong, axis = 1)      
slum_location['slum_geometry'] = slum_location.apply(make_square_cluster, axis = 1)  
slum_location.apply(sample_slum_house_locations, axis = 1)

Generating points for ward 5 ......done
Generating points for ward 9 ......done
Generating points for ward 10 ......done
Generating points for ward 11 ......done
Generating points for ward 12 ......done
Generating points for ward 14 ......done
Generating points for ward 15 ......done
Generating points for ward 16 ......done
Generating points for ward 17 ......done
Generating points for ward 19 ......done
Generating points for ward 20 ......done
Generating points for ward 25 ......done
Generating points for ward 26 ......done
Generating points for ward 32 ......done
Generating points for ward 34 ......done
Generating points for ward 35 ......done
Generating points for ward 37 ......done
Generating points for ward 38 ......done
Generating points for ward 39 ......done
Generating points for ward 40 ......done
Generating points for ward 42 ......done
Generating points for ward 43 ......done
Generating points for ward 45 ......done
Generating points for ward 46 ......done
Generating points 

0      None
1      None
2      None
3      None
4      None
       ... 
165    None
166    None
167    None
168    None
169    None
Length: 170, dtype: object

## 5. Creating the final base files after counting slums as wards

- demographics, households: non-slum and slum will be separated
- employment: need to think of a strategy

In [13]:
count = 0
wardIndexList = []
wardNoList = []
wardNameList = []
houses = []
population = []
hd_flag = []
area = []
location = []
employed = []
unemployed = []


def population_reassignment(row, delhi_households, delhi_employment, slum_population, slum_location):
    global count
    wardNo = row['wardNo']
    max_wardNo = delhi_demographics.wardNo.values.max()
    

    if (slum_population[slum_population['wardNo'] == wardNo].shape[0] > 0):
        
        slumPopulation = slum_population.loc[slum_population['wardNo'] == wardNo, 'slum_population'].values[0]
        slumArea = slum_population.loc[slum_population['wardNo'] == wardNo, 'slum_area(sq m)'].values[0]
        slumHouseholds = slum_population.loc[slum_population['wardNo'] == wardNo, 'slum_households'].values[0]
        slum_geometry = slum_location.loc[slum_location['wardNo'] == wardNo, 'slum_geometry'].values[0]
        
        delhi_demographics.at[row['wardIndex'], 'totalPopulation'] = row['totalPopulation'] - slumPopulation
        delhi_demographics.at[row['wardIndex'], 'area(sq m)'] = row['area(sq m)'] - slumArea
        delhi_households.at[row['wardIndex'], 'Households'] -= slumHouseholds
        employed.append(delhi_employment.at[row['wardIndex'], 'Employed'] / 2)
        unemployed.append(delhi_employment.at[row['wardIndex'], 'Unemployed'] / 2)
        delhi_employment.at[row['wardIndex'], 'Employed'] = delhi_employment.at[row['wardIndex'], 'Employed'] / 2
        delhi_employment.at[row['wardIndex'], 'Unemployed'] = delhi_employment.at[row['wardIndex'], 'Unemployed'] / 2
        
        wardIndexList.append(max_wardNo + count)
        wardNoList.append(max_wardNo + count + 1)
        wardNameList.append(row["wardName"] + '_HD Area')
        houses.append(slumHouseholds)
        population.append(slumPopulation)
        hd_flag.append(1)
        area.append(slumArea)
        location.append(slum_geometry)
        count += 1

slum_demographics = {}
slum_demographics["wardIndex"] = wardIndexList
slum_demographics["wardNo"] = wardNoList
slum_demographics["wardName"] = wardNameList
slum_demographics["totalPopulation"] = population
slum_demographics["hd_flag"] = hd_flag
slum_demographics["area(sq m)"] = area
slum_households = {}
slum_households["wardIndex"] = wardIndexList
slum_households["wardNo"] = wardNoList
slum_households["wardName"] = wardNameList
slum_households["Households"] = houses
slum_households["hd_flag"] = hd_flag
slum_geoData = {}
slum_geoData["geometry"] = location
slum_geoData["wardNo"] = wardNoList
slum_geoData["wardName"] = wardNameList
slum_geoData["wardIndex"] = wardIndexList
slum_employment = {}
slum_employment["wardIndex"] = wardIndexList
slum_employment["wardNo"] = wardNoList
slum_employment["wardName"] = wardNameList
slum_employment["Employed"] = employed
slum_employment["Unemployed"] = unemployed
slum_employment["hd_flag"] = hd_flag

delhi_demographics.apply(population_reassignment, args=(delhi_households, delhi_employment, slum_population, slum_location,), axis=1)    

demographics = pd.concat([delhi_demographics, pd.DataFrame.from_dict(slum_demographics)], ignore_index=True)
demographics['area(sq m)'] = demographics['area(sq m)'].astype(int)
households = pd.concat([delhi_households, pd.DataFrame.from_dict(slum_households)], ignore_index=True)
#delhi_geoDF = pd.concat([updated_geoDF, pd.DataFrame.from_dict(slum_geoData)], ignore_index=True)
employment = pd.concat([delhi_employment, pd.DataFrame.from_dict(slum_employment)], ignore_index=True)

In [14]:
demographics

Unnamed: 0,wardIndex,wardNo,wardName,totalPopulation,hd_flag,area(sq m)
0,0,1,NARELA,55512,0,1659
1,1,2,BANKNER,61344,0,11405
2,2,3,ALIPUR,38361,0,10902
3,3,4,TIKRI KHURD,31827,0,4545
4,4,5,QADI PUR,54336,0,12398
...,...,...,...,...,...,...
454,454,455,DELHI CANTT CHARGE 6_HD Area,1799,1,5000
455,455,456,DELHI CANTT CHARGE 3_HD Area,373,1,4100
456,456,457,NDMC CHARGE 7_HD Area,3622,1,12515
457,457,458,NDMC CHARGE 8_HD Area,989,1,576


In [15]:
households

Unnamed: 0,wardIndex,wardNo,wardName,Households,hd_flag
0,0,1,NARELA,10713,0
1,1,2,BANKNER,11711,0
2,2,3,ALIPUR,7276,0
3,3,4,TIKRI KHURD,6280,0
4,4,5,QADI PUR,10344,0
...,...,...,...,...,...
454,454,455,DELHI CANTT CHARGE 6_HD Area,371,1
455,455,456,DELHI CANTT CHARGE 3_HD Area,77,1
456,456,457,NDMC CHARGE 7_HD Area,747,1
457,457,458,NDMC CHARGE 8_HD Area,204,1


In [16]:
employment

Unnamed: 0,wardIndex,wardNo,wardName,Employed,Unemployed,hd_flag
0,0,1,NARELA,16545.0,38967.0,0
1,1,2,BANKNER,18498.0,42846.0,0
2,2,3,ALIPUR,11249.0,27112.0,0
3,3,4,TIKRI KHURD,10185.0,21642.0,0
4,4,5,QADI PUR,8259.0,19626.0,0
...,...,...,...,...,...,...
454,454,455,DELHI CANTT CHARGE 6_HD Area,1568.0,3446.0,1
455,455,456,DELHI CANTT CHARGE 3_HD Area,1766.5,2147.5,1
456,456,457,NDMC CHARGE 7_HD Area,7790.0,11473.0,1
457,457,458,NDMC CHARGE 8_HD Area,5790.0,7567.5,1


### Saving the base file with slums as wards

In [18]:
with open('cityProfile.json', 'w', encoding='utf-8') as f:
    json.dump(cityProfileDict, f, ensure_ascii=False, indent=2)
updated_geoDF.to_file("city.geojson", driver="GeoJSON")
demographics.to_csv("demographics.csv", index=False)
households.to_csv("households.csv", index=False)
employment.to_csv("employment.csv", index=False)

## 6. OD Matrix

For the initial run, we will use the uniform OD matrix.