In [1]:
import json
import pandas as pd
import pprint
import plotly.express as px
import numpy as np

In [2]:
import plotly.io as pio
# Make "browser" the default plotly renderer, so fig.show() opens each figure
# in the system web browser rather than drawing it inline in the notebook.
pio.renderers.default = "browser"

In [3]:
# Scrape every HTML table found on the page below; pd.read_html returns
# them as a list of DataFrames, one per table.
CENSUS_PAGE_URL = "https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population"
dfs = pd.read_html(CENSUS_PAGE_URL)

In [4]:
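# Preview the first table scraped from the page
#dfs[0]

# One-time step: save that table as "india_census.csv", the file that a later
# cell reads back with pd.read_csv. Uncomment and run once if the CSV is missing.
#dfs[0].to_csv("india_census.csv")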

In [5]:
# Load the GeoJSON description of the Indian states from a local file.
# The context manager closes the file handle as soon as parsing is done
# (a bare open() would leave it open until garbage collection), and the
# encoding is pinned because the GeoJSON specification mandates UTF-8,
# whereas open() otherwise falls back to the platform's locale encoding.
with open("states_india.geojson", "r", encoding="utf-8") as geojson_file:
    india_states = json.load(geojson_file)

In [6]:
# Inspect the keys of the first entry in 'features': there is no top-level 'id' key,
# so the next cell adds one to every feature.
# india_states['features'][0].keys()

In [7]:
# Lookup table: state name (as spelled in the GeoJSON) -> id used to join
# the dataframe rows to the GeoJSON features.
state_id_map = {}

# Give every GeoJSON feature a top-level 'id' (copied from its 'state_code'
# property) and record the same id under the feature's state name.
for geo_feature in india_states['features']:
    properties = geo_feature['properties']
    geo_feature['id'] = properties['state_code']
    state_id_map[properties['st_nm']] = geo_feature['id']

# Names that appear in the dataframe but were not picked up from the GeoJSON:
# 'Ladakh' (given id 36) and 'India', the grand-total row (given id 37).
state_id_map.update({'Ladakh': 36, 'India': 37})

In [8]:
# Load the census table from the CSV file
df = pd.read_csv("india_census.csv")

# PREPROCESSING DATAFRAME part 1 #
# One record of the original table (row label 32) has to be separated into two.
# Each list below supplies one value per CSV column, in column order; columns
# for which no separate figure is provided are left as empty strings.
# The last entry is 1000 * (sum of two counts) / (sum of two other counts),
# rounded -- presumably a sex ratio combined over two sub-populations;
# TODO confirm against the source of these figures.
Dadra = ["", "", "Dadra and Nagar Haveli", 342853, "", "", "", "", "", "", 491, "698.27/km2", round((149949+98824)*1000/(193760+121666))]
Daman = ["", "", "Daman and Diu", 242911,  "", "", "", "", "", "", 112, "2,952.16/km2", round((92946+65692)*1000/(150301+92512))]

# Stack the two replacement rows under the original table into a new df3.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for this call it yields the same frame.
df2 = pd.DataFrame([Dadra, Daman], columns=list(df.columns), index=['32','32.1'])
df3 = pd.concat([df, df2])
df3 = df3.drop([32]) # Remove the original combined record
df3

Unnamed: 0.1,Unnamed: 0,Rank,State or union territory,Population,National Share (%),Decadal growth(2001–2012),Rural population,Percent rural,Urban population,Percent urban,Area[14],Density[a],Sex ratio
0.0,0.0,1 (S1),Uttar Pradesh,199812341,,20.2%,155317278.0,,44495063.0,,"240,928 km2 (93,023 sq mi)","828/km2 (2,140/sq mi)",912
1.0,1.0,2 (S2),Maharashtra,112374333,,20.0%,61556074.0,,50818259.0,,"307,713 km2 (118,809 sq mi)",365/km2 (950/sq mi),929
2.0,2.0,3 (S3),Bihar,104099452,,25.4%,92341436.0,,11758016.0,,"94,163 km2 (36,357 sq mi)","1,102/km2 (2,850/sq mi)",918
3.0,3.0,4 (S4),West Bengal,91276115,,13.8%,62183113.0,,29093002.0,,"88,752 km2 (34,267 sq mi)","1,029/km2 (2,670/sq mi)",953
4.0,4.0,5 (S5),Madhya Pradesh,72626809,,16.3%,52557404.0,,20069405.0,,"308,245 km2 (119,014 sq mi)",236/km2 (610/sq mi),931
5.0,5.0,6 (S6),Tamil Nadu,72147030,,15.6%,37229590.0,,34917440.0,,"130,051 km2 (50,213 sq mi)","555/km2 (1,440/sq mi)",996
6.0,6.0,7 (S7),Rajasthan,68548437,,21.3%,51500352.0,,17048085.0,,"342,239 km2 (132,139 sq mi)",201/km2 (520/sq mi),928
7.0,7.0,8 (S8),Karnataka,61095297,,15.6%,30069335.0,,31025962.0,,"191,791 km2 (74,051 sq mi)",319/km2 (830/sq mi),979
8.0,8.0,9 (S9),Gujarat,60439692,,19.3%,34694609.0,,25745083.0,,"196,024 km2 (75,685 sq mi)",308/km2 (800/sq mi),919
9.0,9.0,10 (S10),Andhra Pradesh,"49,577,103[b]",,11.0%,34966693.0,,14610410.0,,"162,968 km2 (62,922 sq mi)",303/km2 (780/sq mi),993


In [9]:
# PREPROCESSING DATAFRAME part 2 #

def _density_as_float(raw_density):
    """Return the number before the first '/' of a density string, as a float.

    Thousands separators (',') are stripped before conversion.
    """
    leading_part = raw_density.partition("/")[0]
    return float(leading_part.replace(",", ""))


# Substring replacements applied, in this order, to each state name so the
# dataframe spelling lines up with the keys of state_id_map.
_STATE_NAME_FIXES = (
    (' and ', ' & '),
    ('Manipur[d]', 'Manipur'),
    ('Nicobar Islands', 'Nicobar Island'),
    ('Dadara & Nagar Havelli', 'Dadra & Nagar Haveli'),
)


def _normalised_state_name(state_name):
    """Apply every replacement in _STATE_NAME_FIXES to one state name."""
    for old_text, new_text in _STATE_NAME_FIXES:
        state_name = state_name.replace(old_text, new_text)
    return state_name


# Numeric density column parsed from the text column 'Density[a]'
df3['Density'] = df3['Density[a]'].apply(_density_as_float)

# Cleaned-up copy of the state names, kept alongside the original column
df3['State or union territory v2'] = df3['State or union territory'].apply(_normalised_state_name)


# PREPROCESSING DICTIONARY part 2 #
# Re-key misspelled entries of state_id_map under the corrected spelling.
# Entries that are absent are skipped, so re-running the cell is harmless.
_KEY_CORRECTIONS = (
    ('Arunanchal Pradesh', 'Arunachal Pradesh'),
    ('Dadara & Nagar Havelli', 'Dadra & Nagar Haveli'),
)
for misspelled_key, corrected_key in _KEY_CORRECTIONS:
    if misspelled_key in state_id_map:
        state_id_map[corrected_key] = state_id_map.pop(misspelled_key)


def _state_id_for(state_name):
    """Look up the id for a state name; raises KeyError for unknown names."""
    return state_id_map[state_name]


# Attach the join key used as 'locations' by the choropleth figures
df3['id'] = df3['State or union territory v2'].apply(_state_id_for)

In [10]:
### Testing, no actual relevant steps ###
# Debug aid, disabled by wrapping it in a string literal. When run as code it
# prints how many names df3 holds and how many keys state_id_map holds, then
# lists the sorted dictionary keys next to the sorted dataframe names so that
# spelling mismatches between the two can be spotted by eye.
# Because the string is the cell's only expression, the notebook echoes it
# back as the cell output.
"""
print(len(df3['State or union territory v2']))
print(len(state_id_map.keys()))
sortedListDF = sorted(list(df3['State or union territory v2']))
sortedDict = {k:v for k, v in sorted(state_id_map.items())}
i_iter = -1
print("%-40s %s" %("**KEY**", "**DATAFRAME**"))
for key in sortedDict.keys():
    i_iter += 1
    print("%-40s %s" %(key, sortedListDF[i_iter]))"""

'\nprint(len(df3[\'State or union territory v2\']))\nprint(len(state_id_map.keys()))\nsortedListDF = sorted(list(df3[\'State or union territory v2\']))\nsortedDict = {k:v for k, v in sorted(state_id_map.items())}\ni_iter = -1\nprint("%-40s %s" %("**KEY**", "**DATAFRAME**"))\nfor key in sortedDict.keys():\n    i_iter += 1\n    print("%-40s %s" %(key, sortedListDF[i_iter]))'

In [11]:
# Log10 of the density, stored separately so the raw value stays available
density_values = df3['Density']
df3['DensityScale'] = np.log10(density_values)

# Sex ratio expressed as an offset from 1000, so that 0 means exactly 1000
df3['Sex ratio scale'] = df3['Sex ratio'].sub(1000)

### India: Sex Ratio Choropleth

In [16]:
#df3.tail(15)
# Create figure object
# Choropleth of the sex-ratio offset. The diverging colour scale is centred
# on 0, i.e. on a sex ratio of exactly 1000 (the column is 'Sex ratio' - 1000).
fig = px.choropleth(
    df3,
    geojson=india_states,
    locations='id',
    color='Sex ratio scale',
    color_continuous_scale=px.colors.diverging.BrBG,
    color_continuous_midpoint=0,
    hover_name='State or union territory v2',
    hover_data=['Sex ratio scale'],
)

# Fit the view to the plotted locations and hide the base map
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

### India: Density Log-scale Choropleth

In [17]:
# Density choropleth drawn on a mapbox tile layer (requires internet).
# Regions are coloured by the log10 density, while the hover box shows the
# raw density value.
fig = px.choropleth_mapbox(
    df3,
    geojson=india_states,
    locations='id',
    color='DensityScale',
    hover_name='State or union territory v2',
    hover_data=['Density'],
    mapbox_style="carto-positron",
    center={'lat': 24, 'lon': 78},
    zoom=3,
    opacity=0.5,
)
fig.show()