In [29]:
import requests as rq
import bs4
import pandas as pd
import plotly.express as px
from io import StringIO
import plotly.graph_objects as go

In [30]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
headers = {'User-Agent': 'Mozilla/5.0'}
page = rq.get(url, headers=headers)

In [31]:
bs4page = bs4.BeautifulSoup(page.text, 'html.parser')
tables = bs4page.find_all('table', {'class': 'wikitable'})

In [32]:
table_string = str(tables[0])
dat_1 = pd.read_html(StringIO(table_string))[0]
dat_1


Unnamed: 0,Country/Territory,IMF (2026)[6],World Bank (2024)[7],United Nations (2024)[8]
0,World,123584494,111326370,100834796
1,United States,31821293,28750956,29298000
2,China[n 1],20650754,18743803,18743802
3,Germany,5328184,4685593,4659929
4,Japan,4463634,4027598,4026211
...,...,...,...,...
217,Kiribati,343,308,343
218,Marshall Islands,332,290,281
219,Nauru,183,163,187
220,Montserrat,—N/a,—N/a,81


In [33]:
# Fill in the missing data with predictions from the UN and Worldbank
na_counts = dat_1.isna().sum()
print(na_counts)
dat_1['IMF (2026)[6]'] = dat_1['IMF (2026)[6]'].fillna(dat_1['World Bank (2024)[7]'])
na_counts = dat_1.isna().sum()
print(na_counts)

Country/Territory           0
IMF (2026)[6]               0
World Bank (2024)[7]        0
United Nations (2024)[8]    0
dtype: int64
Country/Territory           0
IMF (2026)[6]               0
World Bank (2024)[7]        0
United Nations (2024)[8]    0
dtype: int64


In [34]:
#The region data can be found from an older table here
url = 'https://en.wikipedia.org/w/index.php?title=List_of_countries_by_GDP_(nominal)&oldid=1187446467'
headers = {'User-Agent': 'Mozilla/5.0'}
page = rq.get(url, headers=headers)

bs4page = bs4.BeautifulSoup(page.text, 'html.parser')
tables = bs4page.find_all('table')
print(f"Number of tables found: {len(tables)}")


table_string = str(tables[2])
dat_2 = pd.read_html(StringIO(table_string))[0]
dat_2

Number of tables found: 7


Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,104476432,2023,100562011,2022,96698005,2021
1,United States,Americas,26949643,2023,25462700,2022,23315081,2021
2,China,Asia,17700899,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Germany,Europe,4429838,2023,4072192,2022,4259935,2021
4,Japan,Asia,4230862,2023,4231141,2022,4940878,2021
...,...,...,...,...,...,...,...,...
209,Palau,Oceania,267,2023,—N/a,—N/a,218,2021
210,Kiribati,Oceania,246,2023,223,2022,227,2021
211,Nauru,Oceania,150,2023,151,2022,155,2021
212,Montserrat,Americas,—N/a,—N/a,—N/a,—N/a,72,2021


In [35]:
#Align Tables
print(dat_1.columns)
print(dat_2.columns)
dat_2.columns = dat_2.columns.map(lambda x: x[0] if isinstance(x, tuple) else x)

# Merge to include the 'UN region' into dat_1
merged_df = pd.merge(
    dat_1, dat_2[['Country/Territory', 'UN region']], on='Country/Territory', how='left')

# Drop the row with index 0
merged_df = merged_df.drop(index=0).reset_index(drop=True)

merged_df

Index(['Country/Territory', 'IMF (2026)[6]', 'World Bank (2024)[7]',
       'United Nations (2024)[8]'],
      dtype='str')
MultiIndex([( 'Country/Territory', 'Country/Territory'),
            (         'UN region',         'UN region'),
            (        'IMF[1][13]',          'Forecast'),
            (        'IMF[1][13]',              'Year'),
            (    'World Bank[14]',          'Estimate'),
            (    'World Bank[14]',              'Year'),
            ('United Nations[15]',          'Estimate'),
            ('United Nations[15]',              'Year')],
           )


Unnamed: 0,Country/Territory,IMF (2026)[6],World Bank (2024)[7],United Nations (2024)[8],UN region
0,United States,31821293,28750956,29298000,Americas
1,China[n 1],20650754,18743803,18743802,
2,Germany,5328184,4685593,4659929,Europe
3,Japan,4463634,4027598,4026211,Asia
4,India,4452839,3909892,3952244,Asia
...,...,...,...,...,...
216,Kiribati,343,308,343,Oceania
217,Marshall Islands,332,290,281,Oceania
218,Nauru,183,163,187,Oceania
219,Montserrat,—N/a,—N/a,81,Americas


In [36]:

# Now we will group by 'UN region' and aggregate the IMF values
# Now create the stacked bar plot
fig = px.bar(
    merged_df,
    x='UN region',                             # Use regions as x-axis
    y='IMF (2026)[6]',                        # IMF numbers on y-axis
    color='Country/Territory',                # Stack by countries
    title='IMF Numbers by Region',            # Title of the plot
)

# Update layout for better readability
fig.update_layout(barmode='stack', yaxis_title='IMF Values', xaxis_title='Regions')
fig.update_layout(showlegend=False)

# Show the plot
fig.show()

In [37]:
## Look at the chapter on interactive graphicsLinks to an external site. and, specifically, the code to display a subject's MRICloud data as a sunburst plot.
## load in the hierarchy information
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
multilevel_lookup = multilevel_lookup.rename(columns = {
    "modify"   : "roi",
    "modify.1" : "level4",
    "modify.2" : "level3",
    "modify.3" : "level2",
    "modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
multilevel_lookup.head()

## Now load in the subject data
id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == id)]
subjectData = subjectData[['roi', 'volume']]
## Merge the subject data with the multilevel data
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()

fig = px.sunburst(subjectData, path=['icv', 'level1', 'level2', 'level3', 'level4', 'roi'],
                  values='comp', width=800, height=800)
fig.show()

In [38]:
# Add Intracranial Volume (ICV)
subjectData = subjectData.assign(icv=np.sum(subjectData['volume']))

# Prepare data for Sankey diagram
# Create labels for the Sankey diagram
labels = subjectData[['level1', 'level2', 'level3', 'level4', 'roi']].values.flatten().tolist() + ["ICV"]
labels = list(set(labels))  # Get unique labels

# Generate source and target indices for the Sankey diagram
sources = []
targets = []
values = []

# Building the Sankey connections by levels
for index, row in subjectData.iterrows():
    # ICV to level 1
    sources.append(labels.index("ICV"))
    targets.append(labels.index(row['level1']))
    values.append(row['volume'])
    
    # Level 1 to Level 2
    sources.append(labels.index(row['level1']))
    targets.append(labels.index(row['level2']))
    values.append(row['volume'])
    
    # Level 2 to Level 3
    sources.append(labels.index(row['level2']))
    targets.append(labels.index(row['level3']))
    values.append(row['volume'])
    

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue"
    ),
    link=dict(
        source=sources,  # Indices corresponding to labels
        target=targets,  # Indices corresponding to labels
        value=values     # Values corresponding to each link
    )
))

fig.update_layout(title_text="Sankey Diagram for Subject's ICV Data", font_size=10)
fig.show()

Put the link to your live web page in a markdown cell of your hw5.ipynb file as a text block.