#### Demonstration Jupyter Notebook for the zero-to-AI series
##### Chris Joakim, 3Cloud/Cognizant, 2026

This is file **python/jupyter-demo.ipynb** in the GitHub repository.

In [None]:
# Simple example Cell.  It imports the python "arrow" library and prints the current time.
import arrow
import os

# cwd = current working directory
cwd = os.getcwd()
print(f"current working directory is: {cwd}")

# arrow is a python library for date and time logic
utc_time = arrow.utcnow()
print("utc_time is: {}".format(utc_time))
!date

# Optionally show the list of python libraries in the current virtual environment.
# Notice how this executes a program on your system (with !), not python.

#!uv pip list

In [None]:
# Load the 'top-pypi-packages.csv' file into a pandas dataframe, draw the 100 libraries
# with the highest download count as a horizontal bar chart with matplotlib, and then
# print summary statistics for those 100 libraries.

import matplotlib.pyplot as plt
import pandas as pd

infile = "data/pypi/top-pypi-packages.csv"
df_packages = pd.read_csv(infile)
print(f"Dataframe shape: {df_packages.shape}")
print("\nFirst few rows:")
print(df_packages.head())

# Keep the 100 rows with the largest download_count, then order them smallest-first
# so that the bars are plotted in ascending order of downloads.
top_100 = df_packages.nlargest(100, "download_count")
top_100 = top_100.sort_values("download_count", ascending=True)

# Horizontal bars leave room for the library names on the y axis.
bar_positions = range(len(top_100))
fig, ax = plt.subplots(figsize=(12, 16))
ax.barh(bar_positions, top_100["download_count"], color="steelblue")
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_100["project"])
ax.set_xlabel("Download Count", fontsize=12)
ax.set_ylabel("Library", fontsize=12)
ax.set_title("Top 100 PyPI Libraries by Download Count", fontsize=14, fontweight="bold")
fig.tight_layout()
plt.show()

# Summary statistics over the download counts of the charted libraries
top_100_downloads = top_100["download_count"]
print("\nTop 100 Libraries Summary:")
print(f"Total downloads (top 100): {top_100_downloads.sum():,}")
print(f"Average downloads: {top_100_downloads.mean():,.0f}")
print(f"Median downloads: {top_100_downloads.median():,.0f}")

In [None]:
# Similar to the cell above, but restricted to the libraries with "azure" in their name.
# Reuses the df_packages dataframe from the previous cell, so that cell must run first.

import pandas as pd
import matplotlib.pyplot as plt

# Filter libraries with "azure" in their name (case-insensitive).
# na=False makes rows with a missing project name count as non-matches.
df_azure = df_packages[df_packages["project"].str.contains("azure", case=False, na=False)]

# Get up to 100 Azure libraries by download_count, sorted ascending for the bar chart
df_azure_top100 = df_azure.nlargest(100, "download_count").sort_values(
    "download_count", ascending=True
)

print(f"Azure libraries found: {len(df_azure)}")
print(f"Top 100 Azure libraries (by download_count): {len(df_azure_top100)}")
print("\nFirst few rows:")
print(df_azure_top100.head())

if len(df_azure_top100) > 0:
    # Only draw the chart when there is something to plot; an empty selection would
    # otherwise render a blank figure ahead of the "No Azure libraries found." message.
    # The figure height grows with the number of bars, with a minimum of 8 inches.
    plt.figure(figsize=(12, max(8, len(df_azure_top100) * 0.3)))
    plt.barh(range(len(df_azure_top100)), df_azure_top100["download_count"], color="steelblue")
    plt.yticks(range(len(df_azure_top100)), df_azure_top100["project"])
    plt.xlabel("Download Count", fontsize=12)
    plt.ylabel("Library", fontsize=12)
    plt.title("Top 100 Azure Libraries by Download Count", fontsize=14, fontweight="bold")
    plt.tight_layout()
    plt.show()

    # Also print summary statistics
    print("\nTop 100 Azure Libraries Summary:")
    print(f"Total downloads: {df_azure_top100['download_count'].sum():,}")
    print(f"Average downloads: {df_azure_top100['download_count'].mean():,.0f}")
    print(f"Median downloads: {df_azure_top100['download_count'].median():,.0f}")
else:
    print("\nNo Azure libraries found.")

In [None]:
# Add a column to df_packages showing the percentage of total downloads for each library,
# print several views of the result, and save the whole dataframe to a CSV file.
# Requires df_packages from the earlier cell that reads the CSV file.

import os

# The output path is defined once so that the file written and the message printed
# at the end of the cell cannot drift apart.
# NOTE(review): "libraties" looks like a misspelling of "libraries"; the name is kept
# unchanged in case anything else reads this file - confirm before renaming.
outfile = "tmp/calculated_libraties_df.csv"

# Calculate the total sum of all downloads
total_downloads = df_packages["download_count"].sum()

# Add a new column with the percentage of total downloads (this modifies df_packages)
df_packages["download_percentage"] = (df_packages["download_count"] / total_downloads) * 100

# Display the updated dataframe
print(f"Total downloads across all libraries: {total_downloads:,}")
print("\nFirst few rows with percentage column:")
print(df_packages[["project", "download_count", "download_percentage"]].head(10))

# Show some statistics about the percentages
print("\nPercentage Statistics:")
print(f"Max percentage: {df_packages['download_percentage'].max():.4f}%")
print(f"Min percentage: {df_packages['download_percentage'].min():.6f}%")
print(f"Mean percentage: {df_packages['download_percentage'].mean():.4f}%")

# Display the first 40 rows of the dataframe, using head()
print("\nFirst 40 rows of the dataframe:")
print(df_packages.head(40))

# Display the same first 40 rows of the dataframe, all columns, using iloc
first_40_rows_all_cols = df_packages.iloc[:40, :]
print(first_40_rows_all_cols)

# Save the entire dataframe to file, creating the output directory if needed
os.makedirs(os.path.dirname(outfile), exist_ok=True)
df_packages.to_csv(outfile, index=False)
print(f"\nDataframe saved to {outfile}")