In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# --- Load & clean ------------------------------------------------------------
# The first line of the file is skipped; the next line supplies the header.
df = pd.read_csv("ceara_cities.csv", skiprows=1)

# Normalize column names: drop any BOM first, then trim surrounding whitespace
# (stripping first would leave whitespace that follows a leading BOM).
df.columns = [c.replace("\ufeff", "").strip() for c in df.columns]

# Coerce population to numeric exactly once. Entries that do not parse as a
# number become NaN. pd.to_numeric does not strip thousand separators, so
# values it cannot parse are excluded by the mask below rather than repaired.
pop = pd.to_numeric(df["2022"], errors="coerce")

# Keep only valid, positive populations. NaN > 0 evaluates to False, so this
# single mask also removes unparseable rows; no second coercion or dropna is
# required afterwards.
valid = pop > 0
df = df.loc[valid, ["Município", "2022"]].copy()
df["2022"] = pop[valid]

# --- Rank & logs -------------------------------------------------------------
# Largest population first; the index is rebuilt as 0..n-1 after sorting.
df = df.sort_values("2022", ascending=False, ignore_index=True)
df["Rank"] = np.arange(1, len(df) + 1)

# Natural logarithms of rank (x) and population (y) as float arrays.
x, y = (np.log(df[col].to_numpy(dtype=float)) for col in ("Rank", "2022"))

# --- Fit (log-log) -----------------------------------------------------------
# Degree-1 least-squares fit; polyfit returns the highest-order coefficient first.
slope, intercept = np.polyfit(x, y, 1)
y_pred = intercept + slope * x

# R^2 = 1 - SS_res / SS_tot, reported as NaN when y has no variance.
ss_res = np.square(y - y_pred).sum()
ss_tot = np.square(y - y.mean()).sum()
if ss_tot > 0:
    r2 = 1 - ss_res / ss_tot
else:
    r2 = np.nan

# --- Plot --------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(8, 6))

# x and y are already logarithms, so both axes stay linear.
ax.scatter(x, y, alpha=0.7, label="Cities", zorder=2)

# Evaluate the fitted line on 200 evenly spaced points over the observed x-range.
x_fit = np.linspace(x.min(), x.max(), 200)
y_fit = intercept + slope * x_fit
fit_label = f"Fit: slope={slope:.2f}, R²={r2:.3f}"
ax.plot(x_fit, y_fit, color="red", linewidth=2.0, label=fit_label, zorder=3)

ax.set_xlabel("log(Rank)")
ax.set_ylabel("log(Population)")
ax.set_title("Zipf’s Law for Ceará Cities (2022)")
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
ax.legend()
fig.tight_layout()
plt.show()

# --- (Optional) quick diagnostics -------------------------------------------
# Row counts before and after filtering, plus the smallest retained population
# (None when no rows survived the filter).
n_valid = len(df)
diagnostics = {
    "rows_total": len(pop),
    "rows_valid_positive": n_valid,
    "min_population": float(df["2022"].min()) if n_valid else None,
}
print(diagnostics)

# --- Plot --------------------------------------------------------------------
# Histogram of city populations drawn on log-log axes.
populations = df["2022"].to_numpy(dtype=float)

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xscale("log")
ax.set_yscale("log")

# Equal-width (linear) bins are drawn progressively narrower along a log axis
# and, for heavy-tailed data, put most observations into the first few bins.
# Use 50 geometrically spaced bins instead (51 edges), which have equal width
# on the log axis. Fall back to a plain bin count when the data span no range.
if populations.size and populations.min() < populations.max():
    bins = np.geomspace(populations.min(), populations.max(), 51)
else:
    bins = 50
ax.hist(populations, bins=bins)

ax.set_xlabel("log(Population)")
ax.set_ylabel("log(Frequency)")
ax.set_title("Histogram for Ceará Cities Population (2022)")
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
# Lay out after everything is drawn so the final artists are accounted for.
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Load & clean ------------------------------------------------------------
# The first line of the file is skipped; the next line supplies the header.
df = pd.read_csv("brazil_cities.csv", skiprows=1)

# Normalize column names: drop any BOM first, then trim surrounding whitespace
# (stripping first would leave whitespace that follows a leading BOM).
df.columns = [c.replace("\ufeff", "").strip() for c in df.columns]

# Coerce population to numeric exactly once. Entries that do not parse as a
# number become NaN. pd.to_numeric does not strip thousand separators, so
# values it cannot parse are excluded by the mask below rather than repaired.
pop = pd.to_numeric(df["2022"], errors="coerce")

# Keep only valid, positive populations. NaN > 0 evaluates to False, so this
# single mask also removes unparseable rows; no second coercion or dropna is
# required afterwards.
valid = pop > 0
df = df.loc[valid, ["Município", "2022"]].copy()
df["2022"] = pop[valid]

# --- Rank & logs -------------------------------------------------------------
# Largest population first; the index is rebuilt as 0..n-1 after sorting.
df = df.sort_values("2022", ascending=False, ignore_index=True)
df["Rank"] = np.arange(1, len(df) + 1)

# Natural logarithms of rank (x) and population (y) as float arrays.
x, y = (np.log(df[col].to_numpy(dtype=float)) for col in ("Rank", "2022"))

# --- Fit (log-log) -----------------------------------------------------------
# Degree-1 least-squares fit; polyfit returns the highest-order coefficient first.
slope, intercept = np.polyfit(x, y, 1)
y_pred = intercept + slope * x

# R^2 = 1 - SS_res / SS_tot, reported as NaN when y has no variance.
ss_res = np.square(y - y_pred).sum()
ss_tot = np.square(y - y.mean()).sum()
if ss_tot > 0:
    r2 = 1 - ss_res / ss_tot
else:
    r2 = np.nan

# --- Plot --------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(8, 6))

# x and y are already logarithms, so both axes stay linear.
ax.scatter(x, y, alpha=0.7, label="Cities", zorder=2)

# Evaluate the fitted line on 200 evenly spaced points over the observed x-range.
x_fit = np.linspace(x.min(), x.max(), 200)
y_fit = intercept + slope * x_fit
fit_label = f"Fit: slope={slope:.2f}, R²={r2:.3f}"
ax.plot(x_fit, y_fit, color="red", linewidth=2.0, label=fit_label, zorder=3)

ax.set_xlabel("log(Rank)")
ax.set_ylabel("log(Population)")
ax.set_title("Zipf’s Law for Brazil Cities (2022)")
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
ax.legend()
fig.tight_layout()
plt.show()

# --- (Optional) quick diagnostics -------------------------------------------
# Row counts before and after filtering, plus the smallest retained population
# (None when no rows survived the filter).
n_valid = len(df)
diagnostics = {
    "rows_total": len(pop),
    "rows_valid_positive": n_valid,
    "min_population": float(df["2022"].min()) if n_valid else None,
}
print(diagnostics)
