In [None]:
# Number of top-ranked artists to keep for each metric (passed as top_n to the
# calculation and bar chart functions below)
NUMBER_TO_CHART = 30
# Metrics to analyse; each entry is used as a column name of the artist statistics data
METRICS = ["Albums", "Tracks", "Spend"]

In [None]:
%run database.ipynb
%run pathutils.ipynb
%run export.ipynb

In [None]:
def calculate_metrics(df, metric, top_n):
    """
    Rank rows by their percentage share of a metric and return the top N

    Parameters:
        df: DataFrame containing a numeric column named by `metric`
        metric: Name of the column to rank by, e.g. "Spend"
        top_n: Maximum number of rows to return

    Returns:
        A new DataFrame holding the top_n rows with the largest share, sorted in
        descending order, with an extra "<metric>_pct" column (metric name
        case-folded) giving each row's percentage of the column total. The
        DataFrame passed in is left unmodified
    """
    # Each row's share is expressed as a % of the column total
    metric_total = df[metric].sum()
    percent_column = f"{metric.casefold()}_pct"

    # assign() returns a copy, so the caller's DataFrame does not silently gain
    # a new column every time a metric is calculated
    with_percent = df.assign(**{percent_column: df[metric] / metric_total * 100})

    # Sort by the % of the total, taking the top N results
    return with_percent.sort_values(percent_column, ascending=False).head(top_n)

In [None]:
import pandas as pd
import numpy as np

def calculate_cumsum(df, metric):
    """
    Return the running total of a metric's positive values, taken in ascending order

    Entries that cannot be parsed as numbers, or are missing, are treated as zero and
    therefore dropped along with any other non-positive entries
    """
    # Coerce the column to numbers; anything unparseable or missing becomes 0
    numeric = pd.to_numeric(df[metric], errors="coerce").fillna(0).values

    # Keep only the strictly positive entries, smallest first
    ascending_positive = np.sort(numeric[numeric > 0])

    # Running total over the sorted entries
    return np.cumsum(ascending_positive)

In [None]:
import numpy as np

def calculate_gini(values):
    """
    Calculate the Gini coefficient from a 1D array of cumulative sums. This is a single number that
    measures how uneven a distribution is

    Parameters:
        values: 1D array-like holding the cumulative sums of the observations, with the
            observations taken in ascending order (so the last entry is the overall total)

    Returns:
        The Gini coefficient, calculated as 1 - 2 * (area under the Lorenz curve). NaN is
        returned if there are no values or the total is zero, as the coefficient is undefined
    """
    cumulative = np.asarray(values, dtype=float)
    n = len(cumulative)
    if n == 0 or cumulative[-1] == 0:
        return float("nan")

    # The Lorenz curve starts at the origin, so (0, 0) must be included or the area over the
    # first interval [0, 1/n] is lost and the coefficient is overstated
    lorenz = np.insert(cumulative / cumulative[-1], 0, 0.0)

    # np.trapezoid was added in NumPy 2.0; fall back to np.trapz on older versions
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    lorenz_area = trapezoid(lorenz, dx=1 / n)
    return 1 - 2 * lorenz_area

In [None]:
import matplotlib.pyplot as plt

def plot_metrics_barchart(df, metric, top_n):
    """
    Draw, export and show a bar chart of each artist's % share of a metric

    Parameters:
        df: DataFrame with a "Name" column and a "<metric>_pct" column (metric name case-folded)
        metric: Name of the metric being charted, used in the labels and the export name
        top_n: Number of artists quoted in the chart title
    """
    metric_key = metric.casefold()

    # One bar per artist, with its height being the artist's % share
    plt.figure(figsize=(12, 5))
    plt.bar(df["Name"], df[f"{metric_key}_pct"])

    # Titles and labels; the artist names are rotated to keep them legible
    plt.title(f"Top {top_n} Artists by {metric}")
    plt.ylabel(f"% of total {metric_key}")
    plt.xticks(rotation=60, ha="right")

    # Export the chart before showing it
    # NOTE(review): export_chart is defined outside this cell - presumably it saves the current figure; confirm
    export_chart(f"{metric_key}-by-artist", "", "png")

    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_lorenz_curve(values, metric):
    """
    Draw, export and show the Lorenz curve for a metric

    Parameters:
        values: 1D array of cumulative sums, the last entry being the overall total
        metric: Name of the metric being charted, used in the labels and the export name
    """
    # Scale by the overall total and prepend 0 so that the curve starts at the origin
    normalised = values / values[-1]
    lorenz = np.insert(normalised, 0, 0)
    artist_fraction = np.linspace(0, 1, len(lorenz))

    # Lorenz curve first, then the dashed diagonal for comparison
    plt.figure(figsize=(12, 5))
    plt.plot(artist_fraction, lorenz, linewidth=2)
    plt.plot([0, 1], [0, 1], linestyle="--")

    # Titles, labels and axes
    plt.title(f"Lorenz Curve - {metric}")
    plt.xlabel("Fraction of artists")
    plt.ylabel(f"Fraction of total {metric.lower()}")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.grid(True)

    # Export the chart before showing it
    # NOTE(review): export_chart is defined outside this cell - presumably it saves the current figure; confirm
    export_chart(f"{metric.casefold()}-by-artist-lorenz", "", "png")

    plt.show()

In [None]:
# Load and preview the data
# connect() and load_artist_statistics() are not defined in the visible cells of this
# notebook; presumably they come from database.ipynb, which is run above - TODO confirm
# NOTE(review): the connection is not closed in the visible cells - confirm whether it should be
connection = connect()
artist_stats_df = load_artist_statistics(connection)
# Preview the loaded data; the cells below read its "Name" column and one column per entry in METRICS
display(artist_stats_df)

## Chart and Export the Data

Two charts are produced for each metric of interest: a bar chart of the "top N artists by metric", which is easy to interpret, and a *Lorenz curve*.

The latter includes two lines:

1. A diagonal straight line along which every artist contributes equally to the metric in question
2. A curve in which each point represents the cumulative fraction of the metric (Y) contributed by the corresponding fraction of the artists (X), with the artists ordered from the smallest contribution to the largest

Also exported is the Gini coefficient for each metric, a single number that measures how uneven a distribution is. The following table illustrates how to interpret the Gini values:

| Gini | Interpretation        |
| ---- | --------------------- |
| 0.0  | Perfect equality      |
| 0.2  | Very even             |
| 0.4  | Moderately uneven     |
| 0.6  | Strong favourites     |
| 0.8+ | Extreme concentration |

For example, suppose the Gini coefficients came out as follows:

```
Albums  <  Tracks  <  Spend
0.36       0.45       0.54
```

The interpretation might be:

- 0.36 is fairly even and indicates that albums are spread across a wide range of artists
- 0.45 is moderately concentrated, indicating potentially greater listening depth for certain artists, based on the amount of material by them in the collection
- 0.54 is noticeably but not extremely concentrated, indicating a relatively small group of artists accounts for a large share of spend

Note that the number of tracks per album does have an influence on the tracks Gini, though the effect should be small to moderate rather than dominant.

In [None]:
import pandas as pd

# Data to be exported, keyed by name, starting with the loaded artist statistics
# NOTE(review): export_to_spreadsheet is defined outside this cell - presumably each entry becomes a sheet; confirm
export = {
    "Data": artist_stats_df
}

# Gini coefficient for each metric, keyed by metric name
gini = {}

# Iterate over the metrics of interest
for metric in METRICS:
    # Calculate the % by artist for the current metric and add it to the export dictionary
    # (only the top NUMBER_TO_CHART artists are returned)
    calculated_metrics = calculate_metrics(artist_stats_df, metric, NUMBER_TO_CHART)
    export[f"{metric} Statistics"] = calculated_metrics

    # Calculate the cumulative sum of the values of the metric and use this to calculate Gini
    # (this is calculated from the full data set rather than the top NUMBER_TO_CHART subset)
    cumulative_values = calculate_cumsum(artist_stats_df, metric)
    gini[metric] = calculate_gini(cumulative_values)

    # Plot and export the barchart
    plot_metrics_barchart(calculated_metrics, metric, NUMBER_TO_CHART)

    # Plot and export the Lorenz curve
    plot_lorenz_curve(cumulative_values, metric)

# Add the Gini results to the export data, as one row per metric with a single "value" column
export["Gini"] = pd.DataFrame.from_dict(gini, orient='index', columns=['value'])

# Export the data
export_to_spreadsheet("artist-statistics", export)