fairlearn · coreysharris · Jul 17, 2021 · Jul 17, 2021 · Jul 17, 2021 · Jul 17, 2021
diff --git a/examples/plot_synthetic.py b/examples/plot_synthetic.py
@@ -0,0 +1,74 @@
+# Copyright (c) Microsoft Corporation and Fairlearn contributors.
+# Licensed under the MIT License.
+
+"""
+============================================
+Synthetic dataset example with visualization
+============================================
+
+Train a decision tree on the synthetic dataset and plot per-group metrics.
+"""
+
+# %%
+from fairlearn.datasets import make_synthetic_dataset
+from fairlearn.metrics import (
+    MetricFrame,
+    false_positive_rate,
+    true_positive_rate,
+    selection_rate,
+    count
+)
+import numpy as np
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+
+# One RandomState is shared by data generation, the split and the classifier
+# so that the whole example is reproducible.
+rng = np.random.RandomState(seed=42)
+X, y, gender = make_synthetic_dataset(rng=rng)
+
+# Hold out 30% of the samples (and their sensitive feature) for evaluation.
+X_train, X_test, y_train, y_test, gender_train, gender_test = train_test_split(
+    X, y, gender, test_size=0.3, random_state=rng
+)
+
+# Fit on the training split only and predict on the held-out split, so the
+# metrics below are not computed on data the model has already seen.
+classifier = DecisionTreeClassifier(
+    min_samples_leaf=10, max_depth=4, random_state=rng
+)
+classifier.fit(X_train, y_train)
+y_pred = classifier.predict(X_test)
+
+# Analyze metrics using MetricFrame
+metrics = {
+    'accuracy': accuracy_score,
+    'precision': precision_score,
+    'recall': recall_score,
+    'false positive rate': false_positive_rate,
+    'true positive rate': true_positive_rate,
+    'selection rate': selection_rate,
+    'count': count}
+metric_frame = MetricFrame(metrics=metrics,
+                           y_true=y_test,
+                           y_pred=y_pred,
+                           sensitive_features=gender_test)
+metric_frame.by_group.plot.bar(
+    subplots=True,
+    layout=[3, 3],
+    legend=False,
+    figsize=[12, 8],
+    title="Show all metrics",
+)
+
+# Customize plots with kind
+metric_frame.by_group.plot(
+    kind="pie",
+    subplots=True,
+    layout=[3, 3],
+    legend=False,
+    figsize=[12, 8],
+    title="Show all metrics in pie",
+)
diff --git a/fairlearn/datasets/__init__.py b/fairlearn/datasets/__init__.py
@@ -8,9 +8,11 @@
 from ._fetch_adult import fetch_adult
 from ._fetch_boston import fetch_boston
 from ._fetch_bank_marketing import fetch_bank_marketing
+from .synthetic import make_synthetic_dataset
 
 __all__ = [
     "fetch_adult",
     "fetch_boston",
     "fetch_bank_marketing",
+    "make_synthetic_dataset",
 ]
diff --git a/fairlearn/datasets/synthetic.py b/fairlearn/datasets/synthetic.py
@@ -0,0 +1,65 @@
+# Copyright (c) Microsoft Corporation and Fairlearn contributors.
+# Licensed under the MIT License.
+import numpy as np
+from sklearn.datasets import make_classification
+
+
+def make_synthetic_dataset(seed=12345, rng=None):
+    """Create a synthetic binary classification dataset.
+
+    Four groups of 500 samples each are drawn with
+    :func:`sklearn.datasets.make_classification` and stacked in the order
+    "Woman", "Man", "Other", "Unspecified". The groups share the same
+    feature layout but use a different ``class_sep``, so the classes are
+    easier to separate for some groups than for others.
+
+    Parameters
+    ----------
+    seed : int, default=12345
+        The random number generator seed to use. Ignored if `rng` is given.
+    rng : numpy.random.RandomState, default=None
+        Random number generator to draw all groups from. If None, a new
+        ``RandomState`` seeded with `seed` is created.
+
+    Returns
+    -------
+    (X, y, gender) : tuple of numpy.ndarray
+        X : ndarray of shape (2000, 20)
+            The generated samples.
+        y : ndarray of shape (2000,)
+            Labels for the binary classification.
+        gender : ndarray of shape (2000,)
+            The sensitive feature label.
+    """
+    if rng is None:
+        rng = np.random.RandomState(seed=seed)
+
+    # (sensitive feature label, class_sep) for each group, in output order.
+    groups = (
+        ("Woman", 1),
+        ("Man", 2),
+        ("Other", 0.5),
+        ("Unspecified", 0.5),
+    )
+    n_per_group = 500
+
+    X_parts = []
+    y_parts = []
+    for _, class_sep in groups:
+        # The shared generator advances between calls, so every group gets
+        # different samples while the whole dataset stays reproducible.
+        X_group, y_group = make_classification(
+            n_samples=n_per_group,
+            n_features=20,
+            n_informative=4,
+            n_classes=2,
+            class_sep=class_sep,
+            random_state=rng,
+        )
+        X_parts.append(X_group)
+        y_parts.append(y_group)
+
+    X = np.concatenate(X_parts)
+    y = np.concatenate(y_parts)
+    gender = np.repeat([label for label, _ in groups], n_per_group)
+    return X, y, gender