Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT Synthetic dataset creation #907

Open
wants to merge 45 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
d5c8c1d
First draft of synthetic dataset creation
coreysharris Jul 17, 2021
f5a1553
Preemptive cleanup
coreysharris Jul 17, 2021
3c6a079
Fix seed parameter
coreysharris Jul 17, 2021
3e8f574
Fix RST title
coreysharris Jul 17, 2021
1d93760
Don't set a default seed
coreysharris Jul 17, 2021
a3d2142
Remove pie charts in example notebook
coreysharris Jul 17, 2021
b198111
Make n_features, n_informative into parameters
coreysharris Jul 17, 2021
9855060
Move synthetic.py -> _synthetic.py
coreysharris Jul 17, 2021
d848727
Fix copyright
coreysharris Jul 17, 2021
915f68c
Fix train/predict dataset usage
coreysharris Jul 17, 2021
17f3ea4
Update changelog/contributors
coreysharris Jul 17, 2021
c5d371b
Add unit test
coreysharris Jul 17, 2021
a27ae76
Fix typo
coreysharris Jul 17, 2021
c775f4d
Fix flake8 issue
coreysharris Jul 17, 2021
772b9b1
Clean things up
coreysharris Jul 17, 2021
1b392cf
Allow custom labels
coreysharris Jul 17, 2021
d056bbd
Update test to match new settings
coreysharris Jul 17, 2021
353aa94
Fix copyright
coreysharris Jul 17, 2021
2b82966
Use sklearn's check_random_state
coreysharris Jul 17, 2021
e6bb7f8
Address PR comments
coreysharris Jul 18, 2021
85c33fa
Simplify unit test
coreysharris Jul 18, 2021
005bef3
Add unit test for custom dataset
coreysharris Jul 18, 2021
0d2cda7
Flip classes to make less fair
coreysharris Jul 18, 2021
2146134
Remove unneeded import
coreysharris Jul 18, 2021
835ddf9
make_synthetic_data -> make_sensitive_classification
coreysharris Jul 20, 2021
d9e69ee
Add tests for bad input
coreysharris Jul 20, 2021
9d37d25
Let make_classification produce the biases
coreysharris Jul 20, 2021
e8712e5
Refactor to clean up API
coreysharris Jul 22, 2021
cd9499e
Clean up notebook slightly
coreysharris Jul 24, 2021
804156b
Move changes to user_guide
coreysharris Aug 26, 2021
c623b2a
Get multiple feature support working
coreysharris Sep 17, 2021
7f58cce
Fix feature ordering
coreysharris Sep 17, 2021
f54880c
Don't hardcode num samples
coreysharris Sep 17, 2021
1f7d3f5
Show two-feature example
coreysharris Sep 17, 2021
5c9aba1
Improve naming
coreysharris Sep 17, 2021
434ec78
More cleanup
coreysharris Sep 17, 2021
b8b3ecf
Fix tests
coreysharris Sep 17, 2021
5b20f01
Merge branch 'main' into enh/synthetic-datasets
coreysharris Sep 17, 2021
8632985
Improve changelog message
coreysharris Sep 25, 2021
d7546ab
Improve docstrings
coreysharris Sep 25, 2021
824430d
Shuffle
coreysharris Sep 25, 2021
b1848e9
Improve notebook example
coreysharris Sep 25, 2021
351f151
Make init_configured_groups private
coreysharris Sep 26, 2021
84e62cd
Add some helepr methods
coreysharris Sep 26, 2021
fd22427
Get tests passing
coreysharris Sep 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 65 additions & 0 deletions examples/plot_synthetic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (c) Microsoft Corporation and Fairlearn contributors.
coreysharris marked this conversation as resolved.
Show resolved Hide resolved
# Licensed under the MIT License.

"""
============================================
Synthetic dataset example with visualization
============================================
"""

# %%
from fairlearn.datasets import make_synthetic_dataset
from fairlearn.metrics import (
MetricFrame,
false_positive_rate,
true_positive_rate,
selection_rate,
count
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# A single RandomState instance is threaded through every step so the whole
# example is reproducible from one seed.
rng = np.random.RandomState(seed=42)
X, y, gender = make_synthetic_dataset(rng=rng)

# Hold out 30% of the samples; the sensitive feature is split alongside X and
# y so that it stays aligned with the rows of each partition.
X_train, X_test, y_train, y_test, gender_train, gender_test = train_test_split(
    X, y, gender, test_size=0.3, random_state=rng
)

# Fit on the training partition only and predict on the held-out partition,
# so the metrics below are out-of-sample rather than in-sample.
classifier = DecisionTreeClassifier(
    min_samples_leaf=10, max_depth=4, random_state=rng
)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Analyze metrics using MetricFrame
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'false positive rate': false_positive_rate,
    'true positive rate': true_positive_rate,
    'selection rate': selection_rate,
    'count': count,
}
metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=gender_test,
)
metric_frame.by_group.plot.bar(
    subplots=True,
    layout=[3, 3],
    legend=False,
    figsize=[12, 8],
    title="Show all metrics",
)

# Customize plots with kind
metric_frame.by_group.plot(
    kind="pie",
    subplots=True,
    layout=[3, 3],
    legend=False,
    figsize=[12, 8],
    title="Show all metrics in pie",
)
2 changes: 2 additions & 0 deletions fairlearn/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from ._fetch_adult import fetch_adult
from ._fetch_boston import fetch_boston
from ._fetch_bank_marketing import fetch_bank_marketing
from .synthetic import make_synthetic_dataset
coreysharris marked this conversation as resolved.
Show resolved Hide resolved

# Names exported by ``from fairlearn.datasets import *``.
__all__ = [
"fetch_adult",
"fetch_boston",
"fetch_bank_marketing",
"make_synthetic_dataset",
]
67 changes: 67 additions & 0 deletions fairlearn/datasets/synthetic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.
import numpy as np
from sklearn.datasets import make_classification


def make_synthetic_dataset(seed=12345, rng=None):
    """Create a synthetic binary-classification dataset with a sensitive feature.

    Four groups ("Woman", "Man", "Other", "Unspecified") of 500 samples each
    are generated with :func:`sklearn.datasets.make_classification`. The
    groups share the same feature configuration but use different
    ``class_sep`` values (1, 2, 0.5 and 0.5 respectively), and are
    concatenated in that order.

    Parameters
    ----------
    seed : int, default=12345
        The random number generator seed to use when ``rng`` is not given.
    rng : int, numpy.random.RandomState or None, default=None
        Random state to draw from. When ``None``, a new ``RandomState`` is
        created from ``seed``. An integer is converted to a ``RandomState``.

    Returns
    -------
    (X, y, gender) : tuple of numpy.ndarray
        X : ndarray
            The generated samples.
        y : ndarray
            Labels for the binary classification.
        gender : ndarray
            The sensitive feature label.
    """
    if rng is None:
        rng = seed
    # Always work with one stateful RandomState: passing a bare integer to
    # every make_classification call would restart the same random stream
    # for each group instead of continuing it.
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    classification_kwargs = {
        'n_features': 20,
        'n_informative': 4,
        'n_classes': 2,
        'random_state': rng,
    }

    n_samples_per_group = 500
    # (sensitive feature label, class_sep passed to make_classification)
    groups = (
        ("Woman", 1),
        ("Man", 2),
        ("Other", 0.5),
        ("Unspecified", 0.5),
    )

    X_parts = []
    y_parts = []
    for _, class_sep in groups:
        X_group, y_group = make_classification(
            n_samples=n_samples_per_group,
            class_sep=class_sep,
            **classification_kwargs,
        )
        X_parts.append(X_group)
        y_parts.append(y_group)

    X = np.concatenate(X_parts)
    y = np.concatenate(y_parts)
    # One label per row, in the same group order as X and y.
    gender = np.repeat(
        [label for label, _ in groups], n_samples_per_group
    )
    return X, y, gender
romanlutz marked this conversation as resolved.
Show resolved Hide resolved