Commit
Merge pull request #100 from ealcobaca/0.4.x
0.4.x
FelSiq committed Jul 6, 2020
2 parents db6b207 + c8e1cff commit c1cfcb5
Showing 35 changed files with 2,733 additions and 1,887 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
@@ -2,7 +2,8 @@ language: python

python:
- "3.6"
- "3.7-dev"
- "3.7"
- "3.8"

install:
- make install-dev
@@ -24,7 +25,7 @@ env:
#
script:
- mypy $SOURCE_FILES --ignore-missing-imports
- pylint $SOURCE_FILES -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101'
- pylint $SOURCE_FILES -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101, C0330'
- pytest tests/ --showlocals -v --cov=pymfe/
- make html

5 changes: 4 additions & 1 deletion Makefile
@@ -26,7 +26,7 @@ t: test-cov ## Shortcut to test-cov

code-check: ## Execute the code check with flake8, pylint, mypy.
flake8 $(PACKAGE)
pylint $(PACKAGE) -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101'
pylint $(PACKAGE) -d 'C0103, R0913, R0902, R0914, C0302, R0904, R0801, E1101, C0330'
mypy $(PACKAGE) --ignore-missing-imports

c: code-check # Shortcut to code-check
@@ -52,3 +52,6 @@ help: ## List target command description.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

h: help ## Shortcut to help

format: ## format all the package using black
@black --line-length 79 pymfe/
181 changes: 128 additions & 53 deletions README.md

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions docs/source/_static/theme_overrides.css

This file was deleted.

34 changes: 26 additions & 8 deletions docs/source/about.rst
@@ -13,12 +13,30 @@ You can find the contributors of this package here_.
Citing PyMFE
------------

If you use PyMFE in a scientific publication, we would appreciate
citations to the following paper::
If you use `pymfe` in a scientific publication, we would appreciate citations
to the following paper:

`Edesio Alcobaça, Felipe Siqueira, Adriano Rivolli, Luís P. F. Garcia,
Jefferson T. Oliva, & André C. P. L. F. de Carvalho (2020).
MFE: Towards reproducible meta-feature extraction. Journal of Machine Learning
Research, 21(111), 1-5. <http://jmlr.org/papers/v21/19-348.html>`_

You can also use the bibtex format::

  @article{JMLR:v21:19-348,
    author = {Edesio Alcobaça and
              Felipe Siqueira and
              Adriano Rivolli and
              Luís P. F. Garcia and
              Jefferson T. Oliva and
              André C. P. L. F. de Carvalho
    },
    title = {MFE: Towards reproducible meta-feature extraction},
    journal = {Journal of Machine Learning Research},
    year = {2020},
    volume = {21},
    number = {111},
    pages = {1-5},
    url = {http://jmlr.org/papers/v21/19-348.html}
  }

None

Extra information
-----------------
See the `README <https://github.com/ealcobaca/pymfe/blob/master/README.md>`_
file from GitHub for extra information.
12 changes: 3 additions & 9 deletions docs/source/conf.py
@@ -87,13 +87,7 @@
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

html_static_path = ['_static']

html_context = {
    'css_files': [
        '_static/theme_overrides.css',  # override wide tables in RTD theme
    ],
}
# html_static_path = ['_static']


# Output file base name for HTML help builder.
@@ -124,8 +118,8 @@
'reference_url': {
# The module you locally document uses None
'pymfe': None,
}
# 'plot_gallery': True,
},
# 'thumbnail_size': (50, 50),
# 'junit': '../test-results/sphinx-gallery/junit.xml',
# 'log_level': {'backreference_missing': 'warning'},
# 'subsection_order': ExplicitOrder(['../examples/sin_func',
24 changes: 10 additions & 14 deletions docs/source/new.rst
@@ -6,8 +6,8 @@ The PyMFE releases are available in PyPI_ and GitHub_.
.. _GitHub: https://github.com/ealcobaca/pymfe/releases


Version 0.3.0 (Available on PyPI)
---------------------------------
Version 0.3.0
-------------
* Metafeature extraction with confidence intervals

* Pydoc fixes and package documentation/code consistency improvements
@@ -41,8 +41,8 @@ Version 0.3.0 (Available on PyPI)
* Online documentation improvement


Version 0.2.0 (Available on PyPI)
---------------------------------
Version 0.2.0
-------------
* New meta-feature groups

* Complexity
@@ -70,8 +70,8 @@ Version 0.2.0 (Available on PyPI)
* Statistical group updated


Version 0.1.1 (Available on PyPI)
---------------------------------
Version 0.1.1
-------------
* Bugs solved

* False positive of mypy fixed
@@ -88,8 +88,8 @@ Version 0.1.1 (Available on PyPI)
current percentage of progress done so far.


Version 0.1.0 (Available on PyPI)
---------------------------------
Version 0.1.0
-------------
* Meta-feature groups available

* Relative landmarking
@@ -123,15 +123,12 @@ Version 0.1.0 (Available on PyPI)
* Several new tests added


Version 0.0.3 (Available on PyPI)
---------------------------------
Version 0.0.3
-------------
* Documentation improvement

* Setup improvement


Initial Release
---------------
* Meta-feature groups available:

* Simple
@@ -144,4 +141,3 @@ Initial Release

* Landmarking


13 changes: 7 additions & 6 deletions docs/source/using.rst
@@ -1,12 +1,13 @@
Using PyMFE
###########
Extracting metafeatures with PyMFE is easy.

The parameters are the measures, the group of measures and the summarization
functions to be extracted. The default behavior is to extract all default
measures, which is. The ``fit`` function can be called by passing the ``X``
and ``y``. The ``extract`` function is used to extract the related measures.
See this example::

The simplest way to extract meta-features is by instantiating the `MFE` class.
It computes five meta-feature groups by default, using mean and standard
deviation as summary functions: General, Statistical, Information-theoretic,
Model-based, and Landmarking. The `fit` method can be called by passing the `X`
and `y`. Then the `extract` method is used to extract the related measures.
A simple example using `pymfe` for supervised tasks is given next::

# Load a dataset
from sklearn.datasets import load_iris
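The example in this hunk is truncated by the diff view. For reference, below is a minimal sketch of the workflow the new text describes (default groups with mean and standard deviation summaries), modeled on the iris-based examples added elsewhere in this commit rather than the file's exact contents:

    # Load a dataset
    from sklearn.datasets import load_iris
    from pymfe.mfe import MFE

    data = load_iris()
    y = data.target
    X = data.data

    # Fit the extractor with the default configuration, then extract the
    # measures; `extract` returns the meta-feature names and their values.
    mfe = MFE()
    mfe.fit(X, y)
    ft = mfe.extract()
    print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
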
2 changes: 2 additions & 0 deletions docs/sphinxext/github_link.py
@@ -1,3 +1,5 @@
# Authors : https://github.com/scikit-learn-contrib/imbalanced-learn/blob/master/doc/sphinxext/github_link.py

from operator import attrgetter
import inspect
import subprocess
36 changes: 36 additions & 0 deletions examples/01_introductory_examples/plot_extract_from_model.py
@@ -0,0 +1,36 @@
"""
Meta-features from a model
==========================
In this example, we will show you how to extract meta-features from a
pre-fitted model.
"""

# Load a dataset
import sklearn.tree
from sklearn.datasets import load_iris
from pymfe.mfe import MFE

iris = load_iris()

###############################################################################
# If you want to extract metafeatures from a pre-fitted machine learning model
# (from the sklearn package), you can use the `extract_from_model` method
# needing to use the training data:

# Extract from model

model = sklearn.tree.DecisionTreeClassifier().fit(iris.data, iris.target)
extractor = MFE()
ft = extractor.extract_from_model(model)
print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))

# Extract specific metafeatures from model
extractor = MFE(features=["tree_shape", "nodes_repeated"], summary="histogram")

ft = extractor.extract_from_model(
    model,
    arguments_fit={"verbose": 1},
    arguments_extract={"verbose": 1, "histogram": {"bins": 5}})

print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
@@ -0,0 +1,33 @@
"""
Extracting meta-features from unsupervised learning
===================================================
In this example we will show you how to extract meta-features from unsupervised
machine learning tasks.
"""

# Load a dataset
from sklearn.datasets import load_iris
from pymfe.mfe import MFE

data = load_iris()
y = data.target
X = data.data

###############################################################################
#
# You can simply omit the target attribute for unsupervised tasks while
# fitting the data into the MFE model. The `pymfe` package automatically finds
# and extracts only the metafeatures suitable for this type of task.

# Extract default unsupervised measures
mfe = MFE()
mfe.fit(X)
ft = mfe.extract()
print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))

# Extract all available unsupervised measures
mfe = MFE(groups="all")
mfe.fit(X)
ft = mfe.extract()
print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))
2 changes: 1 addition & 1 deletion examples/02_advanced_examples/README.txt
@@ -2,4 +2,4 @@ Advanced Examples
-----------------

These examples will show you how to use some advanced configurations and tricks
to make codification more comfortable.
to make coding more comfortable.
35 changes: 35 additions & 0 deletions examples/02_advanced_examples/plot_confidence_interval.py
@@ -0,0 +1,35 @@
"""
Meta-feature confidence interval
================================
In this example, we will show you how to extract meta-features with confidence
intervals.
"""

# Load a dataset
import sklearn.tree
from sklearn.datasets import load_iris
from pymfe.mfe import MFE

data = load_iris()
y = data.target
X = data.data

# You can also extract your meta-features with confidence intervals using
# bootstrap. Keep in mind that this method extracts each meta-feature several
# times, and may be very expensive depending mainly on your data and the
# number of meta-feature extraction methods called.

# Extract meta-features with confidence interval
mfe = MFE(features=["mean", "nr_cor_attr", "sd", "max"])
mfe.fit(X, y)

ft = mfe.extract_with_confidence(
    sample_num=256,
    confidence=0.99,
    verbose=1,
)

print("\n".join("{:50} {:30} {:30}".format(x, y[0], y[1])
for x, y in zip(ft[0], ft[2])))

2 changes: 1 addition & 1 deletion examples/02_advanced_examples/plot_custom_arguments.py
@@ -1,6 +1,6 @@
"""
Customizing measures arguments
===================================
==============================
In this example we will show you how to customize the measures.
"""
@@ -0,0 +1,57 @@
"""
Meta-feature confidence interval
================================
In this example, we will show you how the default value `max_attr_num` of
meta-feature `attr_conc` was solved.
"""

# Load a dataset
from sklearn.datasets import load_iris
import numpy as np
import pymfe.mfe
import matplotlib.pyplot as plt

iris = load_iris()

# A default value was added for the `max_attr_num` parameter of the
# `attr_conc` meta-feature extraction method, which is by far the most
# expensive meta-feature extraction method.

# The default value was determined by inspecting how the meta-feature
# extraction time grows with the number of attributes in the fitted data.
# The accepted threshold for the extraction time was less than 2 seconds.

# The test dataset was the iris dataset. The test code used is reproduced
# below.
np.random.seed(0)

arrsize = np.zeros(10)
time = np.zeros(10)

X = np.empty((iris.target.size, 0))

for i in np.arange(10):
    X = np.hstack((X, iris.data))
    print(f"{i}. Number of attributes: {X.shape[1]} ...")
    model = pymfe.mfe.MFE(features="attr_conc",
                          summary="mean",
                          measure_time="total").fit(X)
    res = model.extract(suppress_warnings=True)

    arrsize[i] = model._custom_args_ft["C"].shape[1]
    time[i] = res[2][0]

plt.plot(arrsize, time, label="time elapsed")
plt.hlines(y=np.arange(1, 1 + int(np.ceil(np.max(time)))),
           xmin=0,
           xmax=arrsize[-1],
           linestyle="dotted",
           color="red")
plt.legend()
plt.show()

# The time cost of extraction for the attr_conc meta-feature does not grow
# significantly with the number of instances and, hence, it is not necessary
# to sample along the instance axis.
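For context, here is a hedged sketch of how this default could be overridden at extraction time, following the per-method argument pattern used in the `extract_from_model` example above; the exact keyword routing of `extract` is an assumption to verify against the pymfe documentation:

    # Load a dataset
    from sklearn.datasets import load_iris
    from pymfe.mfe import MFE

    data = load_iris()

    mfe = MFE(features=["attr_conc"], summary=["mean"])
    mfe.fit(data.data, data.target)

    # Assumption: `extract` forwards this dict to the `attr_conc` method as
    # keyword arguments, raising `max_attr_num` above its new default.
    ft = mfe.extract(attr_conc={"max_attr_num": 30})
    print("\n".join("{:50} {:30}".format(x, y) for x, y in zip(ft[0], ft[1])))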
