Skip to content

Commit

Permalink
update to use tqdm instead of easydev/joblib
Browse files Browse the repository at this point in the history
  • Loading branch information
cokelaer committed Sep 14, 2022
1 parent d94875b commit 3327912
Show file tree
Hide file tree
Showing 11 changed files with 57 additions and 82 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ on:
- main
pull_request:
branches-ignore: []
schedule:
- cron: '0 0 * * SUN'

jobs:
build-linux:
Expand All @@ -14,6 +16,7 @@ jobs:
max-parallel: 5
matrix:
python: [3.6, 3.7, 3.8, 3.9]
fail-fast: false

steps:
- uses: actions/checkout@v2
Expand Down
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ FITTER documentation

Compatible with Python 3.6, 3.7, 3.8, and 3.9

.. image:: https://raw.githubusercontent.com/cokelaer/fitter/main/doc/_static/fitter_256x256.png
:target: https://raw.githubusercontent.com/cokelaer/fitter/main/doc/_static/fitter_256x256.png



What is it ?
################
Expand Down Expand Up @@ -102,6 +106,8 @@ Changelog
========= ==========================================================================
Version Description
========= ==========================================================================
1.5.0 * removed easydev; the progress bar is now provided by tqdm
* tqdm's thread_map also replaces joblib for parallel fitting
1.4.1 * Update timeout in docs from 10 to 30 seconds by @mpadge in
https://github.com/cokelaer/fitter/pull/47
* Add Kolmogorov-Smirnov goodness-of-fit statistic by @lahdjirayhan in
Expand Down
Binary file added doc/_static/fitter_256x256.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/_static/fitter_64x64.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/_static/fitter_680x680.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions doc/conf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
Expand All @@ -25,7 +24,7 @@
version = pkg_resources.require("fitter")[0].version

project = 'fitter'
copyright = '2019, Thomas Cokelaer'
copyright = '2019-2022, Thomas Cokelaer'
author = 'Thomas Cokelaer'

# The short X.Y version
Expand All @@ -51,7 +50,6 @@
'sphinx.ext.coverage',
'sphinx.ext.doctest',
'sphinx.ext.todo',
#'sphinx.ext.mathjax',
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
'matplotlib.sphinxext.plot_directive',
Expand Down Expand Up @@ -112,6 +110,8 @@
#
# html_sidebars = {}

html_logo = "_static/fitter_256x256.png"


# -- Options for HTMLHelp output ---------------------------------------------

Expand Down
9 changes: 4 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
easydev
numpy
click
matplotlib
scipy>=0.18
numpy
pandas
click
joblib
scipy>=0.18
tqdm
66 changes: 23 additions & 43 deletions src/fitter/fitter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is part of the fitter software
#
# Copyright (c) 2014
# Copyright (c) 2014-2022
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
Expand All @@ -27,8 +27,7 @@
import pandas as pd
import pylab
import scipy.stats
from easydev import Progress
from joblib import Parallel, delayed
from tqdm import tqdm
from scipy.stats import entropy as kl_div, kstest

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -199,17 +198,15 @@ def _init(self):
self._ks_stat = {}
self._ks_pval = {}
self._fit_i = 0 # fit progress
self.pb = None
#self.pb = None

def _update_data_pdf(self):
# Recompute the histogram of the currently selected data (self._data) using
# self.bins and self._density, storing the bin heights in self.y.
# np.histogram returns the bin edges, i.e. N+1 values for N bins, so the edges
# are rearranged into the N bin centres (midpoint of each pair of consecutive edges).
self.y, self.x = np.histogram(self._data, bins=self.bins, density=self._density)
# NOTE: after this line self.x is a plain Python list with the same length as self.y.
self.x = [(this + self.x[i + 1]) / 2.0 for i, this in enumerate(self.x[0:-1])]

def _trim_data(self):
self._data = self._alldata[
np.logical_and(self._alldata >= self._xmin, self._alldata <= self._xmax)
]
self._data = self._alldata[np.logical_and(self._alldata >= self._xmin, self._alldata <= self._xmax)]

def _get_xmin(self):
# Getter used by the ``xmin`` property: returns the stored lower bound.
return self._xmin
Expand All @@ -223,9 +220,7 @@ def _set_xmin(self, value):
self._trim_data()
self._update_data_pdf()

xmin = property(
_get_xmin, _set_xmin, doc="consider only data above xmin. reset if None"
)
xmin = property(_get_xmin, _set_xmin, doc="consider only data above xmin. reset if None")

def _get_xmax(self):
# Getter used by the ``xmax`` property: returns the stored upper bound.
return self._xmax
Expand All @@ -239,9 +234,7 @@ def _set_xmax(self, value):
self._trim_data()
self._update_data_pdf()

xmax = property(
_get_xmax, _set_xmax, doc="consider only data below xmax. reset if None "
)
xmax = property(_get_xmax, _set_xmax, doc="consider only data below xmax. reset if None ")

def _load_all_distributions(self):
"""Replace the :attr:`distributions` attribute with all scipy distributions"""
Expand All @@ -263,7 +256,7 @@ def hist(self):
_ = pylab.hist(self._data, bins=self.bins, density=self._density)
pylab.grid(True)

def _fit_single_distribution(self, distribution, progress: bool):
def _fit_single_distribution(self, distribution):
try:
# need a subprocess to check time it takes. If too long, skip it
dist = eval("scipy.stats." + distribution)
Expand Down Expand Up @@ -300,9 +293,7 @@ def _fit_single_distribution(self, distribution, progress: bool):
dist_fitted = dist(*param)
ks_stat, ks_pval = kstest(self._data, dist_fitted.cdf)

logging.info(
"Fitted {} distribution with error={})".format(distribution, sq_error)
)
logging.info("Fitted {} distribution with error={})".format(distribution, sq_error))

# compute some errors now
self._fitted_errors[distribution] = sq_error
Expand All @@ -312,21 +303,17 @@ def _fit_single_distribution(self, distribution, progress: bool):
self._ks_stat[distribution] = ks_stat
self._ks_pval[distribution] = ks_pval
except Exception: # pragma: no cover
logging.warning(
"SKIPPED {} distribution (taking more than {} seconds)".format(
distribution, self.timeout
)
)
logging.warning("SKIPPED {} distribution (taking more than {} seconds)".format(distribution, self.timeout))
# if we cannot compute the error, set it to large values
self._fitted_errors[distribution] = np.inf
self._aic[distribution] = np.inf
self._bic[distribution] = np.inf
self._kldiv[distribution] = np.inf
if progress:
self._fit_i += 1
self.pb.animate(self._fit_i)
#if progress:
# self._fit_i += 1
# #self.pb.animate(self._fit_i)

def fit(self, amp=1, progress=False, n_jobs=-1):
def fit(self, progress=False, n_jobs=-1):
r"""Loop over distributions and find best parameter to fit the data for each
When a distribution is fitted onto the data, we populate a set of
Expand All @@ -344,23 +331,22 @@ def fit(self, amp=1, progress=False, n_jobs=-1):

warnings.filterwarnings("ignore", category=RuntimeWarning)

if progress:
self.pb = Progress(len(self.distributions))
from tqdm.contrib.concurrent import thread_map

result = thread_map(self._fit_single_distribution, self.distributions, max_workers=4, disable=not progress)

#jobs = (delayed(self._fit_single_distribution)(dist, progress) for dist in self.distributions)
#pool = Parallel(n_jobs=n_jobs, backend="threading")
#_ = pool(jobs)

jobs = (
delayed(self._fit_single_distribution)(dist, progress)
for dist in self.distributions
)
pool = Parallel(n_jobs=n_jobs, backend="threading")
_ = pool(jobs)
self.df_errors = pd.DataFrame(
{
"sumsquare_error": self._fitted_errors,
"aic": self._aic,
"bic": self._bic,
"kl_div": self._kldiv,
"ks_statistic": self._ks_stat,
"ks_pvalue": self._ks_pval
"ks_pvalue": self._ks_pval,
}
)

Expand Down Expand Up @@ -406,11 +392,7 @@ def get_best(self, method="sumsquare_error"):
name = self.df_errors.sort_values(method).iloc[0].name
params = self.fitted_param[name]
distribution = getattr(scipy.stats, name)
param_names = (
(distribution.shapes + ", loc, scale").split(", ")
if distribution.shapes
else ["loc", "scale"]
)
param_names = (distribution.shapes + ", loc, scale").split(", ") if distribution.shapes else ["loc", "scale"]

param_dict = {}
for d_key, d_val in zip(param_names, params):
Expand Down Expand Up @@ -463,9 +445,7 @@ def suicide(self): # pragma: no cover
ended_at = datetime.now()
diff = ended_at - started_at

if (
it.exc_info[0] is not None
): # pragma: no cover ; if there were any exceptions
if it.exc_info[0] is not None: # pragma: no cover ; if there were any exceptions
a, b, c = it.exc_info
raise Exception(a, b, c) # communicate that to caller

Expand Down
30 changes: 6 additions & 24 deletions src/fitter/histfit.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ def __init__(self, data=None, X=None, Y=None, bins=None):

self.N = len(self.X)
self.guess_mean = self.X[int(self.N / 2)]
self.guess_std = sqrt(sum((self.X - mean(self.X)) ** 2) / self.N) / (
sqrt(2 * 3.14)
)
self.guess_std = sqrt(sum((self.X - mean(self.X)) ** 2) / self.N) / (sqrt(2 * 3.14))
self.guess_amp = 1.0

self.func = self._func_normal
Expand All @@ -112,16 +110,10 @@ def fit(
# 5% error on the data to add errors
self.E = [scipy.stats.norm.rvs(0, error_rate) for y in self.Y]
# [scipy.stats.norm.rvs(0, self.std_data * error_rate) for x in range(self.N)]
self.result = scipy.optimize.least_squares(
self.func, (self.guess_mean, self.guess_std, self.guess_amp)
)
self.result = scipy.optimize.least_squares(self.func, (self.guess_mean, self.guess_std, self.guess_amp))

mu, sigma, amplitude = self.result["x"]
pylab.plot(
self.X,
amplitude * scipy.stats.norm.pdf(self.X, mu, sigma),
**error_kwargs
)
pylab.plot(self.X, amplitude * scipy.stats.norm.pdf(self.X, mu, sigma), **error_kwargs)
self.sigmas.append(sigma)
self.amplitudes.append(amplitude)
self.mus.append(mu)
Expand All @@ -131,11 +123,7 @@ def fit(
self.amplitude = mean(self.amplitudes)
self.mu = mean(self.mus)

pylab.plot(
self.X,
self.amplitude * scipy.stats.norm.pdf(self.X, self.mu, self.sigma),
**fit_kwargs
)
pylab.plot(self.X, self.amplitude * scipy.stats.norm.pdf(self.X, self.mu, self.sigma), **fit_kwargs)
if semilogy:
pylab.semilogy()
pylab.grid()
Expand All @@ -150,18 +138,12 @@ def fit(
pylab.fill_between(self.X, M - S, M + S, color="gray", alpha=0.5)
# pylab.plot(self.X, M-S, color="k")
# pylab.plot(self.X, M+S, color="k")
pylab.plot(
self.X,
self.amplitude * scipy.stats.norm.pdf(self.X, self.mu, self.sigma),
**fit_kwargs
)
pylab.plot(self.X, self.amplitude * scipy.stats.norm.pdf(self.X, self.mu, self.sigma), **fit_kwargs)
pylab.grid()

return self.mu, self.sigma, self.amplitude

def _func_normal(self, param):
# amplitude is supposed to be 1./(np.sqrt(2*np.pi)*sigma)* if normalised
mu, sigma, A = param
return sum(
(A * scipy.stats.norm.pdf(self.X, mu, sigma) - (self.Y + self.E)) ** 2
)
return sum((A * scipy.stats.norm.pdf(self.X, mu, sigma) - (self.Y + self.E)) ** 2)
14 changes: 8 additions & 6 deletions src/fitter/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,14 @@ def main(): # pragma: no cover
@main.command()
@click.argument("filename", type=click.STRING)
@click.option("--column-number", type=click.INT, default=1)
@click.option(
"--delimiter", type=click.STRING, default=",", help="look at the first column"
)
@click.option("--delimiter", type=click.STRING, default=",", help="look at the first column")
@click.option(
"--distributions",
type=click.STRING,
default="gamma,beta",
help="llist of distribution",
)
@click.option(
"--tag", type=click.STRING, default="fitter", help="tag to name output files"
)
@click.option("--tag", type=click.STRING, default="fitter", help="tag to name output files")
@click.option("--progress/--no-progress", default=True)
@click.option("--verbose/--no-verbose", default=True)
def fitdist(**kwargs):
Expand Down Expand Up @@ -94,6 +90,12 @@ def fitdist(**kwargs):
with open("{}.log".format(tag), "w") as fout:
fout.write(msg)

@main.command()
def show_distributions(**kwargs):
# Command registered through ``main.command()``: prints the names returned by
# ``fitter.get_distributions()``, one per line.
# ``kwargs`` is accepted but not used in the body.
# The import is done at call time, inside the command.
from fitter import get_distributions
print("\n".join(get_distributions()))



if __name__ == "__main__": # pragma: no cover
main()
5 changes: 4 additions & 1 deletion test/test_main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest
from pathlib import Path
from easydev import TempFile
import subprocess
from scipy import stats
from fitter.main import fitdist
from fitter.main import show_distributions

@pytest.fixture
def setup_teardown():
Expand Down Expand Up @@ -37,3 +37,6 @@ def test_main_app(setup_teardown):

results = runner.invoke(fitdist, ['test.csv', "--progress", "--column-number", 1])
assert results.exit_code == 0

results = runner.invoke(show_distributions, [])
assert results.exit_code == 0

0 comments on commit 3327912

Please sign in to comment.