diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f312232..21ae22e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,15 +8,46 @@ on: branches: - master workflow_dispatch: + inputs: + force: + description: 'Set to "true" to mark this run as forced when manually triggered' + required: false + default: 'false' jobs: build: + # Skip this job on push events when the head commit message contains [skip ci] + if: ${{ github.event_name != 'push' || !contains(github.event.head_commit.message, '[skip ci]') }} + permissions: + contents: write runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v3 + with: + fetch-depth: 0 + persist-credentials: true - name: Setup Quarto uses: quarto-dev/quarto-actions/setup@v2 + - name: refresh publications and commit changes + if: ${{ github.event_name == 'workflow_dispatch' || github.event.inputs.force == 'true' }} + env: + API_GITHUB_TOKEN: ${{ secrets.API_GITHUB_TOKEN }} + run: | + dotnet fsi getcomputo-pub.fsx + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + # Stage the generated files (ignore errors if files missing) + git add site/published.yml site/pipeline.yml site/mock-papers.yml || true + # Only commit if there are staged changes + if git diff --staged --quiet; then + echo "No publication changes to commit" + else + git commit -m "Update publications from getcomputo-pub.fsx [skip ci]" + # push to the branch that triggered the workflow + git push origin HEAD:${{ github.ref_name }} + fi + - name: Build site uses: quarto-dev/quarto-actions/render@v2 - name: Upload artifact diff --git a/README.md b/README.md index d7c9a90..d15feab 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,9 @@ This repository stores the source of Computorg. Our website has been built with [Quarto](https://quarto.org), an open-source scientific and technical publishing system. The first thing you need to compile the website is therefore to install Quarto, which can be done by downloading the corresponing installer here: . -::: {.callout-note} ## Positron If you are using the new [Positron IDE](https://positron.posit.co), quarto is already bundled with it. You can simply type `which quarto` within the built-in terminal in Positron and add the returned path to your `PATH`. -::: ### Microsoft DotNet SDK diff --git a/_quarto.yml b/_quarto.yml index 7724844..f61cab8 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -1,7 +1,6 @@ project: type: website output-dir: _site -# pre-render: dotnet fsi getcomputo-pub.fsx website: title: COMPUTO site-url: https://computo.sfds.asso.fr/ diff --git a/getcomputo-pub.fsx b/getcomputo-pub.fsx index 237fd21..f58321d 100644 --- a/getcomputo-pub.fsx +++ b/getcomputo-pub.fsx @@ -15,16 +15,16 @@ open DrBiber open System.Threading.Tasks // exit if QUARTO_PROJECT_RENDER_ALL is set in the environment -if System.Environment.GetEnvironmentVariable("QUARTO_PROJECT_RENDER_ALL") = null then - printfn "QUARTO_PROJECT_RENDER_ALL is not set, exiting." - exit 0 +// if System.Environment.GetEnvironmentVariable("QUARTO_PROJECT_RENDER_ALL") = null then +// printfn "QUARTO_PROJECT_RENDER_ALL is not set, exiting." +// exit 0 // Load environment variables from .env file Env.Load(".env-secret") let client = let client = new GitHubClient(new ProductHeaderValue("computo")) // Using environment variable for token is a good security practice - match System.Environment.GetEnvironmentVariable("GITHUB_TOKEN") with + match System.Environment.GetEnvironmentVariable("API_GITHUB_TOKEN") with | null | "" -> client // No authentication | token -> diff --git a/site/mock-papers.yml b/site/mock-papers.yml index c9915cd..d4e923d 100644 --- a/site/mock-papers.yml +++ b/site/mock-papers.yml @@ -1,4 +1,69 @@ -- abstract': >- +- abstract'@: >- + We present a new technique called “t-SNE” that visualizes + high-dimensional data by giving each datapoint a location in a two + or three-dimensional map. The technique is a variation of Stochastic + Neighbor Embedding {[}@hinton:stochastic{]} that is much easier to + optimize, and produces significantly better visualizations by + reducing the tendency to crowd points together in the center of the + map. t-SNE is better than existing techniques at creating a single + map that reveals structure at many different scales. This is + particularly important for high-dimensional data that lie on several + different, but related, low-dimensional manifolds, such as images of + objects from multiple classes seen from multiple viewpoints. For + visualizing the structure of very large data sets, we show how t-SNE + can use random walks on neighborhood graphs to allow the implicit + structure of all the data to influence the way in which a subset of + the data is displayed. We illustrate the performance of t-SNE on a + wide variety of data sets and compare it with many other + non-parametric visualization techniques, including Sammon mapping, + Isomap, and Locally Linear Embedding. The visualization produced by + t-SNE are significantly better than those produced by other + techniques on almost all of the data sets. + authors@: Laurens van der Maaten and Geoffrey Hinton + bibtex@: >+ + @article{van_der_maaten2008, + author = {van der Maaten, Laurens and Hinton, Geoffrey}, + publisher = {French Statistical Society}, + title = {Visualizing {Data} Using {t-SNE} (Mock Contributon)}, + journal = {Computo}, + date = {2008-08-11}, + doi = {10.57750/xxxxxx}, + issn = {2824-7795}, + langid = {en}, + abstract = {We present a new technique called “t-SNE” that visualizes + high-dimensional data by giving each datapoint a location in a two + or three-dimensional map. The technique is a variation of Stochastic + Neighbor Embedding {[}@hinton:stochastic{]} that is much easier to + optimize, and produces significantly better visualizations by + reducing the tendency to crowd points together in the center of the + map. t-SNE is better than existing techniques at creating a single + map that reveals structure at many different scales. This is + particularly important for high-dimensional data that lie on several + different, but related, low-dimensional manifolds, such as images of + objects from multiple classes seen from multiple viewpoints. For + visualizing the structure of very large data sets, we show how t-SNE + can use random walks on neighborhood graphs to allow the implicit + structure of all the data to influence the way in which a subset of + the data is displayed. We illustrate the performance of t-SNE on a + wide variety of data sets and compare it with many other + non-parametric visualization techniques, including Sammon mapping, + Isomap, and Locally Linear Embedding. The visualization produced by + t-SNE are significantly better than those produced by other + techniques on almost all of the data sets.} + } + + date@: 2008-08-11 + description@: > + This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit. + doi@: 10.57750/xxxxxx + draft@: false + journal@: Computo + pdf@: '' + repo@: published-paper-tsne + title@: Visualizing Data using t-SNE (mock contributon) + url@: '' + year@: 2008 + abstract': >- We present a new technique called “t-SNE” that visualizes high-dimensional data by giving each datapoint a location in a two or three-dimensional map. The technique is a variation of Stochastic @@ -23,12 +88,11 @@ bibtex: >+ @article{van_der_maaten2008, author = {van der Maaten, Laurens and Hinton, Geoffrey}, - publisher = {Société Française de Statistique}, - title = {Visualizing {Data} Using {t-SNE:} A Practical Computo Example - (Mock)}, + publisher = {French Statistical Society}, + title = {Visualizing {Data} Using {t-SNE} (Mock Contributon)}, journal = {Computo}, date = {2008-08-11}, - url = {https://computo.sfds.asso.fr/published-paper-tsne}, + doi = {10.57750/xxxxxx}, issn = {2824-7795}, langid = {en}, abstract = {We present a new technique called “t-SNE” that visualizes @@ -56,15 +120,80 @@ date: 2008-08-11 description: > This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit. - doi: '' + doi: 10.57750/xxxxxx draft: false journal: Computo pdf: '' repo: published-paper-tsne title: Visualizing Data using t-SNE (mock contributon) - url: https://computo-journal.org/published-paper-tsne + url: '' year: 2008 -- abstract': >- +- abstract'@: >- + We present a new technique called “t-SNE” that visualizes + high-dimensional data by giving each datapoint a location in a two + or three-dimensional map. The technique is a variation of Stochastic + Neighbor Embeddi{[}@hinton:stochastic{]} that is much easier to + optimize, and produces significantly better visualizations by + reducing the tendency to crowd points together in the center of the + map. t-SNE is better than existing techniques at creating a single + map that reveals structure at many different scales. This is + particularly important for high-dimensional data that lie on several + different, but related, low-dimensional manifolds, such as images of + objects from multiple classes seen from multiple viewpoints. For + visualizing the structure of very large data sets, we show how t-SNE + can use random walks on neighborhood graphs to allow the implicit + structure of all the data to influence the way in which a subset of + the data is displayed. We illustrate the performance of t-SNE on a + wide variety of data sets and compare it with many other + non-parametric visualization techniques, including Sammon mapping, + Isomap, and Locally Linear Embedding. The visualization produced by + t-SNE are significantly better than those produced by other + techniques on almost all of the data sets. + authors@: Laurens van der Maaten and Geoffrey Hinton + bibtex@: >+ + @article{van_der_maaten2008, + author = {van der Maaten, Laurens and Hinton, Geoffrey}, + publisher = {French Statistical Society}, + title = {Visualizing {Data} Using {t-SNE} (Mock Contributon)}, + journal = {Computo}, + date = {2008-08-11}, + doi = {10.57750/xxxxxx}, + issn = {2824-7795}, + langid = {en}, + abstract = {We present a new technique called “t-SNE” that visualizes + high-dimensional data by giving each datapoint a location in a two + or three-dimensional map. The technique is a variation of Stochastic + Neighbor Embeddi{[}@hinton:stochastic{]} that is much easier to + optimize, and produces significantly better visualizations by + reducing the tendency to crowd points together in the center of the + map. t-SNE is better than existing techniques at creating a single + map that reveals structure at many different scales. This is + particularly important for high-dimensional data that lie on several + different, but related, low-dimensional manifolds, such as images of + objects from multiple classes seen from multiple viewpoints. For + visualizing the structure of very large data sets, we show how t-SNE + can use random walks on neighborhood graphs to allow the implicit + structure of all the data to influence the way in which a subset of + the data is displayed. We illustrate the performance of t-SNE on a + wide variety of data sets and compare it with many other + non-parametric visualization techniques, including Sammon mapping, + Isomap, and Locally Linear Embedding. The visualization produced by + t-SNE are significantly better than those produced by other + techniques on almost all of the data sets.} + } + + date@: 2008-08-11 + description@: > + This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit. + doi@: 10.57750/xxxxxx + draft@: false + journal@: Computo + pdf@: '' + repo@: published-paper-tsne-R + title@: Visualizing Data using t-SNE (mock contributon) + url@: '' + year@: 2008 + abstract': >- We present a new technique called “t-SNE” that visualizes high-dimensional data by giving each datapoint a location in a two or three-dimensional map. The technique is a variation of Stochastic @@ -90,11 +219,10 @@ @article{van_der_maaten2008, author = {van der Maaten, Laurens and Hinton, Geoffrey}, publisher = {French Statistical Society}, - title = {Visualizing {Data} Using {t-SNE:} A Practical {Computo} - Example (Mock)}, + title = {Visualizing {Data} Using {t-SNE} (Mock Contributon)}, journal = {Computo}, date = {2008-08-11}, - url = {https://computo-journal.org/published-paper-tsne-R}, + doi = {10.57750/xxxxxx}, issn = {2824-7795}, langid = {en}, abstract = {We present a new technique called “t-SNE” that visualizes @@ -122,11 +250,11 @@ date: 2008-08-11 description: > This page is a reworking of the original t-SNE article using the Computo template. It aims to help authors submitting to the journal by using some advanced formatting features. We warmly thank the authors of t-SNE and the editor of JMLR for allowing us to use their work to illustrate the Computo spirit. - doi: '' + doi: 10.57750/xxxxxx draft: false journal: Computo pdf: '' repo: published-paper-tsne-R title: Visualizing Data using t-SNE (mock contributon) - url: https://computo-journal.org/published-paper-tsne-R + url: '' year: 2008 diff --git a/site/published.yml b/site/published.yml index c2d8746..264ec1c 100644 --- a/site/published.yml +++ b/site/published.yml @@ -1,4 +1,133 @@ -- abstract': >- +- abstract'@: >- + This study investigates the use of Variational + Auto-Encoders to build a simulator that approximates the law of + genuine observations. Using both simulated and real data in + scenarios involving counterfactuality, we discuss the general task + of evaluating a simulator’s quality, with a focus on comparisons of + statistical properties and predictive performance. While the + simulator built from simulated data shows minor discrepancies, the + results with real data reveal more substantial challenges. Beyond + the technical analysis, we reflect on the broader implications of + simulator design, and consider its role in modeling reality. + authors@: Sandrine Boulet and Antoine Chambaz + bibtex@: >+ + @article{boulet2025, + author = {Boulet, Sandrine and Chambaz, Antoine}, + publisher = {French Statistical Society}, + title = {Draw {Me} a {Simulator}}, + journal = {Computo}, + date = {2025-09-08}, + doi = {10.57750/w1hj-dw22}, + issn = {2824-7795}, + langid = {en}, + abstract = {This study investigates the use of Variational + Auto-Encoders to build a simulator that approximates the law of + genuine observations. Using both simulated and real data in + scenarios involving counterfactuality, we discuss the general task + of evaluating a simulator’s quality, with a focus on comparisons of + statistical properties and predictive performance. While the + simulator built from simulated data shows minor discrepancies, the + results with real data reveal more substantial challenges. Beyond + the technical analysis, we reflect on the broader implications of + simulator design, and consider its role in modeling reality.} + } + + date@: 2025-09-08 + description@: '' + doi@: 10.57750/w1hj-dw22 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202509-boulet-simulator + title@: Draw Me a Simulator + url@: '' + year@: 2025 + abstract': >- + This study investigates the use of Variational + Auto-Encoders to build a simulator that approximates the law of + genuine observations. Using both simulated and real data in + scenarios involving counterfactuality, we discuss the general task + of evaluating a simulator’s quality, with a focus on comparisons of + statistical properties and predictive performance. While the + simulator built from simulated data shows minor discrepancies, the + results with real data reveal more substantial challenges. Beyond + the technical analysis, we reflect on the broader implications of + simulator design, and consider its role in modeling reality. + authors: Sandrine Boulet and Antoine Chambaz + bibtex: >+ + @article{boulet2025, + author = {Boulet, Sandrine and Chambaz, Antoine}, + publisher = {French Statistical Society}, + title = {Draw {Me} a {Simulator}}, + journal = {Computo}, + date = {2025-09-08}, + doi = {10.57750/w1hj-dw22}, + issn = {2824-7795}, + langid = {en}, + abstract = {This study investigates the use of Variational + Auto-Encoders to build a simulator that approximates the law of + genuine observations. Using both simulated and real data in + scenarios involving counterfactuality, we discuss the general task + of evaluating a simulator’s quality, with a focus on comparisons of + statistical properties and predictive performance. While the + simulator built from simulated data shows minor discrepancies, the + results with real data reveal more substantial challenges. Beyond + the technical analysis, we reflect on the broader implications of + simulator design, and consider its role in modeling reality.} + } + + date: 2025-09-08 + description: '' + doi: 10.57750/w1hj-dw22 + draft: false + journal: Computo + pdf: '' + repo: published-202509-boulet-simulator + title: Draw Me a Simulator + url: '' + year: 2025 +- abstract'@: >- + Model-based clustering provides a principled way of + developing clustering methods. We develop a new model-based + clustering methods for count data. The method combines clustering + and variable selection for improved clustering. The method is based + on conditionally independent Poisson mixture models and Poisson + generalized linear models. The method is demonstrated on simulated + data and data from an ultra running race, where the method yields + excellent clustering and variable selection performance. + authors@: Julien Jacques and Thomas Brendan Murphy + bibtex@: >+ + @article{jacques2025, + author = {Jacques, Julien and Brendan Murphy, Thomas}, + publisher = {French Statistical Society}, + title = {Model-Based {Clustering} and {Variable} {Selection} for + {Multivariate} {Count} {Data}}, + journal = {Computo}, + date = {2025-07-01}, + doi = {10.57750/6v7b-8483}, + issn = {2824-7795}, + langid = {en}, + abstract = {Model-based clustering provides a principled way of + developing clustering methods. We develop a new model-based + clustering methods for count data. The method combines clustering + and variable selection for improved clustering. The method is based + on conditionally independent Poisson mixture models and Poisson + generalized linear models. The method is demonstrated on simulated + data and data from an ultra running race, where the method yields + excellent clustering and variable selection performance.} + } + + date@: 2025-07-01 + description@: '' + doi@: 10.57750/6v7b-8483 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202507-jacques-count-data + title@: Model-Based Clustering and Variable Selection for Multivariate Count Data + url@: '' + year@: 2025 + abstract': >- Model-based clustering provides a principled way of developing clustering methods. We develop a new model-based clustering methods for count data. The method combines clustering @@ -39,7 +168,74 @@ title: Model-Based Clustering and Variable Selection for Multivariate Count Data url: '' year: 2025 -- abstract': >- +- abstract'@: >- + Reservoir Computing (RC) is a machine learning method + based on neural networks that efficiently process information + generated by dynamical systems. It has been successful in solving + various tasks including time series forecasting, language processing + or voice processing. RC is implemented in `Python` and `Julia` but + not in `R`. This article introduces `reservoirnet`, an `R` package + providing access to the `Python` API `ReservoirPy`, allowing `R` + users to harness the power of reservoir computing. This article + provides an introduction to the fundamentals of RC and showcases its + real-world applicability through three distinct sections. First, we + cover the foundational concepts of RC, setting the stage for + understanding its capabilities. Next, we delve into the practical + usage of `reservoirnet` through two illustrative examples. These + examples demonstrate how it can be applied to real-world problems, + specifically, regression of COVID-19 hospitalizations and + classification of Japanese vowels. Finally, we present a + comprehensive analysis of a real-world application of + `reservoirnet`, where it was used to forecast COVID-19 + hospitalizations at Bordeaux University Hospital using public data + and electronic health records. + authors@: Thomas Ferté, Kalidou Ba, Dan Dutartre, Pierrick Legrand, Vianney Jouhet, Rodolphe Thiébaut, Xavier Hinaut and Boris P Hejblum + bibtex@: >+ + @article{ferté2025, + author = {Ferté, Thomas and Ba, Kalidou and Dutartre, Dan and Legrand, + Pierrick and Jouhet, Vianney and Thiébaut, Rodolphe and Hinaut, + Xavier and P Hejblum, Boris}, + publisher = {French Statistical Society}, + title = {Reservoir {Computing} in {R:} A {Tutorial} for {Using} + Reservoirnet to {Predict} {Complex} {Time-Series}}, + journal = {Computo}, + date = {2025-06-27}, + doi = {10.57750/arxn-6z34}, + issn = {2824-7795}, + langid = {en}, + abstract = {Reservoir Computing (RC) is a machine learning method + based on neural networks that efficiently process information + generated by dynamical systems. It has been successful in solving + various tasks including time series forecasting, language processing + or voice processing. RC is implemented in `Python` and `Julia` but + not in `R`. This article introduces `reservoirnet`, an `R` package + providing access to the `Python` API `ReservoirPy`, allowing `R` + users to harness the power of reservoir computing. This article + provides an introduction to the fundamentals of RC and showcases its + real-world applicability through three distinct sections. First, we + cover the foundational concepts of RC, setting the stage for + understanding its capabilities. Next, we delve into the practical + usage of `reservoirnet` through two illustrative examples. These + examples demonstrate how it can be applied to real-world problems, + specifically, regression of COVID-19 hospitalizations and + classification of Japanese vowels. Finally, we present a + comprehensive analysis of a real-world application of + `reservoirnet`, where it was used to forecast COVID-19 + hospitalizations at Bordeaux University Hospital using public data + and electronic health records.} + } + + date@: 2025-06-27 + description@: '' + doi@: 10.57750/arxn-6z34 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202505-ferte-reservoirnet + title@: 'Reservoir Computing in R: a Tutorial for Using reservoirnet to Predict Complex Time-Series' + url@: '' + year@: 2025 + abstract': >- Reservoir Computing (RC) is a machine learning method based on neural networks that efficiently process information generated by dynamical systems. It has been successful in solving @@ -106,7 +302,62 @@ title: 'Reservoir Computing in R: a Tutorial for Using reservoirnet to Predict Complex Time-Series' url: '' year: 2025 -- abstract': >- +- abstract'@: >- + The `R` Package `IBMPopSim` facilitates the simulation of + the random evolution of heterogeneous populations using stochastic + Individual-Based Models (IBMs). The package enables users to + simulate population evolution, in which individuals are + characterized by their age and some characteristics, and the + population is modified by different types of events, including + births/arrivals, death/exit events, or changes of characteristics. + The frequency at which an event can occur to an individual can + depend on their age and characteristics, but also on the + characteristics of other individuals (interactions). Such models + have a wide range of applications in fields including actuarial + science, biology, ecology or epidemiology. `IBMPopSim` overcomes the + limitations of time-consuming IBMs simulations by implementing new + efficient algorithms based on thinning methods, which are compiled + using the `Rcpp` package while providing a user-friendly interface. + authors@: Daphné Giorgi, Sarah Kaakai and Vincent Lemaire + bibtex@: >+ + @article{giorgi2025, + author = {Giorgi, Daphné and Kaakai, Sarah and Lemaire, Vincent}, + publisher = {French Statistical Society}, + title = {Efficient Simulation of Individual-Based Population Models}, + journal = {Computo}, + date = {2025-01-27}, + doi = {10.57750/sfxn-1t05}, + issn = {2824-7795}, + langid = {en}, + abstract = {The `R` Package `IBMPopSim` facilitates the simulation of + the random evolution of heterogeneous populations using stochastic + Individual-Based Models (IBMs). The package enables users to + simulate population evolution, in which individuals are + characterized by their age and some characteristics, and the + population is modified by different types of events, including + births/arrivals, death/exit events, or changes of characteristics. + The frequency at which an event can occur to an individual can + depend on their age and characteristics, but also on the + characteristics of other individuals (interactions). Such models + have a wide range of applications in fields including actuarial + science, biology, ecology or epidemiology. `IBMPopSim` overcomes the + limitations of time-consuming IBMs simulations by implementing new + efficient algorithms based on thinning methods, which are compiled + using the `Rcpp` package while providing a user-friendly interface.} + } + + date@: 2025-01-27 + description@: > + This document provides a full description of the Stochastic Individual-Based Models (IBMs) that can be implemented in the IBMPopSim package. A unified mathematical and simulation framework is given, with a detailed description of the simulation algorithm. Examples of applications for the package are also provided, showing the performance and flexibility of IBMPopSim. + doi@: 10.57750/sfxn-1t05 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202412-giorgi-efficient + title@: Efficient simulation of individual-based population models + url@: '' + year@: 2025 + abstract': >- The `R` Package `IBMPopSim` facilitates the simulation of the random evolution of heterogeneous populations using stochastic Individual-Based Models (IBMs). The package enables users to @@ -161,7 +412,65 @@ title: Efficient simulation of individual-based population models url: '' year: 2025 -- abstract': >- +- abstract'@: >- + In this paper, Spectral Bridges, a novel clustering + algorithm, is introduced. This algorithm builds upon the traditional + k-means and spectral clustering frameworks by subdividing data into + small Voronoï regions, which are subsequently merged according to a + connectivity measure. Drawing inspiration from Support Vector + Machine’s margin concept, a non-parametric clustering approach is + proposed, building an affinity margin between each pair of Voronoï + regions. This approach delineates intricate, non-convex cluster + structures and is robust to hyperparameter choice. The numerical + experiments underscore Spectral Bridges as a fast, robust, and + versatile tool for clustering tasks spanning diverse domains. Its + efficacy extends to large-scale scenarios encompassing both + real-world and synthetic datasets. The Spectral Bridge algorithm is + implemented both in Python (\textless + https://pypi.org/project/spectral-bridges\textgreater) and R + \textless + https://github.com/cambroise/spectral-bridges-Rpackage\textgreater). + authors@: Félix Laplante and Christophe Ambroise + bibtex@: >+ + @article{laplante2024, + author = {Laplante, Félix and Ambroise, Christophe}, + publisher = {French Statistical Society}, + title = {Spectral {Bridges}}, + journal = {Computo}, + date = {2024-12-13}, + doi = {10.57750/1gr8-bk61}, + issn = {2824-7795}, + langid = {en}, + abstract = {In this paper, Spectral Bridges, a novel clustering + algorithm, is introduced. This algorithm builds upon the traditional + k-means and spectral clustering frameworks by subdividing data into + small Voronoï regions, which are subsequently merged according to a + connectivity measure. Drawing inspiration from Support Vector + Machine’s margin concept, a non-parametric clustering approach is + proposed, building an affinity margin between each pair of Voronoï + regions. This approach delineates intricate, non-convex cluster + structures and is robust to hyperparameter choice. The numerical + experiments underscore Spectral Bridges as a fast, robust, and + versatile tool for clustering tasks spanning diverse domains. Its + efficacy extends to large-scale scenarios encompassing both + real-world and synthetic datasets. The Spectral Bridge algorithm is + implemented both in Python (\textless + https://pypi.org/project/spectral-bridges\textgreater) and R + \textless + https://github.com/cambroise/spectral-bridges-Rpackage\textgreater).} + } + + date@: 2024-12-13 + description@: Scalable Spectral Clustering Based on Vector Quantization + doi@: 10.57750/1gr8-bk61 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202412-ambroise-spectral + title@: Spectral Bridges + url@: '' + year@: 2024 + abstract': >- In this paper, Spectral Bridges, a novel clustering algorithm, is introduced. This algorithm builds upon the traditional k-means and spectral clustering frameworks by subdividing data into @@ -219,7 +528,66 @@ title: Spectral Bridges url: '' year: 2024 -- abstract': >- +- abstract'@: >- + Conformal Inference (CI) is a popular approach for + generating finite sample prediction intervals based on the output of + any point prediction method when data are exchangeable. Adaptive + Conformal Inference (ACI) algorithms extend CI to the case of + sequentially observed data, such as time series, and exhibit strong + theoretical guarantees without having to assume exchangeability of + the observed data. The common thread that unites algorithms in the + ACI family is that they adaptively adjust the width of the generated + prediction intervals in response to the observed data. We provide a + detailed description of five ACI algorithms and their theoretical + guarantees, and test their performance in simulation studies. We + then present a case study of producing prediction intervals for + influenza incidence in the United States based on black-box point + forecasts. Implementations of all the algorithms are released as an + open-source `R` package, `AdaptiveConformal`, which also includes + tools for visualizing and summarizing conformal prediction + intervals. + authors@: Herbert Susmann, Antoine Chambaz and Julie Josse + bibtex@: >+ + @article{susmann2024, + author = {Susmann, Herbert and Chambaz, Antoine and Josse, Julie}, + publisher = {French Statistical Society}, + title = {AdaptiveConformal: {An} {`R`} {Package} for {Adaptive} + {Conformal} {Inference}}, + journal = {Computo}, + date = {2024-07-18}, + doi = {10.57750/edan-5f53}, + issn = {2824-7795}, + langid = {en}, + abstract = {Conformal Inference (CI) is a popular approach for + generating finite sample prediction intervals based on the output of + any point prediction method when data are exchangeable. Adaptive + Conformal Inference (ACI) algorithms extend CI to the case of + sequentially observed data, such as time series, and exhibit strong + theoretical guarantees without having to assume exchangeability of + the observed data. The common thread that unites algorithms in the + ACI family is that they adaptively adjust the width of the generated + prediction intervals in response to the observed data. We provide a + detailed description of five ACI algorithms and their theoretical + guarantees, and test their performance in simulation studies. We + then present a case study of producing prediction intervals for + influenza incidence in the United States based on black-box point + forecasts. Implementations of all the algorithms are released as an + open-source `R` package, `AdaptiveConformal`, which also includes + tools for visualizing and summarizing conformal prediction + intervals.} + } + + date@: 2024-07-18 + description@: '' + doi@: 10.57750/edan-5f53 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202407-susmann-adaptive-conformal + title@: 'AdaptiveConformal: An `R` Package for Adaptive Conformal Inference' + url@: '' + year@: 2024 + abstract': >- Conformal Inference (CI) is a popular approach for generating finite sample prediction intervals based on the output of any point prediction method when data are exchangeable. Adaptive @@ -278,7 +646,7 @@ title: 'AdaptiveConformal: An `R` Package for Adaptive Conformal Inference' url: '' year: 2024 -- abstract': >- +- abstract'@: >- Appropriate spatiotemporal modelling of wildfire activity is crucial for its prediction and risk management. Here, we focus on wildfire risk in the Aquitaine region in the Southwest of France and @@ -308,8 +676,8 @@ this paper is also intended to provide a full workflow for implementing the Bayesian estimation of marked log-Gaussian Cox processes using the R-INLA package of the R statistical software. - authors: Juliette Legrand, François Pimont, Jean-Luc Dupuy and Thomas Opitz - bibtex: >+ + authors@: Juliette Legrand, François Pimont, Jean-Luc Dupuy and Thomas Opitz + bibtex@: >+ @article{legrand2024, author = {Legrand, Juliette and Pimont, François and Dupuy, Jean-Luc and Opitz, Thomas}, @@ -352,27 +720,177 @@ processes using the R-INLA package of the R statistical software.} } - date: 2024-07-12 - description: '' - doi: 10.57750/4y84-4t68 - draft: false - journal: Computo - pdf: '' - repo: published-202407-legrand-wildfires - title: Bayesian spatiotemporal modelling of wildfire occurrences and sizes for projections under climate change - url: '' - year: 2024 -- abstract': >- - We address the challenge of identifying multiple change - points in a group of independent time series, assuming these change - points occur simultaneously in all series and their number is - unknown. The search for the best segmentation can be expressed as a - minimization problem over a given cost function. We focus on dynamic - programming algorithms that solve this problem exactly. When the - number of changes is proportional to data length, an - inequality-based pruning rule encoded in the PELT algorithm leads to - a linear time complexity. Another type of pruning, called functional - pruning, gives a close-to-linear time complexity whatever the number + date@: 2024-07-12 + description@: '' + doi@: 10.57750/4y84-4t68 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202407-legrand-wildfires + title@: Bayesian spatiotemporal modelling of wildfire occurrences and sizes for projections under climate change + url@: '' + year@: 2024 + abstract': >- + Appropriate spatiotemporal modelling of wildfire activity + is crucial for its prediction and risk management. Here, we focus on + wildfire risk in the Aquitaine region in the Southwest of France and + its projection under climate change. We study whether wildfire risk + could further increase under climate change in this specific region, + which does not lie in the historical core area of wildfires in + Southeastern France, corresponding to the Southwest. For this + purpose, we consider a marked spatiotemporal point process, a + flexible model for occurrences and magnitudes of such environmental + risks, where the magnitudes are defined as the burnt areas. The + model is first calibrated using 14 years of past observation data of + wildfire occurrences and weather variables, and then applied for + projection of climate-change impacts using simulations of numerical + climate models until 2100 as new inputs. We work within the + framework of a spatiotemporal Bayesian hierarchical model, and we + present the workflow of its implementation for a large dataset at + daily resolution for 8km-pixels using the INLA-SPDE approach. The + assessment of the posterior distributions shows a satisfactory fit + of the model for the observation period. We stochastically simulate + projections of future wildfire activity by combining climate model + output with posterior simulations of model parameters. Depending on + climate models, spline-smoothed projections indicate low to moderate + increase of wildfire activity under climate change. The increase is + weaker than in the historical core area, which we attribute to + different weather conditions (oceanic versus Mediterranean). Besides + providing a relevant case study of environmental risk modelling, + this paper is also intended to provide a full workflow for + implementing the Bayesian estimation of marked log-Gaussian Cox + processes using the R-INLA package of the R statistical software. + authors: Juliette Legrand, François Pimont, Jean-Luc Dupuy and Thomas Opitz + bibtex: >+ + @article{legrand2024, + author = {Legrand, Juliette and Pimont, François and Dupuy, Jean-Luc + and Opitz, Thomas}, + publisher = {French Statistical Society}, + title = {Bayesian Spatiotemporal Modelling of Wildfire Occurrences and + Sizes for Projections Under Climate Change}, + journal = {Computo}, + date = {2024-07-12}, + doi = {10.57750/4y84-4t68}, + issn = {2824-7795}, + langid = {en}, + abstract = {Appropriate spatiotemporal modelling of wildfire activity + is crucial for its prediction and risk management. Here, we focus on + wildfire risk in the Aquitaine region in the Southwest of France and + its projection under climate change. We study whether wildfire risk + could further increase under climate change in this specific region, + which does not lie in the historical core area of wildfires in + Southeastern France, corresponding to the Southwest. For this + purpose, we consider a marked spatiotemporal point process, a + flexible model for occurrences and magnitudes of such environmental + risks, where the magnitudes are defined as the burnt areas. The + model is first calibrated using 14 years of past observation data of + wildfire occurrences and weather variables, and then applied for + projection of climate-change impacts using simulations of numerical + climate models until 2100 as new inputs. We work within the + framework of a spatiotemporal Bayesian hierarchical model, and we + present the workflow of its implementation for a large dataset at + daily resolution for 8km-pixels using the INLA-SPDE approach. The + assessment of the posterior distributions shows a satisfactory fit + of the model for the observation period. We stochastically simulate + projections of future wildfire activity by combining climate model + output with posterior simulations of model parameters. Depending on + climate models, spline-smoothed projections indicate low to moderate + increase of wildfire activity under climate change. The increase is + weaker than in the historical core area, which we attribute to + different weather conditions (oceanic versus Mediterranean). Besides + providing a relevant case study of environmental risk modelling, + this paper is also intended to provide a full workflow for + implementing the Bayesian estimation of marked log-Gaussian Cox + processes using the R-INLA package of the R statistical software.} + } + + date: 2024-07-12 + description: '' + doi: 10.57750/4y84-4t68 + draft: false + journal: Computo + pdf: '' + repo: published-202407-legrand-wildfires + title: Bayesian spatiotemporal modelling of wildfire occurrences and sizes for projections under climate change + url: '' + year: 2024 +- abstract'@: >- + We address the challenge of identifying multiple change + points in a group of independent time series, assuming these change + points occur simultaneously in all series and their number is + unknown. The search for the best segmentation can be expressed as a + minimization problem over a given cost function. We focus on dynamic + programming algorithms that solve this problem exactly. When the + number of changes is proportional to data length, an + inequality-based pruning rule encoded in the PELT algorithm leads to + a linear time complexity. Another type of pruning, called functional + pruning, gives a close-to-linear time complexity whatever the number + of changes, but only for the analysis of univariate time series. We + propose a few extensions of functional pruning for multiple + independent time series based on the use of simple geometric shapes + (balls and hyperrectangles). We focus on the Gaussian case, but some + of our rules can be easily extended to the exponential family. In a + simulation study we compare the computational efficiency of + different geometric-based pruning rules. We show that for a small + number of time series some of them ran significantly faster than + inequality-based approaches in particular when the underlying number + of changes is small compared to the data length. + authors@: Liudmila Pishchagina, Guillem Rigaill and Vincent Runge + bibtex@: >+ + @article{pishchagina2024, + author = {Pishchagina, Liudmila and Rigaill, Guillem and Runge, + Vincent}, + publisher = {French Statistical Society}, + title = {Geometric-Based {Pruning} {Rules} for {Change} {Point} + {Detection} in {Multiple} {Independent} {Time} {Series}}, + journal = {Computo}, + date = {2024-07-12}, + doi = {10.57750/9vvx-eq57}, + issn = {2824-7795}, + langid = {en}, + abstract = {We address the challenge of identifying multiple change + points in a group of independent time series, assuming these change + points occur simultaneously in all series and their number is + unknown. The search for the best segmentation can be expressed as a + minimization problem over a given cost function. We focus on dynamic + programming algorithms that solve this problem exactly. When the + number of changes is proportional to data length, an + inequality-based pruning rule encoded in the PELT algorithm leads to + a linear time complexity. Another type of pruning, called functional + pruning, gives a close-to-linear time complexity whatever the number + of changes, but only for the analysis of univariate time series. We + propose a few extensions of functional pruning for multiple + independent time series based on the use of simple geometric shapes + (balls and hyperrectangles). We focus on the Gaussian case, but some + of our rules can be easily extended to the exponential family. In a + simulation study we compare the computational efficiency of + different geometric-based pruning rules. We show that for a small + number of time series some of them ran significantly faster than + inequality-based approaches in particular when the underlying number + of changes is small compared to the data length.} + } + + date@: 2024-07-12 + description@: '' + doi@: 10.57750/9vvx-eq57 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202406-pishchagina-change-point + title@: Geometric-Based Pruning Rules for Change Point Detection in Multiple Independent Time Series + url@: '' + year@: 2024 + abstract': >- + We address the challenge of identifying multiple change + points in a group of independent time series, assuming these change + points occur simultaneously in all series and their number is + unknown. The search for the best segmentation can be expressed as a + minimization problem over a given cost function. We focus on dynamic + programming algorithms that solve this problem exactly. When the + number of changes is proportional to data length, an + inequality-based pruning rule encoded in the PELT algorithm leads to + a linear time complexity. Another type of pruning, called functional + pruning, gives a close-to-linear time complexity whatever the number of changes, but only for the analysis of univariate time series. We propose a few extensions of functional pruning for multiple independent time series based on the use of simple geometric shapes @@ -428,7 +946,60 @@ title: Geometric-Based Pruning Rules for Change Point Detection in Multiple Independent Time Series url: '' year: 2024 -- abstract': >- +- abstract'@: >- + Crowdsourcing is a quick and easy way to collect labels + for large datasets, involving many workers. However, workers often + disagree with each other. Sources of error can arise from the + workers’ skills, but also from the intrinsic difficulty of the task. + We present `peerannot`: a `Python` library for managing and learning + from crowdsourced labels for classification. Our library allows + users to aggregate labels from common noise models or train a deep + learning-based classifier directly from crowdsourced labels. In + addition, we provide an identification module to easily explore the + task difficulty of datasets and worker capabilities. + authors@: Tanguy Lefort, Benjamin Charlier, Alexis Joly and Joseph Salmon + bibtex@: >+ + @article{lefort2024, + author = {Lefort, Tanguy and Charlier, Benjamin and Joly, Alexis and + Salmon, Joseph}, + publisher = {French Statistical Society}, + title = {Peerannot: Classification for Crowdsourced Image Datasets + with {Python}}, + journal = {Computo}, + date = {2024-05-07}, + doi = {10.57750/qmaz-gr91}, + issn = {2824-7795}, + langid = {en}, + abstract = {Crowdsourcing is a quick and easy way to collect labels + for large datasets, involving many workers. However, workers often + disagree with each other. Sources of error can arise from the + workers’ skills, but also from the intrinsic difficulty of the task. + We present `peerannot`: a `Python` library for managing and learning + from crowdsourced labels for classification. Our library allows + users to aggregate labels from common noise models or train a deep + learning-based classifier directly from crowdsourced labels. In + addition, we provide an identification module to easily explore the + task difficulty of datasets and worker capabilities.} + } + + date@: 2024-05-07 + description@: > + Crowdsourcing is a quick and easy way to collect labels for large datasets, involving many workers. + + However, it is common for workers to disagree with each other. + + Sources of error can arise from the workers' skills, but also from the intrinsic difficulty of the task. + + We introduce `peerannot`, a Python library for managing and learning from crowdsourced labels of image classification tasks. + doi@: 10.57750/qmaz-gr91 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202402-lefort-peerannot + title@: 'Peerannot: classification for crowdsourced image datasets with Python' + url@: '' + year@: 2024 + abstract': >- Crowdsourcing is a quick and easy way to collect labels for large datasets, involving many workers. However, workers often disagree with each other. Sources of error can arise from the @@ -481,7 +1052,63 @@ title: 'Peerannot: classification for crowdsourced image datasets with Python' url: '' year: 2024 -- abstract': >- +- abstract'@: >- + We propose a dimension reduction strategy in order to + improve the performance of importance sampling in high dimensions. + The idea is to estimate variance terms in a small number of suitably + chosen directions. We first prove that the optimal directions, i.e., + the ones that minimize the Kullback-\/-Leibler divergence with the + optimal auxiliary density, are the eigenvectors associated with + extreme (small or large) eigenvalues of the optimal covariance + matrix. We then perform extensive numerical experiments showing that + as dimension increases, these directions give estimations which are + very close to optimal. Moreover, we demonstrate that the estimation + remains accurate even when a simple empirical estimator of the + covariance matrix is used to compute these directions. The + theoretical and numerical results open the way for different + generalizations, in particular the incorporation of such ideas in + adaptive importance sampling schemes. + authors@: Maxime El Masri, Jérôme Morio and Florian Simatos + bibtex@: >+ + @article{el_masri2024, + author = {El Masri, Maxime and Morio, Jérôme and Simatos, Florian}, + publisher = {French Statistical Society}, + title = {Optimal Projection for Parametric Importance Sampling in High + Dimensions}, + journal = {Computo}, + date = {2024-03-11}, + doi = {10.57750/jjza-6j82}, + issn = {2824-7795}, + langid = {en}, + abstract = {We propose a dimension reduction strategy in order to + improve the performance of importance sampling in high dimensions. + The idea is to estimate variance terms in a small number of suitably + chosen directions. We first prove that the optimal directions, i.e., + the ones that minimize the Kullback-\/-Leibler divergence with the + optimal auxiliary density, are the eigenvectors associated with + extreme (small or large) eigenvalues of the optimal covariance + matrix. We then perform extensive numerical experiments showing that + as dimension increases, these directions give estimations which are + very close to optimal. Moreover, we demonstrate that the estimation + remains accurate even when a simple empirical estimator of the + covariance matrix is used to compute these directions. The + theoretical and numerical results open the way for different + generalizations, in particular the incorporation of such ideas in + adaptive importance sampling schemes.} + } + + date@: 2024-03-11 + description@: > + This document provides a dimension-reduction strategy in order to improve the performance of importance sampling in high dimensions. + doi@: 10.57750/jjza-6j82 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202402-elmasri-optimal + title@: Optimal projection for parametric importance sampling in high dimensions + url@: '' + year@: 2024 + abstract': >- We propose a dimension reduction strategy in order to improve the performance of importance sampling in high dimensions. The idea is to estimate variance terms in a small number of suitably @@ -537,7 +1164,45 @@ title: Optimal projection for parametric importance sampling in high dimensions url: '' year: 2024 -- abstract': >- +- abstract'@: >- + In numerous applications, cloud of points do seem to + exhibit *repulsion* in the intuitive sense that there is no local + cluster as in a Poisson process. Motivated by data coming from + cellular networks, we devise a classification algorithm based on the + form of the Voronoi cells. We show that, in the particular set of + data we are given, we can retrieve some repulsiveness between + antennas, which was expected for engineering reasons. + authors@: Hamza Adrat and Laurent Decreusefond + bibtex@: >+ + @article{adrat2024, + author = {Adrat, Hamza and Decreusefond, Laurent}, + publisher = {French Statistical Society}, + title = {Point {Process} {Discrimination} {According} to {Repulsion}}, + journal = {Computo}, + date = {2024-01-25}, + doi = {10.57750/3r07-aw28}, + issn = {2824-7795}, + langid = {en}, + abstract = {In numerous applications, cloud of points do seem to + exhibit *repulsion* in the intuitive sense that there is no local + cluster as in a Poisson process. Motivated by data coming from + cellular networks, we devise a classification algorithm based on the + form of the Voronoi cells. We show that, in the particular set of + data we are given, we can retrieve some repulsiveness between + antennas, which was expected for engineering reasons.} + } + + date@: 2024-01-25 + description@: '' + doi@: 10.57750/3r07-aw28 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202401-adrat-repulsion + title@: Point Process Discrimination According to Repulsion + url@: '' + year@: 2024 + abstract': >- In numerous applications, cloud of points do seem to exhibit *repulsion* in the intuitive sense that there is no local cluster as in a Poisson process. Motivated by data coming from @@ -575,7 +1240,7 @@ title: Point Process Discrimination According to Repulsion url: '' year: 2024 -- abstract': >- +- abstract'@: >- In plant epidemiology, pest abundance is measured in field trials using metrics assessing either pest prevalence (fraction of the plant population infected) or pest intensity (average number of @@ -618,8 +1283,117 @@ agronomists, plant pathologists, and applied statisticians to analyze pest surveys and field experiments conducted to assess the efficacy of pest treatments. - authors: Armand Favrot and David Makowski - bibtex: >+ + authors@: Armand Favrot and David Makowski + bibtex@: >+ + @article{favrot2024, + author = {Favrot, Armand and Makowski, David}, + publisher = {French Statistical Society}, + title = {A Hierarchical Model to Evaluate Pest Treatments from + Prevalence and Intensity Data}, + journal = {Computo}, + date = {2024-01-09}, + doi = {10.57750/6cgk-g727}, + issn = {2824-7795}, + langid = {en}, + abstract = {In plant epidemiology, pest abundance is measured in field + trials using metrics assessing either pest prevalence (fraction of + the plant population infected) or pest intensity (average number of + pest individuals present in infected plants). Some of these trials + rely on prevalence, while others rely on intensity, depending on the + protocols. In this paper, we present a hierarchical Bayesian model + able to handle both types of data. In this model, the intensity and + prevalence variables are derived from a latent variable representing + the number of pest individuals on each host individual, assumed to + follow a Poisson distribution. Effects of pest treaments, time + trend, and between-trial variability are described using fixed and + random effects. We apply the model to a real data set in the context + of aphid control in sugar beet fields. In this data set, prevalence + and intensity were derived from aphid counts observed on either + factorial trials testing different types of pesticides treatments or + field surveys monitoring aphid abundance. Next, we perform + simulations to assess the impacts of using either prevalence or + intensity data, or both types of data simultaneously, on the + accuracy of the model parameter estimates and on the ranking of + pesticide treatment efficacy. Our results show that, when pest + prevalence and pest intensity data are collected separately in + different trials, the model parameters are more accurately estimated + using both types of trials than using one type of trials only. When + prevalence data are collected in all trials and intensity data are + collected in a subset of trials, estimations and pest treatment + ranking are more accurate using both types of data than using + prevalence data only. When only one type of observation can be + collected in a pest survey or in an experimental trial, our analysis + indicates that it is better to collect intensity data than + prevalence data when all or most of the plants are expected to be + infested, but that both types of data lead to similar results when + the level of infestation is low to moderate. Finally, our + simulations show that it is unlikely to obtain accurate results with + fewer than 40 trials when assessing the efficacy of pest control + treatments based on prevalence and intensity data. Because of its + flexibility, our model can be used to evaluate and rank the efficacy + of pest treatments using either prevalence or intensity data, or + both types of data simultaneously. As it can be easily implemented + using standard Bayesian packages, we hope that it will be useful to + agronomists, plant pathologists, and applied statisticians to + analyze pest surveys and field experiments conducted to assess the + efficacy of pest treatments.} + } + + date@: 2024-01-09 + description@: '' + doi@: 10.57750/6cgk-g727 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202312-favrot-hierarchical + title@: A hierarchical model to evaluate pest treatments from prevalence and intensity data + url@: '' + year@: 2024 + abstract': >- + In plant epidemiology, pest abundance is measured in field + trials using metrics assessing either pest prevalence (fraction of + the plant population infected) or pest intensity (average number of + pest individuals present in infected plants). Some of these trials + rely on prevalence, while others rely on intensity, depending on the + protocols. In this paper, we present a hierarchical Bayesian model + able to handle both types of data. In this model, the intensity and + prevalence variables are derived from a latent variable representing + the number of pest individuals on each host individual, assumed to + follow a Poisson distribution. Effects of pest treaments, time + trend, and between-trial variability are described using fixed and + random effects. We apply the model to a real data set in the context + of aphid control in sugar beet fields. In this data set, prevalence + and intensity were derived from aphid counts observed on either + factorial trials testing different types of pesticides treatments or + field surveys monitoring aphid abundance. Next, we perform + simulations to assess the impacts of using either prevalence or + intensity data, or both types of data simultaneously, on the + accuracy of the model parameter estimates and on the ranking of + pesticide treatment efficacy. Our results show that, when pest + prevalence and pest intensity data are collected separately in + different trials, the model parameters are more accurately estimated + using both types of trials than using one type of trials only. When + prevalence data are collected in all trials and intensity data are + collected in a subset of trials, estimations and pest treatment + ranking are more accurate using both types of data than using + prevalence data only. When only one type of observation can be + collected in a pest survey or in an experimental trial, our analysis + indicates that it is better to collect intensity data than + prevalence data when all or most of the plants are expected to be + infested, but that both types of data lead to similar results when + the level of infestation is low to moderate. Finally, our + simulations show that it is unlikely to obtain accurate results with + fewer than 40 trials when assessing the efficacy of pest control + treatments based on prevalence and intensity data. Because of its + flexibility, our model can be used to evaluate and rank the efficacy + of pest treatments using either prevalence or intensity data, or + both types of data simultaneously. As it can be easily implemented + using standard Bayesian packages, we hope that it will be useful to + agronomists, plant pathologists, and applied statisticians to + analyze pest surveys and field experiments conducted to assess the + efficacy of pest treatments. + authors: Armand Favrot and David Makowski + bibtex: >+ @article{favrot2024, author = {Favrot, Armand and Makowski, David}, publisher = {French Statistical Society}, @@ -684,7 +1458,78 @@ title: A hierarchical model to evaluate pest treatments from prevalence and intensity data url: '' year: 2024 -- abstract': >- +- abstract'@: >- + Random Forests (RF) {[}@breiman:2001{]} are very popular + machine learning methods. They perform well even with little or no + tuning, and have some theoretical guarantees, especially for sparse + problems {[}@biau:2012;@scornet:etal:2015{]}. These learning + strategies have been used in several contexts, also outside the + field of classification and regression. To perform Bayesian model + selection in the case of intractable likelihoods, the ABC Random + Forests (ABC-RF) strategy of @pudlo:etal:2016 consists in applying + Random Forests on training sets composed of simulations coming from + the Bayesian generative models. The ABC-RF technique is based on an + underlying RF for which the training and prediction phases are + separated. The training phase does not take into account the data to + be predicted. This seems to be suboptimal as in the ABC framework + only one observation is of interest for the prediction. In this + paper, we study tree-based methods that are built to predict a + specific instance in a classification setting. This type of methods + falls within the scope of local (lazy/instance-based/case specific) + classification learning. We review some existing strategies and + propose two new ones. The first consists in modifying the tree + splitting rule by using kernels, the second in using a first RF to + compute some local variable importance that is used to train a + second, more local, RF. Unfortunately, these approaches, although + interesting, do not provide conclusive results. + authors@: Alice Cleynen, Louis Raynal and Jean-Michel Marin + bibtex@: >+ + @article{cleynen2023, + author = {Cleynen, Alice and Raynal, Louis and Marin, Jean-Michel}, + publisher = {French Statistical Society}, + title = {Local Tree Methods for Classification: A Review and Some Dead + Ends}, + journal = {Computo}, + date = {2023-12-14}, + doi = {10.57750/3j8m-8d57}, + issn = {2824-7795}, + langid = {en}, + abstract = {Random Forests (RF) {[}@breiman:2001{]} are very popular + machine learning methods. They perform well even with little or no + tuning, and have some theoretical guarantees, especially for sparse + problems {[}@biau:2012;@scornet:etal:2015{]}. These learning + strategies have been used in several contexts, also outside the + field of classification and regression. To perform Bayesian model + selection in the case of intractable likelihoods, the ABC Random + Forests (ABC-RF) strategy of @pudlo:etal:2016 consists in applying + Random Forests on training sets composed of simulations coming from + the Bayesian generative models. The ABC-RF technique is based on an + underlying RF for which the training and prediction phases are + separated. The training phase does not take into account the data to + be predicted. This seems to be suboptimal as in the ABC framework + only one observation is of interest for the prediction. In this + paper, we study tree-based methods that are built to predict a + specific instance in a classification setting. This type of methods + falls within the scope of local (lazy/instance-based/case specific) + classification learning. We review some existing strategies and + propose two new ones. The first consists in modifying the tree + splitting rule by using kernels, the second in using a first RF to + compute some local variable importance that is used to train a + second, more local, RF. Unfortunately, these approaches, although + interesting, do not provide conclusive results.} + } + + date@: 2023-12-14 + description@: '' + doi@: 10.57750/3j8m-8d57 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202312-cleynen-local + title@: 'Local tree methods for classification: a review and some dead ends' + url@: '' + year@: 2023 + abstract': >- Random Forests (RF) {[}@breiman:2001{]} are very popular machine learning methods. They perform well even with little or no tuning, and have some theoretical guarantees, especially for sparse @@ -755,7 +1600,72 @@ title: 'Local tree methods for classification: a review and some dead ends' url: '' year: 2023 -- abstract': >- +- abstract'@: >- + The Fisher information matrix (FIM) is a key quantity in + statistics. However its exact computation is often not trivial. In + particular in many latent variable models, it is intricated due to + the presence of unobserved variables. Several methods have been + proposed to approximate the FIM when it can not be evaluated + analytically. Different estimates have been considered, in + particular moment estimates. However some of them require to compute + second derivatives of the complete data log-likelihood which leads + to some disadvantages. In this paper, we focus on the empirical + Fisher information matrix defined as an empirical estimate of the + covariance matrix of the score, which only requires to compute the + first derivatives of the log-likelihood. Our contribution consists + in presenting a new numerical method to evaluate this empirical + Fisher information matrix in latent variable model when the proposed + estimate can not be directly analytically evaluated. We propose a + stochastic approximation estimation algorithm to compute this + estimate as a by-product of the parameter estimate. We evaluate the + finite sample size properties of the proposed estimate and the + convergence properties of the estimation algorithm through + simulation studies. + authors@: Maud Delattre and Estelle Kuhn + bibtex@: >+ + @article{delattre2023, + author = {Delattre, Maud and Kuhn, Estelle}, + publisher = {French Statistical Society}, + title = {Computing an Empirical {Fisher} Information Matrix Estimate + in Latent Variable Models Through Stochastic Approximation}, + journal = {Computo}, + date = {2023-11-21}, + doi = {10.57750/r5gx-jk62}, + issn = {2824-7795}, + langid = {en}, + abstract = {The Fisher information matrix (FIM) is a key quantity in + statistics. However its exact computation is often not trivial. In + particular in many latent variable models, it is intricated due to + the presence of unobserved variables. Several methods have been + proposed to approximate the FIM when it can not be evaluated + analytically. Different estimates have been considered, in + particular moment estimates. However some of them require to compute + second derivatives of the complete data log-likelihood which leads + to some disadvantages. In this paper, we focus on the empirical + Fisher information matrix defined as an empirical estimate of the + covariance matrix of the score, which only requires to compute the + first derivatives of the log-likelihood. Our contribution consists + in presenting a new numerical method to evaluate this empirical + Fisher information matrix in latent variable model when the proposed + estimate can not be directly analytically evaluated. We propose a + stochastic approximation estimation algorithm to compute this + estimate as a by-product of the parameter estimate. We evaluate the + finite sample size properties of the proposed estimate and the + convergence properties of the estimation algorithm through + simulation studies.} + } + + date@: 2023-11-21 + description@: '' + doi@: 10.57750/r5gx-jk62 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202311-delattre-fim + title@: Computing an empirical Fisher information matrix estimate in latent variable models through stochastic approximation + url@: '' + year@: 2023 + abstract': >- The Fisher information matrix (FIM) is a key quantity in statistics. However its exact computation is often not trivial. In particular in many latent variable models, it is intricated due to @@ -820,7 +1730,75 @@ title: Computing an empirical Fisher information matrix estimate in latent variable models through stochastic approximation url: '' year: 2023 -- abstract': >- +- abstract'@: >- + Gaussian Graphical Models (GGMs) are widely used in + high-dimensional data analysis to synthesize the interaction between + variables. In many applications, such as genomics or image analysis, + graphical models rely on sparsity and clustering to reduce + dimensionality and improve performances. This paper explores a + slightly different paradigm where clustering is not knowledge-driven + but performed simultaneously with the graph inference task. We + introduce a novel Multiscale Graphical Lasso (MGLasso) to improve + networks interpretability by proposing graphs at different + granularity levels. The method estimates clusters through a convex + clustering approach -\/-\/- a relaxation of \$k\$-means, and + hierarchical clustering. The conditional independence graph is + simultaneously inferred through a neighborhood selection scheme for + undirected graphical models. MGLasso extends and generalizes the + sparse group fused lasso problem to undirected graphical models. We + use continuation with Nesterov smoothing in a shrinkage-thresholding + algorithm (CONESTA) to propose a regularization path of solutions + along the group fused Lasso penalty, while the Lasso penalty is kept + constant. Extensive experiments on synthetic data compare the + performances of our model to state-of-the-art clustering methods and + network inference models. Applications to gut microbiome data and + poplar’s methylation mixed with transcriptomic data are presented. + authors@: Edmond Sanou, Christophe Ambroise and Geneviève Robin + bibtex@: >+ + @article{sanou2023, + author = {Sanou, Edmond and Ambroise, Christophe and Robin, Geneviève}, + publisher = {French Statistical Society}, + title = {Inference of {Multiscale} {Gaussian} {Graphical} {Models}}, + journal = {Computo}, + date = {2023-06-28}, + doi = {10.57750/1f4p-7955}, + issn = {2824-7795}, + langid = {en}, + abstract = {Gaussian Graphical Models (GGMs) are widely used in + high-dimensional data analysis to synthesize the interaction between + variables. In many applications, such as genomics or image analysis, + graphical models rely on sparsity and clustering to reduce + dimensionality and improve performances. This paper explores a + slightly different paradigm where clustering is not knowledge-driven + but performed simultaneously with the graph inference task. We + introduce a novel Multiscale Graphical Lasso (MGLasso) to improve + networks interpretability by proposing graphs at different + granularity levels. The method estimates clusters through a convex + clustering approach -\/-\/- a relaxation of \$k\$-means, and + hierarchical clustering. The conditional independence graph is + simultaneously inferred through a neighborhood selection scheme for + undirected graphical models. MGLasso extends and generalizes the + sparse group fused lasso problem to undirected graphical models. We + use continuation with Nesterov smoothing in a shrinkage-thresholding + algorithm (CONESTA) to propose a regularization path of solutions + along the group fused Lasso penalty, while the Lasso penalty is kept + constant. Extensive experiments on synthetic data compare the + performances of our model to state-of-the-art clustering methods and + network inference models. Applications to gut microbiome data and + poplar’s methylation mixed with transcriptomic data are presented.} + } + + date@: 2023-06-28 + description@: '' + doi@: 10.57750/1f4p-7955 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202306-sanou-multiscale_glasso + title@: Inference of Multiscale Gaussian Graphical Models + url@: '' + year@: 2023 + abstract': >- Gaussian Graphical Models (GGMs) are widely used in high-dimensional data analysis to synthesize the interaction between variables. In many applications, such as genomics or image analysis, @@ -888,7 +1866,71 @@ title: Inference of Multiscale Gaussian Graphical Models url: '' year: 2023 -- abstract': >- +- abstract'@: >- + Litter is a known cause of degradation in marine + environments and most of it travels in rivers before reaching the + oceans. In this paper, we present a novel algorithm to assist waste + monitoring along watercourses. While several attempts have been made + to quantify litter using neural object detection in photographs of + floating items, we tackle the more challenging task of counting + directly in videos using boat-embedded cameras. We rely on + multi-object tracking (MOT) but focus on the key pitfalls of false + and redundant counts which arise in typical scenarios of poor + detection performance. Our system only requires supervision at the + image level and performs Bayesian filtering via a state space model + based on optical flow. We present a new open image dataset gathered + through a crowdsourced campaign and used to train a center-based + anchor-free object detector. Realistic video footage assembled by + water monitoring experts is annotated and provided for evaluation. + Improvements in count quality are demonstrated against systems built + from state-of-the-art multi-object trackers sharing the same + detection capabilities. A precise error decomposition allows clear + analysis and highlights the remaining challenges. + authors@: Mathis Chagneux, Sylvain Le Corff, Pierre Gloaguen, Charles Ollion, Océane Lepâtre and Antoine Bruge + bibtex@: >+ + @article{chagneux2023, + author = {Chagneux, Mathis and Le Corff, Sylvain and Gloaguen, Pierre + and Ollion, Charles and Lepâtre, Océane and Bruge, Antoine}, + publisher = {French Statistical Society}, + title = {Macrolitter Video Counting on Riverbanks Using State Space + Models and Moving Cameras}, + journal = {Computo}, + date = {2023-02-16}, + doi = {10.57750/845m-f805}, + issn = {2824-7795}, + langid = {en}, + abstract = {Litter is a known cause of degradation in marine + environments and most of it travels in rivers before reaching the + oceans. In this paper, we present a novel algorithm to assist waste + monitoring along watercourses. While several attempts have been made + to quantify litter using neural object detection in photographs of + floating items, we tackle the more challenging task of counting + directly in videos using boat-embedded cameras. We rely on + multi-object tracking (MOT) but focus on the key pitfalls of false + and redundant counts which arise in typical scenarios of poor + detection performance. Our system only requires supervision at the + image level and performs Bayesian filtering via a state space model + based on optical flow. We present a new open image dataset gathered + through a crowdsourced campaign and used to train a center-based + anchor-free object detector. Realistic video footage assembled by + water monitoring experts is annotated and provided for evaluation. + Improvements in count quality are demonstrated against systems built + from state-of-the-art multi-object trackers sharing the same + detection capabilities. A precise error decomposition allows clear + analysis and highlights the remaining challenges.} + } + + date@: 2023-02-16 + description@: '' + doi@: 10.57750/845m-f805 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202301-chagneux-macrolitter + title@: 'Macrolitter video counting on riverbanks using state space models and moving cameras ' + url@: '' + year@: 2023 + abstract': >- Litter is a known cause of degradation in marine environments and most of it travels in rivers before reaching the oceans. In this paper, we present a novel algorithm to assist waste @@ -952,7 +1994,56 @@ title: 'Macrolitter video counting on riverbanks using state space models and moving cameras ' url: '' year: 2023 -- abstract': >- +- abstract'@: >- + The package \$\textbackslash textsf\{clayton\}\$ is + designed to be intuitive, user-friendly, and efficient. It offers a + wide range of copula models, including Archimedean, Elliptical, and + Extreme. The package is implemented in pure \$\textbackslash + textsf\{Python\}\$, making it easy to install and use. In addition, + we provide detailed documentation and examples to help users get + started quickly. We also conduct a performance comparison with + existing \$\textbackslash textsf\{R\}\$ packages, demonstrating the + efficiency of our implementation. The \$\textbackslash + textsf\{clayton\}\$ package is a valuable tool for researchers and + practitioners working with copulae in \$\textbackslash + textsf\{Python\}\$. + authors@: Alexis Boulin + bibtex@: >+ + @article{boulin2023, + author = {Boulin, Alexis}, + publisher = {French Statistical Society}, + title = {A {Python} {Package} for {Sampling} from {Copulae:} Clayton}, + journal = {Computo}, + date = {2023-01-12}, + doi = {10.57750/4szh-t752}, + issn = {2824-7795}, + langid = {en}, + abstract = {The package \$\textbackslash textsf\{clayton\}\$ is + designed to be intuitive, user-friendly, and efficient. It offers a + wide range of copula models, including Archimedean, Elliptical, and + Extreme. The package is implemented in pure \$\textbackslash + textsf\{Python\}\$, making it easy to install and use. In addition, + we provide detailed documentation and examples to help users get + started quickly. We also conduct a performance comparison with + existing \$\textbackslash textsf\{R\}\$ packages, demonstrating the + efficiency of our implementation. The \$\textbackslash + textsf\{clayton\}\$ package is a valuable tool for researchers and + practitioners working with copulae in \$\textbackslash + textsf\{Python\}\$.} + } + + date@: 2023-01-12 + description@: > + The package $\textsf{clayton}$ is designed to be intuitive, user-friendly, and efficient. It offers a wide range of copula models, including Archimedean, Elliptical, and Extreme. The package is implemented in pure $\textsf{Python}$, making it easy to install and use. + doi@: 10.57750/4szh-t752 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202301-boulin-clayton + title@: 'A Python Package for Sampling from Copulae: clayton' + url@: '' + year@: 2023 + abstract': >- The package \$\textbackslash textsf\{clayton\}\$ is designed to be intuitive, user-friendly, and efficient. It offers a wide range of copula models, including Archimedean, Elliptical, and @@ -1001,7 +2092,84 @@ title: 'A Python Package for Sampling from Copulae: clayton' url: '' year: 2023 -- abstract': >- +- abstract'@: >- + Deep learning is used in computer vision problems with + important applications in several scientific fields. In ecology for + example, there is a growing interest in deep learning for + automatizing repetitive analyses on large amounts of images, such as + animal species identification. However, there are challenging issues + toward the wide adoption of deep learning by the community of + ecologists. First, there is a programming barrier as most algorithms + are written in `Python` while most ecologists are versed in `R`. + Second, recent applications of deep learning in ecology have focused + on computational aspects and simple tasks without addressing the + underlying ecological questions or carrying out the statistical data + analysis to answer these questions. Here, we showcase a reproducible + `R` workflow integrating both deep learning and statistical models + using predator-prey relationships as a case study. We illustrate + deep learning for the identification of animal species on images + collected with camera traps, and quantify spatial co-occurrence + using multispecies occupancy models. Despite average model + classification performances, ecological inference was similar + whether we analysed the ground truth dataset or the classified + dataset. This result calls for further work on the trade-offs + between time and resources allocated to train models with deep + learning and our ability to properly address key ecological + questions with biodiversity monitoring. We hope that our + reproducible workflow will be useful to ecologists and applied + statisticians. + authors@: Olivier Gimenez, Maëlis Kervellec, Jean-Baptiste Fanjul, Anna Chaine, Lucile Marescot, Yoann Bollet and Christophe Duchamp + bibtex@: >+ + @article{gimenez2022, + author = {Gimenez, Olivier and Kervellec, Maëlis and Fanjul, + Jean-Baptiste and Chaine, Anna and Marescot, Lucile and Bollet, + Yoann and Duchamp, Christophe}, + publisher = {French Statistical Society}, + title = {Trade-Off Between Deep Learning for Species Identification + and Inference about Predator-Prey Co-Occurrence}, + journal = {Computo}, + date = {2022-04-22}, + doi = {10.57750/yfm2-5f45}, + issn = {2824-7795}, + langid = {en}, + abstract = {Deep learning is used in computer vision problems with + important applications in several scientific fields. In ecology for + example, there is a growing interest in deep learning for + automatizing repetitive analyses on large amounts of images, such as + animal species identification. However, there are challenging issues + toward the wide adoption of deep learning by the community of + ecologists. First, there is a programming barrier as most algorithms + are written in `Python` while most ecologists are versed in `R`. + Second, recent applications of deep learning in ecology have focused + on computational aspects and simple tasks without addressing the + underlying ecological questions or carrying out the statistical data + analysis to answer these questions. Here, we showcase a reproducible + `R` workflow integrating both deep learning and statistical models + using predator-prey relationships as a case study. We illustrate + deep learning for the identification of animal species on images + collected with camera traps, and quantify spatial co-occurrence + using multispecies occupancy models. Despite average model + classification performances, ecological inference was similar + whether we analysed the ground truth dataset or the classified + dataset. This result calls for further work on the trade-offs + between time and resources allocated to train models with deep + learning and our ability to properly address key ecological + questions with biodiversity monitoring. We hope that our + reproducible workflow will be useful to ecologists and applied + statisticians.} + } + + date@: 2022-04-22 + description@: '' + doi@: 10.57750/yfm2-5f45 + draft@: false + journal@: Computo + pdf@: '' + repo@: published-202204-deeplearning-occupancy-lynx + title@: Trade-off between deep learning for species identification and inference about predator-prey co-occurrence + url@: '' + year@: 2022 + abstract': >- Deep learning is used in computer vision problems with important applications in several scientific fields. In ecology for example, there is a growing interest in deep learning for