forked from scipy/scipy-articles
/
paper.tex
1334 lines (1178 loc) · 66.8 KB
/
paper.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[fleqn,10pt]{wlscirep}
% Packages
\usepackage[super]{nth}
\usepackage{rotating}
\usepackage{makecell}
\usepackage{pifont}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{float}
\usepackage{pgfplotstable}
\usepackage[multiple]{footmisc}
\usepackage{booktabs,siunitx}
\pgfplotsset{compat=newest}
\usepackage{listings, textcomp}
\lstset{ %
basicstyle=\ttfamily\footnotesize, % size of fonts used for the code
breaklines=true, % automatic line breaking only at whitespace
captionpos=b, % sets the caption-position to bottom
commentstyle=\color{gray}, % comment style
keywordstyle=\color{blue}, % keyword style
stringstyle=\color{red}, % string literal style
upquote=true %straight single quotes (requires textcomp)
}
% New Commands
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\fixme}[1]{\textcolor{red}{{#1}}}
\title{SciPy 1.0---Fundamental Algorithms for Scientific Computing in Python}
\author[1]{Pauli Virtanen}
\author[2,*]{Ralf Gommers}
\author[3,4]{Tyler Reddy}
\author[5]{Anne Archibald}
\author[6]{Andrew Nelson}
\author[7]{Charles Harris}
\author[8]{CJ Carey}
\author[9]{Denis Laxalde}
\author[10]{Eric Larson}
\author[11]{Eric Moore}
\author[12]{Eric Quintero}
\author[13]{Evgeni Burovski}
\author[14]{Jaime Fernández del Río}
\author[15]{Josef Perktold}
\author[16]{Josh Wilson}
\author[17]{Matthew Brett}
\author[18]{Nikolay Mayorov}
\author[19]{Warren Weckesser}
\author[20]{Matt Haberland}
\author[21]{Scott Sievert}
\author[22]{Yu Feng}
\author[23]{Antonio Horta Ribeiro}
\author[24]{Ian Henriksen}
\author[3,25]{K. Jarrod Millman}
\author[3]{St\'efan J. van der Walt}
\affil[1]{Affiliation, department, city, postcode, country}
\affil[2]{Affiliation, department, city, postcode, country}
\affil[3]{Berkeley Institute for Data Science, University of California, Berkeley, CA, 94720, USA}
\affil[4]{Los Alamos National Laboratory,
Theoretical Division 6,
Los Alamos, NM, 87545, USA}
\affil[5]{Affiliation, department, city, postcode, country}
\affil[6]{Affiliation, department, city, postcode, country}
\affil[7]{Affiliation, department, city, postcode, country}
\affil[8]{Affiliation, department, city, postcode, country}
\affil[9]{Affiliation, department, city, postcode, country}
\affil[10]{Affiliation, department, city, postcode, country}
\affil[11]{Affiliation, department, city, postcode, country}
\affil[12]{Affiliation, department, city, postcode, country}
\affil[13]{Affiliation, department, city, postcode, country}
\affil[14]{Affiliation, department, city, postcode, country}
\affil[15]{Affiliation, department, city, postcode, country}
\affil[16]{Affiliation, department, city, postcode, country}
\affil[17]{Affiliation, department, city, postcode, country}
\affil[18]{Affiliation, department, city, postcode, country}
\affil[19]{Affiliation, department, city, postcode, country}
\affil[20]{BioResource and Agricultural Engineering, California Polytechnic State University, San Luis Obispo, CA, 93407, USA}
\affil[21]{Affiliation, department, city, postcode, country}
\affil[22]{Affiliation, department, city, postcode, country}
\affil[23]{Affiliation, department, city, postcode, country}
\affil[24]{University of Texas at Austin,
Institute for Computational Engineering and Sciences,
Austin, TX, 78712, USA}
\affil[25]{Division of Biostatistics, University of California,
Berkeley, CA, 94720, USA}
\affil[*]{ralf.gommers@gmail.com}
\keywords{Scientific computing, Python, Mathematics}
% NOTE: some usage stats below are from https://libraries.io/pypi/scipy
% as well as github metrics; normally one doesn't put citations directly
% in abstract though (depends on field / journal)
\begin{abstract}
SciPy is an open source scientific computing library for the Python programming language.
SciPy 1.0 was released in late 2017, about 16 years after the original
version 0.1 release. SciPy
has become a \emph{de facto} standard for leveraging scientific algorithms
in the Python programming language, with more than 600
unique code contributors, millions of downloads per year, 161 dependent
packages, and 28700 dependent repositories. This includes usage of SciPy
in almost half of all machine learning projects on GitHub, and usage by
high profile projects including LIGO gravitational wave analysis and creation
of the first-ever image of a black hole (M87).
The library includes functionality spanning clustering, Fourier transforms,
integration, interpolation, file I/O, linear algebra, image processing,
orthogonal distance regression, minimization algorithms, signal processing,
sparse matrix handling, computational geometry, and statistics. In this
work, we provide an overview of the capabilities and development practices of the
SciPy library and highlight some recent technical developments.
\end{abstract}
\begin{document}
\flushbottom
\maketitle
\thispagestyle{empty}
\section*{Introduction}
%% SciPy is not what you think it is
%% - At risk of failure many times
%% - Funding was *hard* to come by; funding agencies didn't easily give
%% money. Some labs, Enthought committed a bit of money.
%% - Java and other enterprise packages were getting support; MATLAB
%% was huge. Not many wanted to encourage this ``amateur'' effort.
%% - Graduate students & post-docs doing the heavy lifting; they found
%% a system they enjoyed using, saw its potential, put energy into it.
%% - Driven by eclectic group of people being passionate about it, not
%% because they were paid or told to work on it.
%% - And not by EECS people, but by scientists. Some with fairly little
%% programming background (David), others in varying stages of academic/research careers.
%% - Over time, the project became more professional:
%% - Standardized release formats
%% - Docstrings
%% - Testing & CI infrastructure
%% - APIs
%% - Issue tracking
%% - Revision control (CVS -> SVN -> Git)
%% - Conference
%% - Python wasn't perfectly suited in all ways; packaging, compiling,
%% indexing (indexing/array protocols and complex numbers added)
%% - Initially, this high quality, stable, widely used project, hinged on
%% three people doing the ``right'' thing to get it established.
%% - Bus factor for maintenance still very low.
%% The background section captures a selective history where we're trying
%% to identify some of the more important events. But really, it relies
%% on the collective work of numerous individuals, too many to mention.
SciPy is a library of numerical routines for the Python programming
language that provides fundamental building blocks for modeling and
solving scientific problems.
SciPy includes algorithms for optimization, integration, interpolation,
eigenvalue problems, algebraic equations, differential equations, and many other
classes of problems; it also provides
specialized data structures, such as sparse arrays
and $k$-dimensional trees. SciPy is built on top of
NumPy\cite{vanderwalt2011numpy,Oliphant-2015},
which provides array data structures and related fast numerical routines, and
SciPy is itself the foundation upon which higher level scientific libraries,
including scikit-learn\cite{pedregosa2011scikit}
and scikit-image\cite{vanderwalt2014scikit}, are built.
SciPy is relied upon by scientists, engineers, and others
around the world. For example, published
scripts\cite{alex_nitz_2018_1596771, LIGO-open}
used in the analysis of gravitational waves
\cite{PhysRevD.93.122003, abbott2017gw170817}
import several submodules of SciPy, and the M87 black
hole imaging project directly cites SciPy\cite{2019ApJ875L3E}.
Recently, SciPy released version 1.0, a milestone that traditionally
signals a library's API (Application Programming Interface) being
mature enough to be trusted in production pipelines. This version
numbering convention, however, belies the history of a project that has
become the standard which others follow and has seen extensive
adoption in research and industry.
SciPy's arrival at this point is surprising, and somewhat anomalous.
When started, the library had little funding, and was written mainly
by graduate students---many of them without a computer science education, and often without the
blessing of their advisors. To even imagine that a small group of
``rogue'' student programmers could upend the already well-established
ecosystem of research software---backed by millions in funding, and
many hundreds of highly qualified engineers\cite{mathworks-globe-97,
esri-revenue,bloom-wolfram}---was preposterous.
Yet, the philosophical motivations behind a fully open toolstack, and
an excited, friendly community with a singular focus, seem to have
proven auspicious in the long run. It led not only to the library
described in this paper, but also to an entire ecosystem of related
packages\cite{scipy-ecosystem}, and a variety of social activities centered around
them\cite{social-python}. The packages in the SciPy ecosystem
share high standards of implementation,
documentation, and testing, and a culture eager to learn and adopt
better practices---both for community management and software
development.
% Despite its success, SciPy still faces challenges:
% a low ``bus factor''---meaning that the entire project depends on a small number of
% active contributors---being perhaps the most serious.
In the background section that follows, we capture a selective history
of some milestones and important events in the growth of SciPy.
Despite what we might highlight here, it is important to understand
that a project like SciPy is only possible because of the
contributions of a great many people---too many to mention
individually, but each bringing an important piece to the puzzle.
\section*{Background}
Python is an interpreted, high-level, general-purpose computer programming
language, designed by Guido van Rossum in the late 1980s,
with a dynamic type system and an emphasis on readability and rapid prototyping.
The reference and most popular implementation of Python is
CPython\cite{cpython-refman,cpython-source}, which is written
in the C and Python languages and assumed throughout this paper.
% https://legacy.python.org/search/hypermail/python-1992/0000.html
%
As a general-purpose programming language, Python had no special support for
scientific data structures or algorithms, unlike many of the other established
computation platforms of the time. Yet, scientists soon discovered the
language's virtues, such as its ability to wrap C and Fortran
libraries and to then drive those libraries interactively. Scientists
could thereby gain access to a wide variety of existing computational
libraries without concerning themselves with low-level programming
concepts such as memory management.
In 1995, Jim Hugunin, a graduate student from MIT, wrote the first
message in a new Python Matrix Special Interest Group (Matrix-SIG)
mailing list\cite{Hugunin-first}:
\begin{quote}
There seems to be a fair amount of interest in the Python community
concerning the addition of numeric operations to Python. My own desire is
to have as large a library of matrix based functions available as possible
(linear algebra, eigenfunctions, signal processing, statistics, etc.). In
order to ensure that all of these libraries interoperate, there needs to
be agreement on a basic matrix object that can be used to represent arrays
of numbers.
\end{quote}
Over the next several months, conversations on that mailing
list by, among others, Jim Fulton, Jim Hugunin, Paul Dubois, Konrad
Hinsen, and Guido van Rossum, led to the creation of a package called Numeric with an array object
that supported a high number of dimensions. Jim Hugunin explained the utility
of Python for numerical computation\cite{Hugunin-whitepaper}:
\begin{quote}
I've used almost all of the available numerical languages at one time
or another over the past 8 years. One thing I've noticed is that over
time, the designers of these languages are steadily adding more of the
features that one would expect to find in a general-purpose
programming language.
\end{quote}
This is still a distinguishing feature of Python for science, and one of the
reasons why it has been so successful in the realm of data science: instead of
adding general features to a language designed for numerical and scientific
computing, here scientific features are added to a general purpose language.
This broadens the scope of problems that can be addressed easily, expands the
sources of data that are readily accessible, and increases the size of the
community that develops code for the platform.
The availability of a standard numerical array data structure in
Python led to a sudden, rapid growth in the number of scientific
packages available for solving common numeric problems.
A number of these packages were written by graduate students and
postdoctoral researchers to solve the very practical research problems
that they faced on a daily basis. While they had access to and were
familiar with specialized (often commercial) systems, many found
it easier to implement the domain-specific functionality they needed
in a general purpose programming language. And given Python's innate
ability to function as a systems language, controlling specialized or
custom-built hardware was also possible.
Initial work on numerical computing in Python was driven by graduate
students, but soon larger research labs became increasingly engaged.
For example, Paul Dubois at Lawrence Livermore National Laboratory (LLNL) took over the
maintenance of Numeric and funded the writing of its
manual\cite{Numeric-manual}, and
%in the Program for Climate Model Diagnosis and Intercomparison at
%Lawrence Livermore National Laboratory built their Climate Data
%Analysis Tools (``an open-source set of Python modules and tools for
%accessing, manipulating, and plotting scientific data in general and
%climate data sets in
%particular''\footnote{https://mail.python.org/pipermail/numpy-discussion/2001-May/000645.html})
%on Numeric, the newly established Python array object. Paul later
%became the maintainer for Numeric, and helped fund the writing of the
%first manual\footnote{https://conference.scipy.org/scipy2010/slides/travis_oliphant_keynote.pdf}
in 1998 the Space Telescope Science Institute (STScI), which was in charge of
Hubble Space Telescope science operations, decided to replace their
custom scripting language with Python\cite{STScI-slither}.
%Many of the first efforts were driven by graduate
%students and post-doctoral researchers, but soon attracted the
%attention of more senior researchers.
% Paul Dubois, CDAT
% https://mail.python.org/pipermail/numpy-discussion/2001-May/000645.html
% Perry Greenfield, Hubble
% https://conference.scipy.org/scipy2011/slides/greenfield_keynote_astronomy.pdf
%
% Note: plotting was important for them, so they eventually
% funded development on Matplotlib.
% Numeric, even though written by grad students, became critical for
% large scale endeavors
\subsection*{SciPy Begins}
% PhD Mayo Clinic, started faculty position at BYU in 2001
By the late 1990s, discussions appeared on Matrix-SIG
expressing a desire for a complete scientific data analysis environment\cite{Travis-Keynote-2010}.
Travis Oliphant, a PhD student at the Mayo Clinic,
released a number of packages\cite{Travis-some-modules,Travis-enhance}
that built on top of the Numeric array
package, and provided algorithms for signal processing, special
functions, sparse matrices, quadrature, optimization, Fast Fourier
Transforms, and more. One of these packages, Multipack, was a set of
extension modules that wrapped Fortran and C libraries such as
ODEPACK, QUADPACK, and MINPACK, to solve and minimize nonlinear
equations, integrate differential equations, and fit splines (the
latter implemented by Pearu Peterson). Robert Kern, then an
undergraduate student (and currently a SciPy core developer), provided
compilation instructions under Windows.
% 2001 PhD Tallinn Technical University
Around the same time, Pearu Peterson, a PhD student from Estonia,
released F2PY\cite{peterson2009f2py}, a command line tool for binding Python and Fortran
codes, and wrote modules for linear algebra and interpolation.
% Graduated 1999, Duke
Eric Jones, while a graduate student at Duke, wrote a number of
packages to support his dissertation, including a parallel job
scheduler and genetic optimizer.
% Graduated 1998, PhD from Brown, 97–99 postdoc at Harvard Medical
% School; at Harvard ever since
%
% https://mail.python.org/pipermail/scipy-dev/2001-November/000088.html
%
Gary Strangman, a postdoctoral fellow at Harvard Medical School,
published several descriptive and inferential statistical
routines\cite{Strangman-modules}.
\begin{figure}
\begin{verbatim}
SciPy is an open source package that builds on the strengths of Python and
Numeric providing a wide range of fast scientific and numeric functionality.
SciPy's current module set includes the following:
Special Functions (Bessel, hanker, Airy, etc.)
Signal/Image Processing
2D Plotting capabilities
Integration
ODE solvers
Optimization (simplex, BFGS, Netwon-CG, etc.)
Genetic Algorithms
Numeric -> C++ expression compiler
Parallel programming tools
Splines and Interpolation
And other stuff.
\end{verbatim}
\caption{Excerpt from SciPy 0.1 release announcement (typos included).}\label{fig:announce-0.1}
\end{figure}
With a rich programming environment and a numerical array object in
place, the time was ripe for the development of a full scientific
software stack.
In 2001, Eric Jones and Travis Vaught founded Enthought Scientific
Computing Solutions (now Enthought, Inc.) in Austin, Texas. In an
effort to simplify the tool stack, they created the SciPy project,
centered around the SciPy library which would subsume all the
above-mentioned packages.
The new project quickly gained momentum, with a website and code
repository appearing in
February\cite{archived-scipyorg}, and
a mailing list announced in
June\cite{new-scipy-list}. By
August, a first release was announced\cite{first-scipy-rel}, an excerpt of which is shown in
Fig.~\ref{fig:announce-0.1}.
In September, the first documentation was
published\cite{first-scipy-docs}.
The first SciPy
workshop\cite{first-scipy-workshop}
was held in September 2002 at Caltech---a single-track, two-day event with 50
participants, many of them developers of SciPy and surrounding libraries.
%and by the following September the inaugural SciPy workshop was held
%at Caltech\footnote{https://mail.python.org/pipermail/numpy-discussion/2002-June/001511.html}.
% Janko Hauser on toolboxes
% https://mail.python.org/pipermail/scipy-dev/2001-September/000032.html
At this point, scientific Python started attracting more serious attention;
code that started out as side projects by graduate students had grown into
essential infrastructure at national laboratories and research institutes.
For example, starting in 2000, STScI ported their IDL-based analysis pipeline,
IRAF, to Python.
As SciPy, the algorithms library of the ecosystem, began attracting the attention
of large research organizations,
the next generation of graduate students and postdocs were already exploring
other aspects of the computational environment.
In 2000, Prabhu Ramachandran, a PhD student at the Indian Institute of
Technology, started work on a 3D visualization application, based on
Kitware's C++ Visualization Toolkit\cite{schroeder:2006:VTK}, called
Mayavi\cite{mayavi-intro}.
In 2001, Fernando Pérez, a graduate student at the University of
Colorado, Boulder, created the IPython interactive shell, which
eventually blossomed into Project Jupyter\cite{Kluyver:2016aa}.
% IPython hosted on SciPy:
% https://mail.python.org/pipermail/scipy-user/2003-March/001455.html
John Hunter, a postdoc
%John got his PhD in 2001, then a post-doc with John Milton (dept of
%Neurology, UChicago?) a couple of years; then research professor
%under Kurt Hecox, whereafter at Tradelink from 2006
at the University
of Chicago, liked the plotting functionality
available in MATLAB, but had problems accessing their laboratory's
license, which was governed by a hardware dongle. In response, he
wrote a plotting library from scratch, and Matplotlib 0.1 was released
in April 2003\cite{matplotlib-rel}.
% https://mail.python.org/pipermail/scipy-dev/2002-June/001007.html
% https://str.llnl.gov/str/News896.html
%2002—https://web.archive.org/web/20021020215937/http://www.scipy.org:80/site_content/scipy02/scipy02_agenda.htm
%2003—https://web.archive.org/web/20031207151811/http://www.scipy.org:80/site_content/scipy03/
%2004—Jim Hugunin
%https://web.archive.org/web/20040920080225/http://www.scipy.org:80/wikis/scipy04
%Travis Oliphant 2005—https://web.archive.org/web/20051103155042/http://www.scipy.org:80/wikis/scipy05/FrontPage
%Guido—2006 https://scipy.github.io/old-wiki/pages/SciPy2006.html
%Martelli—2008 https://scipy.github.io/old-wiki/pages/Developer_Zone/Conferences/SciPy2004.html
%Norvig—2009 https://scipy.github.io/old-wiki/pages/Developer_Zone/Conferences/SciPy2005.html
As STScI continued to use Python for an increasingly large portion
of the Hubble Space Telescope data analysis pipeline, they encountered
problems with the Python numerical array container.
Numeric, the original array package, was
suitable for small arrays, but not for the large images processed by
STScI. With the Numeric maintainer's blessing, the decision was made
to write NumArray, a library that could handle data on a larger
scale. Unfortunately, NumArray proved inefficient for small arrays,
presenting the community with a rather unfortunate choice. In 2005,
Travis Oliphant combined the best elements of Numeric and NumArray into NumPy,
thereby solving the dilemma and paving the way for Python to become
a very significant player in the Data Science movement. NumPy
1.0 was released in October 2006\cite{numpy-1.0-tag}, with about 30
authors recognized for major contributions in its release notes.
% Future directions for SciPy after meeting at Berkeley
% https://mail.python.org/pipermail/scipy-user/2005-March/004164.html
%In April 2006, Paul
%Dubois\footnote{https://mail.python.org/pipermail/numpy-discussion/2006-April/007487.html} wrote
%to the NumPy mailing list:
%\begin{quote}
%IEEE's magazine, Computing in Science and Engineering (CiSE), has asked me
%to put together a theme issue on the use of Python in Science and
%Engineering. I will write an overview to be accompanied by 3-5 articles [. . .]
%that show a diverse set of applications or
%tools, to give our readers a sense of whether or not Python might be useful
%in their own work.
%\end{quote}
In a May/June 2007 special issue of IEEE Computing in Science and
Engineering, Paul Dubois wrote\cite{dubois2007guest}:
\begin{quote}
LLNL now has many Python-based efforts built from scratch or wrapped around
legacy codes [. . .] hundreds of thousands
of lines of C++, Python, and Fortran 95, all working together just as we hoped,
doing compute-intensive calculations on massively parallel computers.
\end{quote}
While it is now more commonly accepted, Dubois took the time to explain to
the 2007 reader that ``interpreted doesn't mean slow or only interactive.''
He also shared his own interest in Python for ``computational steering''
where ``Python serves as the input language to a scientific application, and the actual
computations are performed both in Python itself and in compiled extensions.''
%https://www.computer.org/csdl/mags/cs/2007/03/c3007.html
Using Python for computational steering was one of the earliest uses of Python
in large scientific processing pipelines.
In addition to Dubois' overview, there were also articles introducing
Python and SciPy\cite{oliphant2007python}, IPython\cite{perez2007ipython},
and Matplotlib\cite{hunter2007matplotlib}.
There was also a diverse set of
research applications including
systems biology\cite{myers2007python},
astronomy\cite{greenfield2007reaching},
robotics\cite{krauss2007python},
nanophotonics\cite{bienstman2007python},
partial differential equations\cite{mardal2007using},
neuroimaging\cite{millman2007analysis},
geographic information systems\cite{shi2007python}, and
education\cite{backer2007computational, myers2007pythona}.
\subsection*{SciPy Matures}
%Seeking articles for special issue on Python and Science and Engineering
%Paul Dubois
%Wed Apr 12 13:00:04 EDT 2006
%https://mail.python.org/pipermail/numpy-discussion/2006-April/007487.html
%
%Python--Batteries Included
%https://www.computer.org/csdl/mags/cs/2007/03/c3007.pdf
%
%https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=4160244
%
%Python: Batteries Included
%Paul F. Dubois
%
%Python for Scientific Computing
%Travis E. Oliphant
%
%IPython: A System for Interactive Scientific Computing
%Fernando Perez ; Brian E. Granger
%
%Computational Physics Education with Python
%Arnd Backer
%
%Python Unleashed on Systems Biology
%Christopher R. Myers ; Ryan N. Gutenkunst ; James P. Sethna
%
%Reaching for the Stars with Python
%Perry Greenfield
%
%A Python Module for Modeling and Control Design of Flexible Robots
%Ryan W. Krauss ; Wayne J. Book
%
%Python in Nanophotonics Research
%Peter Bienstman ; Lieven Vanholme ; Wim Bogaerts ; Pieter Dumon ; Peter Vandersteegen
%
%Using Python to Solve Partial Differential Equations
%Kent-Andre Mardal ; Ola Skavhaug ; Glenn T. Lines ; Gunnar A. Staff ; Asmund Odegard
%
%Analysis of Functional Magnetic Resonance Imaging in Python
%K. Jarrod Millman ; Matthew Brett
%
%Python for Internet GIS Applications
%Xuan Shi
%
%Python for Education: Computational Methods for Nonlinear Systems
%Christopher R. Myers ; James P. Sethna
%
%Matplotlib: A 2D Graphics Environment
%John D. Hunter
%
%-----
%
%https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=5725228&punumber=5992
%
%Python for Scientists and Engineers
%K. Jarrod Millman ; Michael Aivazis
%
%
%Python: An Ecosystem for Scientific Computing
%Fernando Perez ; Brian E. Granger ; John D. Hunter
%
%The NumPy Array: A Structure for Efficient Numerical Computation
%Stefan van der Walt ; S. Chris Colbert ; Gael Varoquaux
%
%Cython: The Best of Both Worlds
%Stefan Behnel ; Robert Bradshaw ; Craig Citro ; Lisandro Dalcin ; Dag Sverre Seljebotn ; Kurt Smith
%
%Mayavi: 3D Visualization of Scientific Data
%Prabhu Ramachandran ; Gael Varoquaux
%
%??? these were in the same issue
%
%web2py for Scientific Applications
%Massimo Di Pierro
%
%From Equations to Code: Automated Scientific Computing
%Andy R. Terrel
%https://www.researchgate.net/publication/260585856_From_Equations_to_Code_Automated_Scientific_Computing
By the middle to late 2000s, SciPy was starting to mature after a long phase of significant
growth and adoption.
The informal workshops grew into international conferences with many
hundreds of attendees. Special issues were organized and published in a
leading scientific journal\cite{dubois2007guest}, and the scope of the SciPy library
narrowed, while the breadth of the ecosystem grew
through a new type of auxiliary package, the scikit\cite{scikits-general}.
The tooling, development, and release processes became more professional.
SciPy was expanded carefully, with the patience affordable in open source
projects and via best practices common in industry\cite{millman2014developing}.
%http://conference.scipy.org/proceedings/scipy2008/paper\_0/full\_text.pdf
%http://conference.scipy.org/proceedings/scipy2008/paper\_1/full\_text.pdf
In the early workshops, recurrent topics reflected the state of development, with
emphasis being placed on the underlying array package, plotting,
parallel processing, acceleration / wrapping, and user interfaces. By
2004, a significant shift occurred towards application of SciPy to
scientific problems. The event also started to draw in more keynote
speakers from outside the community, such as Guido van Rossum (creator
of Python, 2006), Ivan Krstić (One Laptop per Child, 2007), Alex
Martelli (Google, 2008), and Peter Norvig (Google Research, 2009).
The SciPy conference went from being a small gathering of
core developers to a multi-location event with increased funding, a
published proceedings, and scholarships for attending students.
By 2010, the US SciPy conference had multiple tracks, and satellite
conferences were being organized by volunteers elsewhere, such as EuroSciPy
(2008–) and SciPy India (2009–).
Special sessions and minisymposia dedicated to scientific Python began
appearing at many other events.
For example, a three-part minisymposium organized for CSE 2009 was
featured in SIAM News\cite{siamcse09}.
In 2007, Python had a strong enough presence in science and engineering
that the editors of IEEE Computing in Science and Engineering
(CiSE) solicited a special issue\cite{dubois2007guest}, edited by Paul
Dubois. However, Python was still sufficiently niche that the average reader
would need additional information in order to decide whether it would
``be useful in their own work.''
The follow-up CiSE March/April 2011 Python for Scientists and Engineers
special issue\cite{millman2011python} focused more on the core parts
of the scientific Python ecosystem\cite{perez2011python} including
NumPy\cite{vanderwalt2011numpy}, Cython\cite{behnel2011cython},
and Mayavi\cite{ramachandran2011mayavi}. Python became so pervasive that
journals began publishing domain-specific special issues.
For example, in 2015 Frontiers in Neuroinformatics published a collection of 25 articles---covering
topics including modeling and simulation, data collection, electrophysiology, visualization,
as well as stimulus generation and presentation---called Python in
Neuroscience\cite{python-FIN}.
In 2012, Perry Greenfield, John Hunter, Jarrod Millman, Travis Oliphant,
and Fernando Pérez founded NumFOCUS\cite{numfocus},
a 501(c)(3) public charity with a mission
``to promote sustainable high-level programming languages, open code development,
and reproducible scientific research.''
While NumFOCUS is language agnostic, many of the early sponsored projects
came from the scientific Python stack.
Today it has 50 affiliated and sponsored projects including NumPy, SciPy, IPython, and
Matplotlib.
Among its other projects, NumFOCUS organizes a global network of
community driven educational programs, meetups, and conferences
called PyData.
\subsection*{SciPy Today}
At the time of writing, the SciPy library consists of nearly
600,000 lines of code organized in 16 subpackages.
Over 85,000 GitHub repositories and almost 5,000 packages depend
on SciPy. Some of the major
feature highlights from the three years preceding
SciPy 1.0 are discussed in Section~\ref{sec:technical_improvements},
and milestones in its history are highlighted in Figure~\ref{fig:timeline}.
% At the time of writing, SciPy produces a new release
% approximately every six months, and some of the major
% feature highlights from the three years preceding
% SciPy 1.0 are discussed in a technical improvements
% section below. Some longer-term milestone events are
% highlighted in Figure~\ref{fig:timeline}.
\begin{figure}[H]
\centering
\includegraphics[width=0.95\textwidth]{static/scipy_timeline}
\caption{Major milestones from SciPy's initial release in 2001 to
the release of SciPy 1.0 in 2017. Logos reprinted with permission.}
\label{fig:timeline}
\end{figure}
% Release management
% Jarrod
%https://mail.python.org/pipermail/scipy-dev/2007-August/007613.html
%Ralf
%https://mail.python.org/pipermail/numpy-discussion/2010-March/049057.html
%https://mail.python.org/pipermail/scipy-dev/2010-March/014034.html
%https://mail.python.org/pipermail/scipy-dev/2010-April/014091.html
\section*{Architecture and implementation choices}
\subsection*{Project scope}
SciPy provides fundamental algorithms for scientific computing. The
breadth of its scope was derived from the Guide to Available Mathematical
Software classification system (GAMS\cite{boisvert1991guide}). In areas
that move relatively slowly, e.g., linear algebra, SciPy aims to provide
complete coverage. In other areas it aims to provide fundamental building
blocks while interacting well with other packages specialized in that area.
For example, SciPy provides what one expects to find in a
statistics textbook (probability distributions, hypothesis tests, frequency
statistics, correlation functions, and more), while
Statsmodels\cite{statsmodels2010} provides
more advanced statistical estimators and inference methods;
scikit-learn\cite{pedregosa2011scikit} covers machine learning; and
PyMC3\cite{10.7717/peerj-cs.55}, emcee\cite{2013PASP-emcee} and
PyStan\cite{pystan-ref} cover Bayesian statistics and probabilistic modeling.
scikit-image\cite{vanderwalt2014scikit} provides image processing
capabilities beyond \texttt{scipy.ndimage}, SymPy\cite{meurer2017sympy}
provides a Python interface for symbolic computation, and
NetworkX\cite{hagberg2008networkx} is a Python library for probing
graphs and networks in a more specialized way than SciPy components
like \texttt{sparse.csgraph} or \texttt{spatial}.
We use the following criteria to determine whether or not to include new
functionality in SciPy:
\begin{itemize}
\item The algorithm is of relevance to multiple fields of science.
\item The algorithm is demonstrably important. For example, it is classic
enough to be included in textbooks, or it is based on a peer-reviewed article
which has a significant number of citations.
\end{itemize}
In terms of software systems and architecture, SciPy's scope matches NumPy's:
algorithms for in-memory computing on single machines, with support for a wide
range of data types and process architectures. Distributed computing and support
for graphics processing units (GPUs) are explicitly out of scope.
\subsection*{Package organization}
\input{subpackages}
\subsection*{Language choices}
Python, Cython, Fortran, C and C++ are the programming languages used to
implement scientific algorithms in the SciPy library. An analysis of our code
base using the \texttt{linguist} library\cite{linguistref} provides a
detailed breakdown as \% composition by programming language in
SciPy (Table~\ref{tab:linguist}).
\begin{table}
\centering
\begin{tabular}{p{3cm}S[table-format = <0.1]}
\toprule
Language & {Percent}\\
\midrule
Python & 49.5 \\
Fortran & 25.6 \\
C & 19.5 \\
Cython & 3.0 \\
C++ & 2.3 \\[\defaultaddspace]
{\TeX, Matlab, Shell, and Makefile} & <0.5 \\
\bottomrule
\end{tabular}
\caption{Language composition of SciPy codebase: lines of code in each
programming language as determined by the \texttt{linguist} package.
The last row denotes tools used in supporting roles in tests,
building, and documentation.}
\label{tab:linguist}
\end{table}
Fortran, despite its age, is still a high-performance scientific programming language with
continued contemporary usage\cite{Koelbel:1993:HPF:562354}. Thus, we wrap the following excellent, field-tested Fortran
libraries in order to provide Python convenience while benefiting from their performance:
FFTPACK\cite{SWARZTRAUBER198445, SWARZTRAUBER198251}, % for performing Fourier transforms,
ODEPACK\cite{citeulike:2644528}, % for solving ordinary differential equation initial value problems,
QUADPACK\cite{1983qspa.book.....P}, % for numerical integration of one-dimensional functions,
FITPACK\cite{Dierckx:1993:CSF:151103}, % for curve-fitting and interpolation,
ODRPACK\cite{ODRPACK_Boggs}, % for orthogonal distance regression,
MINPACK\cite{osti_6997568}, % for minimization of linear and nonlinear equations,
ARPACK\cite{leh:sor:yan96}, % for solving large scale eigenvalue problems,
ALGORITHM 644\cite{Amos:1986:APP:7921.214331}, and % for Bessel Functions, and
CDFLIB\cite{CDFLIB_site}. % for evaluation of cumulative density functions.
Rounding out the top three languages in SciPy is C, which is also extremely
well-established over several decades\cite{Kernighan:1988:CPL:576122} of
scientific computing. The C libraries that we wrap in SciPy include
trlib\cite{doi:10.1080/10556788.2018.1449842}, % (iterative solving of the trust region problem),
SuperLU\cite{li05,superlu_ug99}, % (solution of large, sparse, nonsymmetric systems of linear equations),
Qhull\cite{Barber:1996:QAC:235815.235821}, and % (computational geometry)
Cephes\cite{cephes_netlib}. % (mathematics algorithms).
Cython has been described as a creole language that mixes the best parts of Python and
lower-level C / C++ paradigms\cite{behnel2011cython}. We often use Cython
as a glue between well-established low-level scientific computing libraries written
in C / C++ and the Python interface offered by SciPy. We also use Cython to enable performance
enhancements in Python code, especially for cases where heavily used inner
loops benefit from compiled code with static typing.
For implementing new functionality, Python is still the language of choice. If Python
performance is an issue, then we prefer the use of Cython followed by C, C++, or Fortran (in that
order). The main motivation for this is maintainability: Cython has the
highest abstraction level and most Python developers will understand it. C is
also widely known, and easier for the current core development team to manage
than C++ and especially Fortran.
The distribution of secondary programming languages in SciPy is a compromise between
a powerful, performance-enhancing
language that interacts well with Python (i.e., Cython) and the
usage of languages (and their libraries) that have proven reliable and
performant over many decades. The position that SciPy occupies near the
foundation of the scientific Python ecosystem is such that adoption of new
languages or major dependencies is generally unlikely---our choices are strongly
driven by long-term stability. GPU acceleration, new transpiling libraries, and
the latest JIT compilation approaches (e.g.,
Numba\cite{Lam:2015:NLP:2833157.2833162}) are very powerful, but currently fall
outside the remit of the main SciPy library. That said, we have recently
increased our efforts to support compatibility with some of these options, and
having our full test suite pass with the PyPy JIT
compiler\cite{Bolz:2009:TMP:1565824.1565827} is now a requirement in our
development workflow.
\subsection*{API and ABI evolution}
The application programming interface (API) for SciPy consists of approximately
1500 functions and classes. Our policy for evolving the API over time is that
new functionality can be added, while removing or changing existing
functionality can only be done if the benefits of doing so exceed the (often
significant) costs to users, \textit{and} only after giving clear deprecation
warnings to those users for at least one year. In general, we encourage
changes that improve clarity in the API of the library but strongly discourage
breaking backwards compatibility given our position near the base of the
scientific Python computing stack.
In addition to the Python API, SciPy has C and Cython interfaces in a number
of submodules. Therefore, we have to also consider the application binary
interface (ABI). This ABI has been stable for a long time, and we aim to
evolve it only in a backwards compatible way.
\section*{Key technical improvements}
\label{sec:technical_improvements}
Here we describe key technical improvements made in the last three years.
\subsection*{Data structures}
\subsubsection*{Sparse matrices}
\texttt{scipy.sparse} offers seven sparse matrix data structures,
also known as sparse formats. The most important ones are the row-
and column-compressed formats (CSR and CSC, respectively).
These offer fast major-axis indexing and fast matrix-vector multiplication,
and are used heavily throughout SciPy and dependent packages.
Over the last three years, our sparse matrix handling internals have been
rewritten and performance has been improved. Iterating over and slicing of CSC
and CSR matrices is now faster by up to 35\% (Figure~\ref{fig:sparse-iter}),
and the coordinate (COO) / diagonal (DIA) to CSR / CSC matrix format
conversions are now faster (Figure~\ref{fig:sparse-conv}). Importantly,
SuperLU\cite{superlu_ug99} was updated to version 5.2.1, enhancing the
low-level implementations leveraged by a subset of our \texttt{sparse}
offerings.
From a new features standpoint, \texttt{scipy.sparse} matrices and linear
operators now support the Python matrix multiplication (@) operator when
available. We've added \texttt{scipy.sparse.linalg.norm} and
\texttt{scipy.sparse.random} for computing sparse matrix norms and drawing
random variates from arbitrary distributions, respectively. Also, we've made a
concerted effort to bring the \texttt{scipy.sparse} API into line with the
equivalent NumPy API where possible.
\subsubsection*{\texttt{cKDTree}}
\input{ckdtree}
\subsection*{Unified bindings to compiled code}
\subsubsection*{LowLevelCallable}
As of SciPy version 0.19, it is possible for users to wrap low-level functions
in a \texttt{scipy.LowLevelCallable} object that reduces the overhead of
calling compiled C functions, such as those generated using \texttt{numba}
or Cython, directly from Python.
Supported low-level functions include \texttt{PyCapsule}
objects, \texttt{ctypes} function pointers, and \texttt{cffi} function pointers.
Furthermore, it is even possible to generate a low-level callback function
automatically from a Cython module using \texttt{scipy.LowLevelCallable.from\_cython}.
\subsection*{Cython bindings for BLAS, LAPACK, and special}
SciPy has provided special functions and leveraged BLAS and
LAPACK\cite{LAPACK} routines for many years. SciPy now additionally
includes Cython\cite{behnel2011cython} wrappers for
many BLAS and LAPACK routines (added in 2015) and the special functions
provided in the \texttt{scipy.{\allowbreak}special} submodule (added in 2016).
These Cython wrappers are available in the modules
\texttt{scipy.{\allowbreak}linalg.{\allowbreak}cython\_blas},
\texttt{scipy.{\allowbreak}linalg.{\allowbreak}cython\_lapack}, and
\texttt{scipy.{\allowbreak}special.{\allowbreak}cython\_special}, respectively.
When writing algorithms in Cython, it is typically more efficient to call
directly into the libraries SciPy wraps rather than indirectly, using SciPy's
Python APIs. These low-level interfaces for Cython can also be used outside of
the SciPy codebase to gain access to the functions in the wrapped libraries
while avoiding the overhead of Python function calls. This can give
performance gains of one or two orders of magnitude for many use cases.
Developers can also use the low-level Cython interfaces without linking against
the wrapped libraries\cite{blas-lapack-wrappers-scipy-2015}. This lets other
extensions avoid the complexity of finding and using the correct libraries.
Avoiding this complexity is especially important when wrapping libraries
written in Fortran. Not only can these low-level wrappers be used without a
Fortran compiler, they can also be used without having to handle all the
different Fortran compiler ABIs and name mangling schemes.
Most of these low-level Cython wrappers are generated automatically to help
with both correctness and ease of maintenance. The wrappers for BLAS and
LAPACK are primarily generated using type information that is parsed from the
BLAS and LAPACK source files using F2PY\cite{peterson2009f2py}, though a small
number of routines use hand-written type signatures instead. The input and
output types of each routine are saved in a data file that is read at build
time and used to generate the corresponding Cython wrapper files. The wrappers
in \texttt{scipy.{\allowbreak}special.{\allowbreak}cython\_special} are also
generated from a data file containing type information for the wrapped
routines.
Since SciPy can be built with LAPACK 3.4.0 or later, Cython wrappers are only
provided for the routines that maintain a consistent interface across all
supported LAPACK versions. The standard BLAS interface provided by the various
existing BLAS libraries is not currently changing, so changes are not generally
needed in the wrappers provided by SciPy. Changes to the Cython wrappers for
the functions in \texttt{scipy.{\allowbreak}special} follow corresponding
changes to the interface of that submodule.
\subsection*{Numerical optimization}
\input{scipy-optimize}
\subsection*{Statistical distributions}
The \texttt{scipy.stats} module contains more than 100 probability
distributions: 96 continuous and 13 discrete univariate distributions
and 10 multivariate distributions. The implementation relies on a
consistent framework that provides methods to sample random variates,
to evaluate the cumulative distribution function (cdf) and the probability
density function (pdf) and to fit parameters for every distribution.
Generally, the methods rely on specific implementations for each
distribution such as a closed form expression of the cdf or a sampling
algorithm, if available. Otherwise, default methods are used
based on generic code, e.g., numerical integration of the pdf to
obtain the cdf.
Key recent distributions added to \texttt{scipy.stats} include the
histogram-based distribution in \texttt{scipy.stats.rv\_histogram}
and the multinomial distribution in \texttt{scipy.stats.multinomial}
(used, for example, in natural language processing, see
\cite{Griffiths5228}).
\subsection*{Polynomial interpolators}
\input{poly}
\subsection*{Test and benchmark suite}
\subsubsection*{Test suite}
The SciPy test suite is orchestrated by a continuous integration matrix that
includes POSIX and Windows (32/64-bit) platforms managed by Travis CI and
AppVeyor, respectively. Our tests cover Python versions 2.7, 3.4, 3.5, and 3.6, and
include code linting with \texttt{pyflakes} and \texttt{pycodestyle}. There are more than 13,000
unit tests in the test suite, which is written for usage with the \texttt{pytest}
framework. At the SciPy 1.0 release point, the test suite achieved 87\,\% and 45\,\% line coverage for Python and compiled
code, respectively (Figure~\ref{fig:coverage}). Documentation for the code is automatically built and hosted by
the CircleCI service to facilitate evaluation of documentation changes /
integrity. Our full test suite also passes with PyPy3, a just-in-time compiled
version of the Python language.
\begin{figure}[H]
\centering
\begin{tikzpicture}[]
\pgfplotstableread[column/ver/.style=string type]{
ver totpyline covpyline uncovpyline totcompline covcompline uncovcompline
v0.12.1 33844 25721 8123 273634 142290 131344
v0.13.3 36638 28944 7694 338926 172852 166074
v0.14.1 37301 30587 6714 277643 147151 130492
v0.15.1 38339 30288 8051 304898 158547 146351
v0.16.1 40094 32075 8019 322398 167647 154751
v0.17.1 42566 34478 8088 340903 170452 170451
v0.18.1 44711 36216 8495 353417 173174 180243
v0.19.1 43823 36373 7450 436900 200974 235926
v1.0.0 106878 92984 13894 462574 208158 254416
}\covtable
\begin{axis}[
width=\textwidth,
height=7cm,
enlargelimits=false,
ymin=0,
ymax=6e5,
stack plots=y,%
area style,
xtick=data,
ytick={2e5,4e5,6e5},
xticklabels from table={\covtable}{ver},
scaled y ticks=false,
y tick label style={/pgf/number format/.cd, fixed, 1000 sep={}},
legend entries={Compiled (covered),Compiled (uncovered),Python (covered),Python (uncovered)},
legend style={draw=none,fill=none, cells={align=left},/tikz/every even column/.append style={column sep=3mm}},
legend columns=-1,
legend image code/.code={\fill[#1] (0cm,-0.1cm) rectangle (0.2cm,0.1cm);},
]
\addplot+[fill=blue!50] table[x expr=\coordindex, y=covcompline] {\covtable} \closedcycle;
\addplot+[fill=blue!10] table[x expr=\coordindex, y=uncovcompline] {\covtable} \closedcycle;
\addplot+[fill=red!70] table[x expr=\coordindex, y=covpyline] {\covtable} \closedcycle;
\addplot+[fill=red!30] table[x expr=\coordindex, y=uncovpyline] {\covtable} \closedcycle;
\end{axis}
\end{tikzpicture}
\caption{
Python (red) and compiled (blue) code volume in SciPy over time.
Deep-shaded area represents lines of code covered by unit tests;
light-shaded area represents lines not covered. With the exception
of the removal of $\approx 61{,}000$ lines of compiled code for SciPy
v0.14, the volume of both compiled and Python